├── .pycodestylerc
├── docs
    ├── images
    │   ├── dingtalk.png
    │   ├── architecture.png
    │   ├── performance.png
    │   └── wide-and-deep.png
    ├── requirements.txt
    ├── index.md
    ├── architecture.md
    ├── tutorial
    │   └── ranking
    │   │   ├── __init__.py
    │   │   ├── taobao
    │   │       └── data
    │   │       │   └── stats.py
    │   │   ├── optimization.py
    │   │   └── criteo
    │   │       └── data
    │   │           └── prep.py
    ├── introduction.md
    └── conf.py
├── .clang-format
├── .isort.cfg
├── .github
    ├── helm
    │   ├── Chart.yaml
    │   ├── values.yaml
    │   ├── .helmignore
    │   ├── upload
    │   └── templates
    │   │   └── tfjob.yaml
    ├── ISSUE_TEMPLATE
    │   ├── 30-other.md
    │   ├── 00-enhancement.md
    │   ├── 20-documentation.md
    │   └── 10-bug.md
    └── workflows
    │   ├── cpu.yaml
    │   ├── gpu.yaml
    │   ├── cpu-nightly.yaml
    │   └── gpu-nightly.yaml
├── .gitignore
├── .readthedocs.yaml
├── pyproject.toml
├── ADOPTERS.md
├── NOTICE
├── hybridbackend
    ├── common
    │   ├── __init__.py
    │   ├── profiler.h
    │   ├── macros.h
    │   ├── logging.h
    │   ├── Makefile
    │   ├── logging.cc
    │   ├── profiler.cc
    │   ├── env.h
    │   ├── atomic.cu.h
    │   ├── murmur3.cu.h
    │   └── arrow.h
    ├── tensorflow
    │   ├── benchmarks
    │   │   ├── __init__.py
    │   │   └── data_benchmark_csv.py
    │   ├── distribute
    │   │   ├── nccl
    │   │   │   ├── __init__.py
    │   │   │   └── nccl_get_id.cc
    │   │   ├── partition
    │   │   │   ├── __init__.py
    │   │   │   ├── modulo_functors.h
    │   │   │   └── dual_modulo_functors.h
    │   │   ├── ops.py
    │   │   ├── __init__.py
    │   │   └── tests
    │   │   │   └── broadcast_test.py
    │   ├── data
    │   │   ├── prefetch
    │   │   │   └── __init__.py
    │   │   ├── rebatch
    │   │   │   ├── __init__.py
    │   │   │   └── dataset.py
    │   │   ├── tabular
    │   │   │   ├── __init__.py
    │   │   │   ├── orc.h
    │   │   │   ├── parquet.h
    │   │   │   ├── table.cc
    │   │   │   └── dataset.py
    │   │   ├── sync
    │   │   │   ├── __init__.py
    │   │   │   ├── dataset.py
    │   │   │   └── utils.py
    │   │   ├── deduplicate
    │   │   │   ├── __init__.py
    │   │   │   └── dataset.py
    │   │   ├── __init__.py
    │   │   └── tests
    │   │   │   ├── rebatch_dataset_seq_test.py
    │   │   │   ├── parquet_dataset_reshape_test.py
    │   │   │   ├── parquet_dataset_ragged_nested_test.py
    │   │   │   └── sync_replicas_dataset_test.py
    │   ├── framework
    │   │   ├── __init__.py
    │   │   ├── version.py
    │   │   ├── device.py
    │   │   └── config.py
    │   ├── common
    │   │   ├── __init__.py
    │   │   ├── pywrap.py
    │   │   ├── eigen.h
    │   │   ├── dataset.h
    │   │   ├── slice_sum.h
    │   │   ├── fusion_helper.cu.h
    │   │   ├── cast.h
    │   │   ├── stream.h
    │   │   └── fusion_helper.cu.cc
    │   ├── keras
    │   │   ├── __init__.py
    │   │   └── layers
    │   │   │   └── __init__.py
    │   ├── metrics
    │   │   ├── __init__.py
    │   │   └── gauc.py
    │   ├── pipeline
    │   │   └── __init__.py
    │   ├── embedding
    │   │   ├── __init__.py
    │   │   ├── lookup_functors.h
    │   │   ├── deeprecev.py
    │   │   └── tests
    │   │   │   └── deeprecev_test.py
    │   ├── estimator
    │   │   └── __init__.py
    │   ├── wraps.py
    │   ├── graph
    │   │   ├── common
    │   │   │   ├── linearization.h
    │   │   │   ├── pruning.h
    │   │   │   ├── replacing.h
    │   │   │   ├── relocation.h
    │   │   │   ├── helper.h
    │   │   │   ├── rewriting.h
    │   │   │   ├── packing.h
    │   │   │   └── linearization.cc
    │   │   ├── optimize_lookup.cc
    │   │   ├── op_optimization.h
    │   │   ├── optimize_memory.cc
    │   │   ├── optimize_floormod_shuffle.cc
    │   │   └── optimize_partition_by_modulo.cc
    │   ├── ops
    │   │   ├── __init__.py
    │   │   └── transfer
    │   │   │   └── functors.h
    │   ├── __init__.py
    │   ├── training
    │   │   ├── __init__.py
    │   │   ├── server.py
    │   │   └── variables.py
    │   └── Makefile
    ├── torch
    │   └── __init__.py
    └── __init__.py
├── CONTRIBUTING.md
├── CITATION.cff
└── ROADMAP.md


/.pycodestylerc:
--------------------------------------------------------------------------------
1 | [pycodestyle]
2 | ignore = E501,E722,W503
3 | 


--------------------------------------------------------------------------------
/docs/images/dingtalk.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepRec-AI/HybridBackend/HEAD/docs/images/dingtalk.png


--------------------------------------------------------------------------------
/docs/images/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepRec-AI/HybridBackend/HEAD/docs/images/architecture.png


--------------------------------------------------------------------------------
/docs/images/performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepRec-AI/HybridBackend/HEAD/docs/images/performance.png


--------------------------------------------------------------------------------
/docs/images/wide-and-deep.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepRec-AI/HybridBackend/HEAD/docs/images/wide-and-deep.png


--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx
2 | sphinx_rtd_theme
3 | myst-parser
4 | docutils==0.16
5 | hybridbackend-tf115-cpu
6 | 


--------------------------------------------------------------------------------
/.clang-format:
--------------------------------------------------------------------------------
1 | # clang-format -i --style=google <file>
2 | 
3 | BasedOnStyle: Google
4 | DerivePointerAlignment: false
5 | 


--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
 1 | # Contents
 2 | 
 3 | ```{toctree}
 4 | :maxdepth: 2
 5 | 
 6 | introduction
 7 | architecture
 8 | data
 9 | distributed
10 | ```
11 | 


--------------------------------------------------------------------------------
/.isort.cfg:
--------------------------------------------------------------------------------
1 | [settings]
2 | profile=google
3 | indent=2
4 | src_paths=hybridbackend
5 | extra_standard_library=six
6 | known_third_party=tensorflow,torch
7 | known_first_party=hybridbackend
8 | 


--------------------------------------------------------------------------------
/.github/helm/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | name: hybridbackend-developer
3 | description: A Helm chart for HybridBackend developers
4 | type: application
5 | version: 0.1.0
6 | appVersion: "0.1.0"
7 | 


--------------------------------------------------------------------------------
/.github/helm/values.yaml:
--------------------------------------------------------------------------------
1 | image: registry.cn-shanghai.aliyuncs.com/pai-dlc/hybridbackend:developer-tf1.15-py3.8-cu121-ubuntu20.04
2 | port: 20000
3 | gpus: 2
4 | caps: ["SYS_ADMIN", "SYS_PTRACE"]
5 | build: ""
6 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/30-other.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Other Issues
3 | about: Use this template for any other non-support related issues
4 | 
5 | ---
6 | This template is for miscellaneous issues not covered by the other issue categories.
7 | 
8 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.pyc
 2 | *.so
 3 | *.so.*
 4 | *.o
 5 | *.d
 6 | *.log
 7 | build/lib.*
 8 | build/bdist.*
 9 | build/temp.*
10 | build/reports/
11 | build/doc/
12 | build/wheel/
13 | build/release/
14 | *.egg-info/
15 | __pycache__/
16 | .pylint.d/
17 | cache/
18 | outputs/
19 | *_test.py.xml
20 | *_lib.c
21 | .config.mk
22 | 


--------------------------------------------------------------------------------
/docs/architecture.md:
--------------------------------------------------------------------------------
1 | # Architecture
2 | 
3 | HybridBackend follows a shared-nothing architecture: a HybridBackend job
4 | consists of single-GPU workers. Workers share nothing and coordinate through
5 | collective communication. Each worker reads the environment variable
6 | `TF_CONFIG` for cluster information.
7 | 
8 | ![architecture](images/architecture.png)
9 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/00-enhancement.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: New feature or request
 3 | about: Use this template for raising a feature request.
 4 | 
 5 | ---
 6 | # User Story
 7 | 
 8 | As a _, I want to _, so that _.
 9 | 
10 | # Detailed requirements
11 | 
12 | - It should be _
13 | 
14 | # API Compatibility
15 | 
16 | # Willing to contribute
17 | 
18 | Yes
19 | 
20 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/20-documentation.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Need help with installation or documentation
 3 | about: Use this template for an installation/documentation issue.
 4 | 
 5 | ---
 6 | 
 7 | # Summary
 8 | 
 9 | # Installation environment
10 | - GPU model and memory:
11 | - OS Platform:
12 | - Docker version:
13 | - GCC/CUDA/cuDNN version:
14 | - Python/conda version:
15 | - TensorFlow/PyTorch version:
16 | 
17 | # Willing to contribute
18 | 
19 | Yes
20 | 
21 | 


--------------------------------------------------------------------------------
/.github/helm/.helmignore:
--------------------------------------------------------------------------------
 1 | # Patterns to ignore when building packages.
 2 | # This supports shell glob matching, relative path matching, and
 3 | # negation (prefixed with !). Only one pattern per line.
 4 | .DS_Store
 5 | # Common VCS dirs
 6 | .git/
 7 | .gitignore
 8 | .bzr/
 9 | .bzrignore
10 | .hg/
11 | .hgignore
12 | .svn/
13 | # Common backup files
14 | *.swp
15 | *.bak
16 | *.tmp
17 | *.orig
18 | *~
19 | # Various IDEs
20 | .project
21 | .idea/
22 | *.tmproj
23 | .vscode/
24 | 


--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
 1 | # .readthedocs.yaml
 2 | # Read the Docs configuration file
 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 4 | 
 5 | # Required
 6 | version: 2
 7 | 
 8 | # Build documentation in the docs/ directory with Sphinx
 9 | sphinx:
10 |   configuration: docs/conf.py
11 | 
12 | # Optionally declare the Python requirements required to build your docs
13 | python:
14 |   version: "3.6"
15 |   install:
16 |     - requirements: docs/requirements.txt
17 | 
18 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.ruff]
 2 | select = ["F", "E", "W"]
 3 | ignore = ["E501", "E722"]
 4 | ignore-init-module-imports = true
 5 | line-length = 80
 6 | 
 7 | [tool.ruff.per-file-ignores]
 8 | "__init__.py" = ["F401", "F403"]
 9 | 
10 | [tool.ruff.isort]
11 | force-single-line = true
12 | force-sort-within-sections = true
13 | single-line-exclusions = ["typing"]
14 | order-by-type = false
15 | known-third-party = ["tensorflow", "torch"]
16 | known-first-party = ["hybridbackend"]
17 | extra-standard-library = ["six"]
18 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/10-bug.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Something is not working
 3 | about: Use this template for reporting a bug or a performance issue.
 4 | 
 5 | ---
 6 | # Current behavior
 7 | 
 8 | # Expected behavior
 9 | 
10 | # System information
11 | - GPU model and memory:
12 | - OS Platform:
13 | - Docker version:
14 | - GCC/CUDA/cuDNN version:
15 | - Python/conda version:
16 | - TensorFlow/PyTorch version:
17 | 
18 | # Code to reproduce
19 | 
20 | ```python
21 | ```
22 | 
23 | # Willing to contribute
24 | 
25 | Yes
26 | 
27 | 


--------------------------------------------------------------------------------
/ADOPTERS.md:
--------------------------------------------------------------------------------
 1 | # List of Adopters
 2 | 
 3 | The following are the adopters of HybridBackend. If you are using HybridBackend
 4 | in your organization, please feel free to add your organization's name to the
 5 | following list via a pull request.
 6 | 
 7 | | Organization | Phase |
 8 | | ------------ | ----- |
 9 | |              |       |
10 | 
11 | ## Appendix
12 | 
13 | ### Phases of Adoption
14 | 
15 | | Phase Name | Description |
16 | | ---------- | ----------- |
17 | | **Evaluation** | Interested in HybridBackend |
18 | | **Testing** | Consider HybridBackend as one of the candidates |
19 | | **Staging** | Decide to use HybridBackend |
20 | | **Production** | Already put HybridBackend into production |
21 | 


--------------------------------------------------------------------------------
/NOTICE:
--------------------------------------------------------------------------------
 1 | HybridBackend
 2 | -------------------------------------------------------------------------------
 3 | 
 4 | Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 5 | 
 6 | This product includes software developed by Alibaba Group Holding Limited.
 7 | 
 8 | This product includes software developed by
 9 | The Apache Software Foundation (http://www.apache.org/).
10 | 
11 | This product includes software from Apache Arrow, which includes the following
12 | in its NOTICE file:
13 | 
14 |   Apache Arrow
15 |   Copyright 2016-2019 The Apache Software Foundation
16 |   
17 |   This product includes software developed at
18 |   The Apache Software Foundation (http://www.apache.org/).
19 | 
20 | 


--------------------------------------------------------------------------------
/hybridbackend/common/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''Common utilities.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 


--------------------------------------------------------------------------------
/docs/tutorial/ranking/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''Sample ranking examples.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/benchmarks/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''Benchmarks for hybridbackend.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/distribute/nccl/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''NCCL related classes and functions.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/data/prefetch/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''Prefetching related classes and functions.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/data/rebatch/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''Rebatching related classes and functions.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/data/tabular/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''Tabular data related classes and functions.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/framework/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''Framework related functions in hybridbackend.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/data/sync/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''SyncReplicasDataset related classes and functions.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 


--------------------------------------------------------------------------------
/hybridbackend/torch/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''HybridBackend for PyTorch.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 
23 | # TODO Add pytorch support.
24 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/distribute/partition/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''Partitioning related classes and functions.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/data/deduplicate/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''Deduplication related classes and functions.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/common/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''Common utilities.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 
23 | from hybridbackend.tensorflow.common.pywrap import oplib
24 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/keras/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''HybridBackend Keras related modules.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 
23 | from . import layers
24 | from .model import Model
25 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # How to contribute
 2 | 
 3 | We appreciate all contributions to improve HybridBackend. You can create an
 4 | [issue](https://github.com/alibaba/HybridBackend/issues) or send a
 5 | [pull request](https://github.com/alibaba/HybridBackend/pulls).
 6 | 
 7 | **Working on your first Pull Request?** You can learn how from this *free*
 8 | series [How to Contribute to an Open Source Project on GitHub](https://kcd.im/pull-request)
 9 | 
10 | ## Code style
11 | 
12 | Before committing, please use the tools below to format and check code style:
13 | 
14 | ```bash
15 | build/run build/format
16 | build/run build/lint
17 | ```
18 | 
19 | Commit messages should follow the format below:
20 | 
21 | ```text
22 | [Module] Do something great.
23 | ```
24 | 
25 | `Module` could be `CI`, `IO` or other well-known abbreviations.
26 | 
27 | ## Building and testing
28 | 
29 | Test your commit using the default developer Docker container:
30 | 
31 | ```bash
32 | build/run make -j8
33 | build/run make test
34 | ```
35 | 
36 | Also, CI builds are triggered whenever a commit is pushed.
37 | 


--------------------------------------------------------------------------------
/hybridbackend/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''HybridBackend entry file.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 
23 | __version__ = '1.0.0'  # Version string of this package.
24 | __author__ = 'Alibaba Group Holding Limited'  # Author of this package.
25 | __copyright__ = '2021 Alibaba Group Holding Limited'  # Copyright notice.
26 | 


--------------------------------------------------------------------------------
/.github/helm/upload:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # =============================================================================
16 | 
17 | set -eo pipefail
18 | 
19 | # Name of the pod that receives the workspace. Abort with a usage hint when it
20 | # is missing instead of calling kubectl with an empty pod name.
21 | PODNAME="${1:?usage: upload PODNAME}"
22 | 
23 | mkdir -p /tmp
24 | # Pack the current directory into an archive under /tmp.
25 | tar -czf /tmp/archive.tar.gz .
26 | # Wait until the pod reports the "ready" condition before copying into it.
27 | kubectl wait --for=condition=ready pod "${PODNAME}"
28 | kubectl cp /tmp/archive.tar.gz "${PODNAME}:/workspace/archive.tar.gz"
29 | # Unpack the archive inside the pod.
30 | kubectl exec -it "${PODNAME}" -- \
31 |   tar -xzf /workspace/archive.tar.gz -C /workspace
32 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/metrics/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''Metrics for evaluating models in hybridbackend.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 
23 | from hybridbackend.tensorflow.metrics.accuracy import accuracy
24 | from hybridbackend.tensorflow.metrics.auc import auc
25 | from hybridbackend.tensorflow.metrics.gauc import gauc
26 | 


--------------------------------------------------------------------------------
/docs/introduction.md:
--------------------------------------------------------------------------------
 1 | # Introduction
 2 | 
 3 | ## Recommendation models
 4 | 
 5 | Model-based recommendation systems play key roles in the internet industry,
 6 | from social networks to e-commerce platforms. Recommendation models have been
 7 | getting deeper in recent years, which makes training on GPUs a good choice.
 8 | 
 9 | However, industrial-scale recommendation models are not only deeper, but also
10 | much wider. Training wide-and-deep recommendation models on GPUs with real-world
11 | datasets still suffers from low utilization and high cost.
12 | 
13 | ![wide-and-deep](images/wide-and-deep.png)
14 | 
15 | ## HybridBackend
16 | 
17 | HybridBackend is a high-performance framework for training wide-and-deep
18 | recommendation models on heterogeneous clusters.
19 | 
20 | HybridBackend provides the following features:
21 | 
22 | - Memory-efficient loading of categorical data
23 | 
24 | - GPU-efficient orchestration of embedding layers
25 | 
26 | - Communication-efficient training and evaluation at scale
27 | 
28 | - Easy to use with existing AI workflows
29 | 
30 | HybridBackend speeds up training of wide-and-deep recommendation models
31 | dramatically:
32 | 
33 | ![performance](images/performance.png)
34 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/pipeline/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''Pipeline related classes and functions.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 
23 | from hybridbackend.tensorflow.framework.context import Context as _ctx
24 | from hybridbackend.tensorflow.pipeline.pipeline_lib import compute_pipeline
25 | 
26 | _ = (  # Return value of the registration is bound to `_` and not used here.
27 |   _ctx.get().options
28 |   .register(  # Default is False; presumably overridable via the env name — confirm.
29 |     'pipeline_dense_ga_enabled', False, env='HB_PIPELINE_DENSE_GA_ENABLED'))
30 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/common/pywrap.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''Python wrapper of tensorflow ops.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 
23 | from tensorflow.python.framework.load_library import load_op_library as _load
24 | from tensorflow.python.platform import resource_loader as _loader
25 | 
26 | try:  # Load the op library; its path is given relative to this module.
27 |   oplib = _load(
28 |     _loader.get_path_to_datafile('../libhybridbackend_tensorflow.so'))
29 | except ImportError:  # NOTE(review): a missing .so may raise NotFoundError instead — confirm.
30 |   oplib = None  # `oplib` is None when loading failed with ImportError.
31 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/embedding/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''Support for various embedding backends.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 
23 | # pylint: disable=ungrouped-imports
24 | try:  # presumably imported for their side effects (aliases unused here) — confirm
25 |   from .deeprecev import \
26 |     ShardedEmbeddingWeightsRewritingForDeepRecEV as _patch_ev
27 |   from .variables import \
28 |     ShardedEmbeddingWeightsRewritingForVariables as _patch_var
29 | except:  # pylint: disable=bare-except
30 |   pass  # Any failure raised by the imports above is silently ignored.
31 | # pylint: enable=ungrouped-imports
32 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/estimator/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''Support for Estimators in hybridbackend.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 
23 | try:  # Prefer the standalone tensorflow_estimator package.
24 |   from tensorflow_estimator.python.estimator.model_fn import EstimatorSpec
25 | except ImportError:  # Fall back to the estimator module inside tensorflow.
26 |   from tensorflow.python.estimator.model_fn import EstimatorSpec
27 | 
28 | from hybridbackend.tensorflow.estimator.estimator import Estimator
29 | from hybridbackend.tensorflow.estimator.estimator import RunConfig
30 | from hybridbackend.tensorflow.estimator.estimator import train_and_evaluate
31 | 


--------------------------------------------------------------------------------
/hybridbackend/common/profiler.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #ifndef HYBRIDBACKEND_COMMON_PROFILER_H_
17 | #define HYBRIDBACKEND_COMMON_PROFILER_H_
18 | 
19 | #include <string>
20 | 
21 | #if HYBRIDBACKEND_NVTX
22 | #include <nvToolsExt.h>
23 | #endif
24 | 
25 | namespace hybridbackend {
26 | 
27 | class ProfilerRange {  // Profiling range identified by a domain and a message.
28 |  public:
29 |   static ProfilerRange* forSynch(const std::string& message);  // NOTE(review): pointer ownership is not visible here.
30 |   static ProfilerRange* forLookup(const std::string& message);  // NOTE(review): pointer ownership is not visible here.
31 | 
32 |   ProfilerRange(const std::string& domain, const std::string& message);
33 |   ~ProfilerRange();
34 | 
35 |  private:
36 | #if HYBRIDBACKEND_NVTX  // NVTX handles are members only in NVTX-enabled builds.
37 |   nvtxDomainHandle_t domain_;
38 |   nvtxRangeId_t range_;
39 | #endif
40 | };
41 | 
42 | }  // namespace hybridbackend
43 | 
44 | #endif  // HYBRIDBACKEND_COMMON_PROFILER_H_
45 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/distribute/ops.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''Collective constants.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 
23 | 
24 | class CollectiveOps(object):  # pylint: disable=useless-object-inheritance
25 |   r'''Integer constants identifying collective operations.
26 |   '''
27 |   SUM = 0
28 |   PROD = 1
29 |   MAX = 2
30 |   MIN = 3
31 |   AVG = 4
32 | 
33 | 
34 | class Topology(object):  # pylint: disable=useless-object-inheritance
35 |   r'''Integer constants identifying communication topologies.
36 |   '''
37 |   ALL = 0  # Communication across all GPUs
38 |   INTRA_NODE = 1  # Communication across all GPUs in current node
39 |   INTER_NODE = 2  # Communication across all GPUs with same rank in every node
40 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/wraps.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''Decorator to wrap customized objects.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 
23 | from tensorflow.python.estimator import estimator
24 | from tensorflow.python.training import optimizer
25 | 
26 | from hybridbackend.tensorflow.estimator.estimator import wraps_estimator
27 | from hybridbackend.tensorflow.training.optimizer import wraps_optimizer
28 | 
29 | 
30 | def wraps(cls):
31 |   r'''Wraps object to be used in HybridBackend.
32 |   '''
33 |   if issubclass(cls, optimizer.Optimizer):
34 |     return wraps_optimizer(cls)
35 |   if issubclass(cls, estimator.Estimator):
36 |     return wraps_estimator(cls)
37 |   return cls
38 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/common/eigen.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #ifndef HYBRIDBACKEND_TENSORFLOW_COMMON_EIGEN_H_
17 | #define HYBRIDBACKEND_TENSORFLOW_COMMON_EIGEN_H_
18 | 
19 | #include <tensorflow/core/framework/tensor.h>
20 | #include <tensorflow/core/lib/core/errors.h>
21 | #include <tensorflow/core/public/version.h>
22 | #include <third_party/eigen3/unsupported/Eigen/CXX11/Tensor>
23 | 
24 | // NOTE: EIGEN_MAX_ALIGN_BYTES is 64 in TF 1.x. See:
25 | // https://github.com/tensorflow/tensorflow/blob/v1.15.5/third_party/eigen.BUILD#L67
26 | #if EIGEN_MAX_ALIGN_BYTES == 0  // No alignment value to check against.
27 | #define CHECK_EIGEN_ALIGN(...) (true)
28 | #else  // True iff the address is a multiple of EIGEN_MAX_ALIGN_BYTES.
29 | #define CHECK_EIGEN_ALIGN(...) \
30 |   (0 == reinterpret_cast<intptr_t>(__VA_ARGS__) % EIGEN_MAX_ALIGN_BYTES)
31 | #endif
32 | 
33 | #endif  // HYBRIDBACKEND_TENSORFLOW_COMMON_EIGEN_H_
34 | 


--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
 1 | cff-version: 1.1.0
 2 | title: HybridBackend
 3 | doi: 10.5281/zenodo.6464188
 4 | type: software
 5 | url: "https://github.com/alibaba/HybridBackend"
 6 | authors:
 7 |   - given-names: Man
 8 |     family-names: Yuan
 9 |   - given-names: Langshi
10 |     family-names: Chen
11 | message: >-
12 |   Please cite HybridBackend in your publications if it helps
13 | preferred-citation:
14 |   title: "PICASSO: Unleashing the Potential of GPU-centric Training for Wide-and-deep Recommender Systems"
15 |   type: conference-paper
16 |   collection-title: "2022 IEEE 38th International Conference on Data Engineering (ICDE)"
17 |   year: 2022
18 |   authors:
19 |   - family-names: "Zhang"
20 |     given-names: "Yuanxing"
21 |   - family-names: "Chen"
22 |     given-names: "Langshi"
23 |   - family-names: "Yang"
24 |     given-names: "Siran"
25 |   - family-names: "Yuan"
26 |     given-names: "Man"
27 |   - family-names: "Yi"
28 |     given-names: "Huimin"
29 |   - family-names: "Zhang"
30 |     given-names: "Jie"
31 |   - family-names: "Wang"
32 |     given-names: "Jiamang"
33 |   - family-names: "Dong"
34 |     given-names: "Jianbo"
35 |   - family-names: "Xu"
36 |     given-names: "Yunlong"
37 |   - family-names: "Song"
38 |     given-names: "Yue"
39 |   - family-names: "Li"
40 |     given-names: "Yong"
41 |   - family-names: "Zhang"
42 |     given-names: "Di"
43 |   - family-names: "Lin"
44 |     given-names: "Wei"
45 |   - family-names: "Qu"
46 |     given-names: "Lin"
47 |   - family-names: "Zheng"
48 |     given-names: "Bo"
49 | 


--------------------------------------------------------------------------------
/hybridbackend/common/macros.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #ifndef HYBRIDBACKEND_COMMON_MACROS_H_
17 | #define HYBRIDBACKEND_COMMON_MACROS_H_
18 | 
19 | #ifdef __has_builtin  // HB_HAS_BUILTIN falls back to 0 when __has_builtin is unavailable.
20 | #define HB_HAS_BUILTIN(x) __has_builtin(x)
21 | #else
22 | #define HB_HAS_BUILTIN(x) 0
23 | #endif
24 | 
25 | #if (!defined(__NVCC__)) && \
26 |     (HB_HAS_BUILTIN(__builtin_expect) || (defined(__GNUC__) && __GNUC__ >= 3))
27 | #define HB_PREDICT_FALSE(x) (__builtin_expect(x, 0))  // NOTE(review): no `!!(x)` here, unlike HB_PREDICT_TRUE.
28 | #define HB_PREDICT_TRUE(x) (__builtin_expect(!!(x), 1))
29 | #else  // Fallback (including NVCC): the hints expand to the plain expression.
30 | #define HB_PREDICT_FALSE(x) (x)
31 | #define HB_PREDICT_TRUE(x) (x)
32 | #endif
33 | 
34 | #define HB_DISALLOW_COPY_AND_ASSIGN(TypeName) \
35 |   TypeName(const TypeName&) = delete;         \
36 |   void operator=(const TypeName&) = delete  // No trailing ';': the user of the macro supplies it.
37 | 
38 | #endif  // HYBRIDBACKEND_COMMON_MACROS_H_
39 | 


--------------------------------------------------------------------------------
/hybridbackend/common/logging.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #ifndef HYBRIDBACKEND_COMMON_LOGGING_H_
17 | #define HYBRIDBACKEND_COMMON_LOGGING_H_
18 | 
19 | #include <sstream>
20 | 
21 | #include "hybridbackend/common/macros.h"
22 | 
23 | #define HB_LOG_IS_ON(lvl) ((lvl) <= ::hybridbackend::MinLogLevel())
24 | 
25 | #define HB_LOG(lvl)                        \
26 |   if (HB_PREDICT_FALSE(HB_LOG_IS_ON(lvl))) \
27 |   ::hybridbackend::LogMessage(__FILE__, __LINE__)
28 | 
29 | namespace hybridbackend {
30 | 
31 | int& MinLogLevel();  // Returns a reference to the level that HB_LOG_IS_ON compares against.
32 | 
33 | class LogMessage : public std::basic_ostringstream<char> {  // Stream object created by each HB_LOG statement.
34 |  public:
35 |   LogMessage(const char* fname, int line);  // HB_LOG passes __FILE__ and __LINE__ here.
36 |   ~LogMessage();  // NOTE(review): presumably emits the buffered text — confirm in logging.cc.
37 | 
38 |  private:
39 |   const char* fname_;
40 |   int line_;
41 | };
42 | 
43 | }  // namespace hybridbackend
44 | 
45 | #endif  // HYBRIDBACKEND_COMMON_LOGGING_H_
46 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/distribute/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''Communicators and distribution options.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 
23 | from hybridbackend.tensorflow.distribute.collective import *
24 | from hybridbackend.tensorflow.distribute.ops import CollectiveOps as ops
25 | from hybridbackend.tensorflow.distribute.partition.ops import *
26 | from hybridbackend.tensorflow.framework.context import Context as _ctx
27 | 
28 | _ = (  # Registers communication options; the chained result is not used here.
29 |   _ctx.get().options
30 |   .register('comm_default', 'NCCL', env='HB_COMM_DEFAULT')  # Only option registered with an env name.
31 |   .register('comm_pool_name', 'default')
32 |   .register('comm_pool_capacity', 1)
33 |   .register('comm_wire_dtype', None)
34 |   .register('comm_gradient_wire_dtype', None))
35 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/graph/common/linearization.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #ifndef HYBRIDBACKEND_TENSORFLOW_GRAPH_COMMON_LINEARIZATION_H_
17 | #define HYBRIDBACKEND_TENSORFLOW_GRAPH_COMMON_LINEARIZATION_H_
18 | 
19 | #if HYBRIDBACKEND_TENSORFLOW
20 | 
21 | #include <map>
22 | #include <string>
23 | #include <vector>
24 | 
25 | #include <tensorflow/core/graph/graph.h>
26 | 
27 | namespace tensorflow {
28 | namespace hybridbackend {
29 | 
30 | class LinearizeOutputs {  // NOTE(review): looks like a graph rewrite keyed by op type and output index — confirm.
31 |  public:
32 |   LinearizeOutputs(const string& op_type, const int32& op_output);
33 |   Status In(Graph* graph);  // NOTE(review): presumably applies the rewrite to `graph` — confirm in the .cc file.
34 | 
35 |  private:
36 |   string op_type_;
37 |   int32 op_output_;
38 | 
39 |   TF_DISALLOW_COPY_AND_ASSIGN(LinearizeOutputs);  // Instances are neither copyable nor assignable.
40 | };
41 | 
42 | }  // namespace hybridbackend
43 | }  // namespace tensorflow
44 | 
45 | #endif  // HYBRIDBACKEND_TENSORFLOW
46 | #endif  // HYBRIDBACKEND_TENSORFLOW_GRAPH_COMMON_LINEARIZATION_H_
47 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/graph/common/pruning.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #ifndef HYBRIDBACKEND_TENSORFLOW_GRAPH_COMMON_PRUNING_H_
17 | #define HYBRIDBACKEND_TENSORFLOW_GRAPH_COMMON_PRUNING_H_
18 | 
19 | #if HYBRIDBACKEND_TENSORFLOW
20 | 
21 | #include <string>
22 | #include <vector>
23 | 
24 | #include <tensorflow/core/graph/graph.h>
25 | 
26 | namespace tensorflow {
27 | namespace hybridbackend {
28 | 
29 | Status InputPruneN(Graph* graph, const string& target_op_type,  // NOTE(review): semantics are defined in the .cc file, not visible here.
30 |                    const string& target_n_attr, const int& target_n_input,
31 |                    const std::vector<string>& op_types,
32 |                    const std::vector<int>& src_outputs,
33 |                    const std::vector<int>& dst_inputs);
34 | 
35 | }  // namespace hybridbackend
36 | }  // namespace tensorflow
37 | 
38 | #endif  // HYBRIDBACKEND_TENSORFLOW
39 | #endif  // HYBRIDBACKEND_TENSORFLOW_GRAPH_COMMON_PRUNING_H_
40 | 


--------------------------------------------------------------------------------
/ROADMAP.md:
--------------------------------------------------------------------------------
 1 | # HybridBackend Roadmap
 2 | 
 3 | ## HybridBackend v0.6 (2022-05)
 4 | 
 5 | Objective: "Communication-efficient training and evaluation at scale"
 6 | 
 7 | - Data-Parallel Training and Evaluation
 8 |   - Bucketized Gradients Aggregation using AllReduce
 9 |   - Global Metric Operations
10 |   - Out-Of-Range Coordination
11 | 
12 | - Hybrid-Parallel Embedding Learning
13 |   - Bucketized Embedding Exchanging using AllToAllv
14 |   - Fusion and Quantization of AllToAllv
15 |   - Fusion of Partitioning and Stitching
16 | 
17 | Objective: "Easy to use with existing AI workflows"
18 | 
19 | - Usability
20 |   - Support of MonitoredSession and Estimator
21 |   - Declarative API for Model Definition
22 | 
23 | - Compatibility
24 |   - Support of NVIDIA TensorFlow and DeepRec
25 | 
26 | - Interoperability
27 |   - Inference Pipeline Needs No Change
28 |   - Support of SavedModel
29 |   - Support of Variable, XDL HashTable and PAI Embedding Variable
30 | 
31 | ## HybridBackend v0.5 (2021-11)
32 | 
33 | Objective: "Memory-efficient loading of categorical data"
34 | 
35 | - Parquet Dataset
36 |   - Reading batch of tensors from numeric fields in zero-copy way
37 |   - Reading batch of sparse tensors from numeric list fields in zero-copy way
38 |   - Support of string fields
39 |   - Support of local filesystem, HDFS, S3 and OSS
40 | 
41 | - Data Pipeline Functions
42 |   - Resizing batch of tensors and ragged tensors
43 |   - Converting ragged tensors to sparse tensors
44 | 
45 | Objective: "Easy to use with existing AI workflows"
46 | 
47 | - Compatibility
48 |   - Support of TensorFlow 1.15 and TensorFlow 1.14
49 |   - GitHub actions for uploading wheels to PyPI
50 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/framework/version.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''Version related utilities.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 
23 | import distutils.version
24 | 
# Cached tensorflow version, resolved lazily by `tf_version()`.
_TENSORFLOW_VERSION = None


def tf_version():
  r'''Get tensorflow version.

  The version is resolved on the first call and cached in the module-level
  `_TENSORFLOW_VERSION`, so tensorflow is imported at most once from here.

  Returns:
    A `distutils.version.LooseVersion` built from `tf.__version__`.

  Raises:
    ImportError: If tensorflow cannot be imported.
  '''
  global _TENSORFLOW_VERSION
  if _TENSORFLOW_VERSION:
    return _TENSORFLOW_VERSION
  try:
    import tensorflow as tf  # pylint: disable=import-outside-toplevel
  except ImportError as imp:
    raise ImportError('Tensorflow version is not supported') from imp
  # `tf.__version__` is available in both TF 1.x and TF 2.x, whereas
  # `tf.VERSION` only exists in TF 1.x and raises AttributeError afterwards.
  _TENSORFLOW_VERSION = distutils.version.LooseVersion(tf.__version__)
  return _TENSORFLOW_VERSION
41 | 
42 | 
def tf_version_check(ver):
  r'''Whether tensorflow version is at least ver.

  Args:
    ver: Version string to compare the installed tensorflow against.

  Returns:
    True if the installed tensorflow version is greater than or equal to
    `ver`, False otherwise.
  '''
  installed = tf_version()
  required = distutils.version.LooseVersion(ver)
  return installed >= required
47 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/graph/optimize_lookup.cc:
--------------------------------------------------------------------------------
 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #if HYBRIDBACKEND_TENSORFLOW
17 | 
18 | #include <vector>
19 | 
20 | #include "hybridbackend/common/env.h"
21 | #include "hybridbackend/tensorflow/graph/common/linearization.h"
22 | #include "hybridbackend/tensorflow/graph/common/packing.h"
23 | #include "hybridbackend/tensorflow/graph/common/rewriting.h"
24 | #include "hybridbackend/tensorflow/graph/op_optimization.h"
25 | 
26 | namespace tensorflow {
27 | namespace hybridbackend {
28 | 
29 | class OptimizeLookupReplacingPass : public OpOptimizationPass {
30 |  public:
31 |   Status Optimize(Graph* graph, const SessionOptions* options,
32 |                   const bool disabled) override {
33 |     TF_RETURN_IF_ERROR(Rewrite("Lookup", "HbLookup").In(graph));
34 | 
35 |     return Status::OK();
36 |   }
37 | };
38 | 
39 | REGISTER_REPLACING_OPTIMIZATION(OptimizeLookupReplacingPass);
40 | 
41 | }  // namespace hybridbackend
42 | }  // namespace tensorflow
43 | 
44 | #endif  // HYBRIDBACKEND_TENSORFLOW
45 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/common/dataset.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | // Do not report compilation warnings of tensorflow dataset implementation.
17 | #pragma GCC system_header
18 | 
19 | #ifndef HYBRIDBACKEND_TENSORFLOW_COMMON_DATASET_H_
20 | #define HYBRIDBACKEND_TENSORFLOW_COMMON_DATASET_H_
21 | 
22 | #if HYBRIDBACKEND_TENSORFLOW
23 | 
24 | #include <tensorflow/core/framework/dataset.h>
25 | #include <tensorflow/core/lib/io/buffered_inputstream.h>
26 | #include <tensorflow/core/lib/io/inputbuffer.h>
27 | #include <tensorflow/core/lib/io/random_inputstream.h>
28 | #include <tensorflow/core/public/version.h>
29 | 
// PARSE_SCALAR / PARSE_VECTOR name the dataset argument parsers. From
// TensorFlow 1.15 onwards the fully qualified ::tensorflow::data:: names are
// used; for earlier versions the unqualified names are used.
#if (TF_MAJOR_VERSION * 1000L + TF_MINOR_VERSION) >= 1015L
#define PARSE_SCALAR ::tensorflow::data::ParseScalarArgument
#define PARSE_VECTOR ::tensorflow::data::ParseVectorArgument
#else
#define PARSE_SCALAR ParseScalarArgument
#define PARSE_VECTOR ParseVectorArgument
#endif
37 | 
38 | #endif  // HYBRIDBACKEND_TENSORFLOW
39 | 
40 | #endif  // HYBRIDBACKEND_TENSORFLOW_COMMON_DATASET_H_
41 | 


--------------------------------------------------------------------------------
/.github/helm/templates/tfjob.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: kubeflow.org/v1
 2 | kind: TFJob
 3 | metadata:
 4 |   name: {{ .Release.Name }}
 5 | spec:
 6 |   tfReplicaSpecs:
 7 |     Chief:
 8 |       replicas: 1
 9 |       restartPolicy: Never
10 |       template:
11 |         spec:
12 |           hostIPC: true
13 |           hostPID: true
14 |           containers:
15 |           - name: tensorflow
16 |             image: {{ .Values.image }}
17 |             imagePullPolicy: Always
18 |             resources:
19 |               requests:
20 |                 nvidia.com/gpu: {{ .Values.gpus }}
21 |               limits:
22 |                 nvidia.com/gpu: {{ .Values.gpus }}
23 |             securityContext:
24 |               capabilities:
25 |                 add:
26 |                   {{- range .Values.caps }}
27 |                   - {{ . }}
28 |                   {{- end }}
29 |             args:
30 |             - bash
31 |             - -c
32 |             - tail -f /dev/null
33 |             workingDir: /workspace
34 |             env:
35 |             - name: PYTHONPATH
36 |               value: "$PYTHONPATH:/workspace"
37 |             - name: MALLOC_CONF
38 |               value: "background_thread:true,metadata_thp:auto"
39 |             - name: ARROW_NUM_THREADS
40 |               value: "8"
41 |             - name: S3_ADDRESSING_STYLE
42 |               value: "virtual"
43 |             - name: HYBRIDBACKEND_WHEEL_BUILD
44 |               value: "{{ .Values.build }}"
45 |             ports:
46 |             - containerPort: {{ .Values.port }}
47 |               name: tfjob-port
48 |             {{- range untilStep (int (add .Values.port 1)) (int (add .Values.port (add .Values.gpus 1))) 1}}
49 |             - containerPort: {{ . }}
50 |               name: {{ $portName := (printf "gpu-port-%d" .) }}{{ $portName }}
51 |             {{- end }}
52 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/framework/device.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''Utilities for device placement.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 
23 | from tensorflow.python.framework import device as pydev
24 | 
25 | from hybridbackend.tensorflow.framework.context import Context
26 | 
27 | 
def device_function(op):
  r'''Device function for HybridBackend.

  Builds the device of the current task (`/job:<task_type>/task:<task_id>`
  plus `/gpu:0` when the context has a GPU, `/cpu:0` otherwise) and merges
  the device already requested by `op` into it.

  Args:
    op: Operator to place.

  Returns:
    device_string: device placement.
  '''
  ctx = Context.get()
  requested = pydev.DeviceSpec.from_string(op.device or '')
  local_device = '/gpu:0' if ctx.has_gpu else '/cpu:0'
  worker_device = pydev.DeviceSpec.from_string(
    f'/job:{ctx.task_type}/task:{ctx.task_id}{local_device}')
  # Device specs without `merge_from` only offer `make_merged_spec`, which
  # returns a new spec instead of updating the receiver.
  if not hasattr(worker_device, 'merge_from'):
    return worker_device.make_merged_spec(requested).to_string()
  worker_device.merge_from(requested)
  return worker_device.to_string()
50 | 


--------------------------------------------------------------------------------
/hybridbackend/common/Makefile:
--------------------------------------------------------------------------------
# Directory searched for the sources of the common library.
COMMON_SRC := hybridbackend/common/

# Include flags as reported by the pybind11 module of $(PYTHON).
COMMON_CFLAGS := \
	$(shell $(PYTHON) -m pybind11 --includes)

# Every *.cc file below $(COMMON_SRC) whose name does not match *.cu*, given
# as a path relative to the current directory.
COMMON_CC_SOURCES := $(shell \
	find $(COMMON_SRC) -type f \
	\( -name "*.cc" ! -name "*.cu*" \) \
	-exec realpath {} --relative-to . \;)

# One object per source, compiled by a static pattern rule. -MMD -MP -MF
# writes the header dependencies to <source>.d next to the source file.
# NOTE(review): CXX, CFLAGS, CXX_CFLAGS and THIRDPARTY_DEPS are not defined in
# this fragment; presumably the including makefile provides them.
COMMON_OBJS := $(COMMON_CC_SOURCES:.cc=.o)
$(COMMON_OBJS): %.o:%.cc $(THIRDPARTY_DEPS)
	mkdir -p $(dir $@)
	$(CXX) $(CFLAGS) $(CXX_CFLAGS) $(COMMON_CFLAGS) \
	-MMD -MP -MF $<.d -o $@ -c $< -fpic
16 | 
# CUDA sources (*.cu.cc) are only built when HYBRIDBACKEND_WITH_CUDA is ON;
# otherwise COMMON_ALL_OBJS holds the plain C++ objects only.
ifeq ($(HYBRIDBACKEND_WITH_CUDA),ON)
COMMON_CU_SOURCES := $(shell \
	find $(COMMON_SRC) -type f \
	\( -name '*.cu.cc' \) \
	-exec realpath {} --relative-to . \;)

COMMON_CU_OBJS := $(COMMON_CU_SOURCES:.cc=.o)
# NOTE(review): unlike the C++ object rule, the CUDA rules below do not list
# $(THIRDPARTY_DEPS) as a prerequisite - confirm this is intended.
ifeq ($(OS),Darwin)
# Darwin: compile with nvcc only, no dependency file is generated.
$(COMMON_CU_OBJS): %.o:%.cc
	mkdir -p $(dir $@)
	$(NVCC) $(NVCC_CFLAGS) \
		-o $@ -c $< $(CFLAGS) $(COMMON_CFLAGS) -x cu \
		-Xcompiler -fPIC
else
# Elsewhere: first write <source>.d from `nvcc -M`, dropping lines that
# contain /usr/, replacing the bare object name by the full target path and
# removing "./" prefixes; then compile the object.
$(COMMON_CU_OBJS): %.o:%.cc
	mkdir -p $(dir $@)
	@$(NVCC) -M $< $(CFLAGS) $(COMMON_CFLAGS) -x cu \
	 | grep -v '/usr/' \
	 | sed 's|$(notdir $@)|$@|g' \
	 | sed 's|\./||g' \
	 > $<.d
	$(NVCC) $(NVCC_CFLAGS) \
		-o $@ -c $< $(CFLAGS) $(COMMON_CFLAGS) -x cu \
		-Xcompiler -fPIC
endif
COMMON_ALL_OBJS := $(COMMON_OBJS) $(COMMON_CU_OBJS)
else
COMMON_ALL_OBJS := $(COMMON_OBJS)
endif
46 | 
# Link all common objects into $(COMMON_LIB). The Darwin variant additionally
# sets the install name to @rpath/lib$(LIBNAME).so and links CoreFoundation.
# NOTE(review): COMMON_LIB, LIBNAME, LDFLAGS and COMMON_LDFLAGS are not defined
# in this fragment; presumably the including makefile provides them.
ifeq ($(OS),Darwin)
$(COMMON_LIB): $(COMMON_ALL_OBJS)
	mkdir -p $(dir $@)
	$(CXX) $(CFLAGS) $(COMMON_CFLAGS) -std=c++11 \
	-install_name @rpath/lib$(LIBNAME).so \
	-framework CoreFoundation \
	-o $@ $^ $(LDFLAGS) $(COMMON_LDFLAGS)
else
$(COMMON_LIB): $(COMMON_ALL_OBJS)
	mkdir -p $(dir $@)
	$(CXX) $(CFLAGS) $(COMMON_CFLAGS) -std=c++11 \
	-o $@ $^ $(LDFLAGS) $(COMMON_LDFLAGS)
endif
60 | 


--------------------------------------------------------------------------------
/hybridbackend/common/logging.cc:
--------------------------------------------------------------------------------
 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #include "hybridbackend/common/logging.h"
17 | 
18 | #include <cstdio>
19 | #include <ctime>
20 | 
21 | #include <sys/syscall.h>
22 | #include <sys/time.h>
23 | #include <sys/types.h>
24 | #include <unistd.h>
25 | 
26 | #include "hybridbackend/common/env.h"
27 | 
28 | namespace hybridbackend {
29 | 
30 | int& MinLogLevel() {
31 |   static int* min_log_level = new int(EnvVarGetInt("HB_MIN_LOG_LEVEL", 0));
32 |   return *min_log_level;
33 | }
34 | 
// Records the source location (file name and line) that is printed together
// with the message when this object is destroyed.
LogMessage::LogMessage(const char* fname, int line)
    : fname_(fname), line_(line) {}
37 | 
38 | LogMessage::~LogMessage() {
39 |   static size_t pid = static_cast<size_t>(getpid());
40 |   struct timeval tv;
41 |   struct timezone tz;
42 |   gettimeofday(&tv, &tz);
43 |   struct tm rslt;
44 |   struct tm* p = gmtime_r(&tv.tv_sec, &rslt);
45 |   fprintf(stderr, "[%04d-%02d-%02d %02d:%02d:%02d.%ld] [%ld#%ld] [%s:%d] %s\n",
46 |           1900 + p->tm_year, 1 + p->tm_mon, p->tm_mday, p->tm_hour, p->tm_min,
47 |           p->tm_sec, tv.tv_usec, pid, syscall(SYS_gettid), fname_, line_,
48 |           str().c_str());
49 | }
50 | 
51 | }  // namespace hybridbackend
52 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/keras/layers/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''HybridBackend Keras Layers.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 
try:
  from tensorflow.python.feature_column.dense_features import DenseFeatures
  _DENSE_FEATURES_IMPORT_ERROR = None
except ImportError as _import_error:
  # Keep this module importable without `DenseFeatures`, but remember why the
  # import failed so that `dense_features` can report it.
  _DENSE_FEATURES_IMPORT_ERROR = _import_error


def dense_features(features, feature_columns):
  r'''Function produces dense tensors based on given `feature_columns`.

  Args:
    features: A mapping from key to tensors. `FeatureColumn`s look up via
      these keys. For example `numeric_column('price')` will look at 'price'
      key in this dict. Values can be a `SparseTensor` or a `Tensor` depends
      on corresponding `FeatureColumn`.
    feature_columns: List of feature columns.

  Returns:
    List of `Tensor`s which represents input layer of a model, which matches
    order of columns in `feature_columns`.

  Raises:
    ImportError: If `DenseFeatures` could not be imported from tensorflow.
  '''
  if _DENSE_FEATURES_IMPORT_ERROR is not None:
    raise ImportError(
      'dense_features requires '
      'tensorflow.python.feature_column.dense_features.DenseFeatures'
    ) from _DENSE_FEATURES_IMPORT_ERROR
  # The layer fills this dict with one output tensor per feature column.
  cols_to_output_tensors = {}
  DenseFeatures(feature_columns)(
    features, cols_to_output_tensors=cols_to_output_tensors)
  return [cols_to_output_tensors[f] for f in feature_columns]
47 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/common/slice_sum.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | #ifndef HYBRIDBACKEND_TENSORFLOW_COMMON_SLICE_SUM_H_
16 | #define HYBRIDBACKEND_TENSORFLOW_COMMON_SLICE_SUM_H_
17 | 
18 | #if HYBRIDBACKEND_TENSORFLOW
19 | 
20 | #include <vector>
21 | 
22 | #include <tensorflow/core/framework/tensor.h>
23 | #include <tensorflow/core/public/version.h>
24 | 
25 | namespace tensorflow {
26 | 
27 | class OpKernelContext;
28 | 
29 | namespace hybridbackend {
30 | namespace functor {
31 | 
// Functor declaration only; no definition is visible in this header.
// NOTE(review): the device argument is fixed to Eigen::GpuDevice although the
// struct is templated on Device - confirm whether that is intended.
// NOTE(review): presumably sums column slices of a num_rows x num_cols input;
// verify the exact semantics against the implementation.
template <typename Device, typename T>
struct SliceSum {
  void operator()(const int32 num_rows, const int32 num_cols, const int32 col,
                  const T* input, T* output_total, T* output,
                  const Eigen::GpuDevice& d);
};
38 | 
// Functor declaration only; no definition is visible in this header.
// Takes `num_inputs` together with an array of output pointers (`outputs`).
// NOTE(review): presumably the batched form of a single-input slice sum;
// the layout of `inputs` and `output_totals` is not visible here - confirm
// against the implementation.
template <typename Device, typename T>
struct SliceSumN {
  void operator()(const int32 num_rows, const int32 num_cols, const int32 col,
                  const int32 num_inputs, const T* inputs, T* output_totals,
                  T** outputs, const Eigen::GpuDevice& d);
};
45 | 
46 | }  // namespace functor
47 | }  // namespace hybridbackend
48 | }  // namespace tensorflow
49 | 
50 | #endif  // HYBRIDBACKEND_TENSORFLOW
51 | #endif  // HYBRIDBACKEND_TENSORFLOW_COMMON_SLICE_SUM_H_
52 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/graph/op_optimization.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #ifndef HYBRIDBACKEND_TENSORFLOW_GRAPH_OP_OPTIMIZATION_H_
17 | #define HYBRIDBACKEND_TENSORFLOW_GRAPH_OP_OPTIMIZATION_H_
18 | 
19 | #if HYBRIDBACKEND_TENSORFLOW
20 | 
21 | #include <string>
22 | #include <vector>
23 | 
24 | #include <tensorflow/core/common_runtime/optimization_registry.h>
25 | 
26 | namespace tensorflow {
27 | namespace hybridbackend {
28 | 
29 | class OpOptimizationPass : public GraphOptimizationPass {
30 |  public:
31 |   virtual Status Run(const GraphOptimizationPassOptions& options);
32 | 
33 |  protected:
34 |   virtual Status Optimize(Graph* graph, const SessionOptions* options,
35 |                           const bool disabled) = 0;
36 | };
37 | 
// Registers PASS in the PRE_PLACEMENT grouping with phase 1.
#define REGISTER_REPLACING_OPTIMIZATION(PASS) \
  REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 1, PASS)

// Registers PASS in the POST_REWRITE_FOR_EXEC grouping with phase 100.
#define REGISTER_REDUCTION_OPTIMIZATION(PASS)                                 \
  REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 100, \
                        PASS)
44 | 
45 | }  // namespace hybridbackend
46 | }  // namespace tensorflow
47 | 
48 | #endif  // HYBRIDBACKEND_TENSORFLOW
49 | #endif  // HYBRIDBACKEND_TENSORFLOW_GRAPH_OP_OPTIMIZATION_H_
50 | 


--------------------------------------------------------------------------------
/hybridbackend/common/profiler.cc:
--------------------------------------------------------------------------------
 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #include "hybridbackend/common/profiler.h"
17 | 
18 | namespace hybridbackend {
19 | 
20 | ProfilerRange* ProfilerRange::forSynch(const std::string& message) {
21 | #if HYBRIDBACKEND_NVTX
22 |   return new ProfilerRange("Synch Ops", message.c_str());
23 | #else
24 |   return nullptr;
25 | #endif
26 | }
27 | 
28 | ProfilerRange* ProfilerRange::forLookup(const std::string& message) {
29 | #if HYBRIDBACKEND_NVTX
30 |   return new ProfilerRange("Lookup Ops", message.c_str());
31 | #else
32 |   return nullptr;
33 | #endif
34 | }
35 | 
// Starts an NVTX range named `message` in the NVTX domain `domain`.
// Does nothing when built without HYBRIDBACKEND_NVTX.
ProfilerRange::ProfilerRange(const std::string& domain,
                             const std::string& message) {
#if HYBRIDBACKEND_NVTX
  // NOTE(review): a domain handle is created for every range and no matching
  // destroy call is visible in this file - confirm this is intended.
  domain_ = nvtxDomainCreateA(domain.c_str());
  // Zero the attributes, then fill in only the header fields and the ASCII
  // message.
  nvtxEventAttributes_t nvtx_attr = {0};
  nvtx_attr.version = NVTX_VERSION;
  nvtx_attr.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
  nvtx_attr.messageType = NVTX_MESSAGE_TYPE_ASCII;
  nvtx_attr.message.ascii = message.c_str();
  range_ = nvtxDomainRangeStartEx(domain_, &nvtx_attr);
#endif
}
48 | 
// Ends the NVTX range started by the constructor.
// Does nothing when built without HYBRIDBACKEND_NVTX.
ProfilerRange::~ProfilerRange() {
#if HYBRIDBACKEND_NVTX
  nvtxDomainRangeEnd(domain_, range_);
#endif
}
54 | 
55 | }  // namespace hybridbackend
56 | 


--------------------------------------------------------------------------------
/hybridbackend/common/env.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #ifndef HYBRIDBACKEND_COMMON_ENV_H_
17 | #define HYBRIDBACKEND_COMMON_ENV_H_
18 | 
19 | #include <string>
20 | 
21 | namespace hybridbackend {
22 | 
// Declarations of environment helpers. Only signatures are visible in this
// header; the notes below are inferred from names and parameters and should
// be confirmed against the implementation.

// Setters for an environment variable, with string and int overloads.
void EnvVarSet(const std::string& env_var, const std::string& env_val);

void EnvVarSet(const std::string& env_var, const int env_val);

// NOTE(review): presumably leaves an already existing variable untouched.
void EnvVarSetIfNotExists(const std::string& env_var,
                          const std::string& env_val);

void EnvVarSetIfNotExists(const std::string& env_var, const int env_val);

// Getters taking a `default_val`.
// NOTE(review): presumably the default is returned when the variable is unset;
// behavior on unparsable values is not visible here.
std::string EnvVarGet(const std::string& env_var,
                      const std::string& default_val);

int EnvVarGetInt(const std::string& env_var, const int default_val);

bool EnvVarGetBool(const std::string& env_var, const bool default_val);

// Helpers taking a URL and a timeout.
// NOTE(review): the unit of `timeout` is not visible here - TODO confirm.
std::string EnvHttpGet(const std::string& url, const std::string& default_val,
                       const long timeout);

int EnvHttpGetInt(const std::string& url, const int default_val,
                  const long timeout);

bool EnvCheckInstance(const long timeout);

// Writes GPU information through the three output pointers.
// NOTE(review): the meaning of the int return value is not visible here.
int EnvGetGpuInfo(int* count, int* major, int* minor);
48 | 
49 | }  // namespace hybridbackend
50 | 
51 | #endif  // HYBRIDBACKEND_COMMON_ENV_H_
52 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/graph/common/replacing.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #ifndef HYBRIDBACKEND_TENSORFLOW_GRAPH_COMMON_REPLACING_H_
17 | #define HYBRIDBACKEND_TENSORFLOW_GRAPH_COMMON_REPLACING_H_
18 | 
19 | #if HYBRIDBACKEND_TENSORFLOW
20 | 
21 | #include <map>
22 | #include <string>
23 | #include <vector>
24 | 
25 | #include <tensorflow/core/graph/graph.h>
26 | 
27 | namespace tensorflow {
28 | namespace hybridbackend {
29 | 
// Builder-style description of replacing ops of type `op_type` with ops of
// type `optimized_op_type`. The With*/Packed methods return *this so calls can
// be chained; `In` applies the description to a graph.
// NOTE(review): only declarations are visible here; the matching rules are
// defined in the implementation file.
class Replace {
 public:
  Replace(const string& op_type, const string& optimized_op_type);
  Replace& WithDevice(const string& device);
  Replace& WithTypeAttr(const string& attr_name,
                        const std::vector<DataType>& constraints);
  Replace& WithExtraIntAttr(const string& attr_name);
  Replace& Packed();
  Status In(Graph* graph);
  // NOTE(review): presumably reports the number of matches through
  // `poccurrence_count` - confirm.
  Status In(Graph* graph, int64* poccurrence_count);

 private:
  string op_type_;
  string optimized_op_type_;
  string device_;
  bool packed_;
  // Type constraints keyed by attribute name.
  std::map<string, std::vector<DataType>> type_attrs_;
  std::vector<string> extra_int_attrs_;

  TF_DISALLOW_COPY_AND_ASSIGN(Replace);
};
51 | 
52 | }  // namespace hybridbackend
53 | }  // namespace tensorflow
54 | 
55 | #endif  // HYBRIDBACKEND_TENSORFLOW
56 | #endif  // HYBRIDBACKEND_TENSORFLOW_GRAPH_COMMON_REPLACING_H_
57 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/ops/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''Operations in HybridBackend.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 
23 | import os as _os
24 | 
25 | 
def disable_optimization():
  r'''Turn off optimizations for operators on GPU.

  Sets the environment variable `HB_OP_OPTIMIZATION_DISABLED` to `'1'`.
  '''
  _os.environ.update(HB_OP_OPTIMIZATION_DISABLED='1')
30 | 
31 | 
def enable_optimization(logging_level=None, relocate_ops=False):
  r'''Enable optimizations for operators on GPU.

  Sets `HB_OP_OPTIMIZATION_DISABLED` to `'0'`. When `logging_level` is given,
  the optimization modules are added to `TF_CPP_VMODULE` with that level;
  entries of other modules are kept, while earlier entries of the
  optimization modules are replaced so that repeated calls neither grow the
  variable nor leave conflicting levels behind.

  Args:
    logging_level: Verbosity level for logs of the optimization modules, or
      None to leave `TF_CPP_VMODULE` untouched.
    relocate_ops: Enable relocation of ops. Passing False does not reset a
      previously enabled relocation.
  '''
  _os.environ['HB_OP_OPTIMIZATION_DISABLED'] = '0'
  if logging_level is not None:
    modules = (
      'op_optimization', 'replacing', 'pruning', 'relocation', 'packing',
      'fusion')
    # Keep entries of unrelated modules, drop empty and stale ones.
    entries = [
      entry for entry in _os.environ.get('TF_CPP_VMODULE', '').split(',')
      if entry.strip() and entry.split('=', 1)[0].strip() not in modules]
    entries.extend(f'{module}={logging_level}' for module in modules)
    _os.environ['TF_CPP_VMODULE'] = ','.join(entries)
  if relocate_ops:
    _os.environ['HB_OP_RELOCATION_ENABLED'] = '1'
54 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/distribute/partition/modulo_functors.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | #ifndef HYBRIDBACKEND_TENSORFLOW_DISTRIBUTE_PARTITION_MODULO_FUNCTORS_H_
16 | #define HYBRIDBACKEND_TENSORFLOW_DISTRIBUTE_PARTITION_MODULO_FUNCTORS_H_
17 | 
18 | #if HYBRIDBACKEND_TENSORFLOW
19 | 
#include <vector>

#include <tensorflow/core/framework/tensor.h>
#include <tensorflow/core/public/version.h>
22 | 
23 | namespace tensorflow {
24 | 
25 | class OpKernelContext;
26 | 
27 | namespace hybridbackend {
28 | namespace functor {
29 | 
// Functor declaration only; no definition is visible in this header.
// Takes one input tensor and three output tensors (`output`, `sizes`,
// `indices`) together with the kernel context.
// NOTE(review): presumably splits `input` into `num_partitions` parts by
// value modulo `num_partitions`; confirm against the implementation.
template <typename Device, typename T>
struct PartitionByModulo {
  void operator()(const int32 num_partitions, const Tensor& input,
                  Tensor* output, Tensor* sizes, Tensor* indices,
                  OpKernelContext* ctx);
};
36 | 
// Functor declaration only; no definition is visible in this header.
// Takes a vector of inputs and one vector of output tensors for each of
// outputs, sizes and indices.
// NOTE(review): presumably the i-th entry of each output vector belongs to
// the i-th input; confirm against the implementation.
template <typename Device, typename T>
struct PartitionByModuloN {
  void operator()(const int32 num_partitions, const std::vector<Tensor>& inputs,
                  std::vector<Tensor*>& outputs,
                  std::vector<Tensor*>& outputs_sizes,
                  std::vector<Tensor*>& outputs_indices, OpKernelContext* ctx);
};
44 | 
45 | }  // namespace functor
46 | }  // namespace hybridbackend
47 | }  // namespace tensorflow
48 | 
49 | #endif  // HYBRIDBACKEND_TENSORFLOW
50 | #endif  // HYBRIDBACKEND_TENSORFLOW_DISTRIBUTE_PARTITION_MODULO_FUNCTORS_H_
51 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''HybridBackend for TensorFlow.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 
23 | import contextlib as _ctxlib
24 | 
25 | from hybridbackend.libhybridbackend import buildinfo
26 | from hybridbackend.tensorflow.framework.config import get_session_config
27 | from hybridbackend.tensorflow.framework.config import wraps_session_config
28 | from hybridbackend.tensorflow.framework.context import Context
29 | from hybridbackend.tensorflow.framework.context import context
30 | from hybridbackend.tensorflow.framework.rewriting import function
31 | from hybridbackend.tensorflow.framework.rewriting import scope
32 | from hybridbackend.tensorflow.ops import *
33 | from hybridbackend.tensorflow.wraps import wraps
34 | 
35 | from . import data
36 | from . import distribute
37 | from . import embedding
38 | from . import estimator
39 | from . import keras
40 | from . import metrics
41 | from . import pipeline
42 | from . import training as train
43 | 
44 | __version__ = buildinfo()  # Package version, taken from libhybridbackend's buildinfo().
45 | 
46 | 
47 | @_ctxlib.contextmanager
48 | def embedding_scope(**kwargs):
49 |   r'''Scope for defining embedding weights.
50 |   '''
51 |   kwargs.setdefault('sharding', True)
52 |   with scope(**kwargs) as ctx:
53 |     yield ctx
54 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/embedding/lookup_functors.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | #ifndef HYBRIDBACKEND_TENSORFLOW_EMBEDDING_LOOKUP_FUNCTORS_H_
16 | #define HYBRIDBACKEND_TENSORFLOW_EMBEDDING_LOOKUP_FUNCTORS_H_
17 | 
18 | #if HYBRIDBACKEND_TENSORFLOW
19 | 
20 | #include <tensorflow/core/framework/op_kernel.h>
21 | #include <tensorflow/core/framework/tensor.h>
22 | #include <tensorflow/core/framework/tensor_reference.h>
23 | #include <tensorflow/core/public/version.h>
24 | 
25 | #if GOOGLE_CUDA
26 | #include <cuda.h>
27 | #include <cuda_runtime.h>
28 | #endif
29 | 
30 | namespace tensorflow {
31 | 
32 | class OpKernelContext;
33 | 
34 | namespace hybridbackend {
35 | namespace functor {
36 | 
37 | #if GOOGLE_CUDA
38 | template <typename T>
39 | struct LookupFunctor {
40 |   // Element type of the key buffers (`d_keys`, `d_keys_cache`).
41 |   using Type = T;
42 |   // Declaration only. NOTE(review): `d_`-prefixed pointers are presumably device memory -- confirm in the definition.
43 |   void operator()(int32* d_miss_count, int32* d_hit_and_miss_keys_indices,
44 |                   T* d_hit_cache_indices_and_miss_keys,
45 |                   const T keys_cache_slab_count, const T* d_keys_cache,
46 |                   const int32 key_count, const T* d_keys,
47 |                   const Eigen::GpuDevice& d);
48 | };
49 | 
50 | #endif  // GOOGLE_CUDA
51 | 
52 | }  // namespace functor
53 | }  // namespace hybridbackend
54 | }  // namespace tensorflow
55 | 
56 | #endif  // HYBRIDBACKEND_TENSORFLOW
57 | #endif  // HYBRIDBACKEND_TENSORFLOW_EMBEDDING_LOOKUP_FUNCTORS_H_
58 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/data/sync/dataset.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''SyncReplicasDataset that syncs data between replicas.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 
23 | try:  # Use the V2 implementation when DatasetV2 is importable; any ImportError below selects V1.
24 |   from tensorflow.python.data.ops.dataset_ops import DatasetV2 as _dataset  # pylint: disable=unused-import, ungrouped-imports, line-too-long # noqa: F401
25 |   # The import above is unused on purpose: it only has to succeed or raise.
26 |   from hybridbackend.tensorflow.data.sync.dataset_v2 import \
27 |     _SyncReplicasDatasetV2 as _SyncReplicasDataset
28 |   _SyncReplicasDataset.__module__ = __name__  # Present the class as defined in this module.
29 |   _SyncReplicasDataset.__name__ = '_SyncReplicasDataset'  # Drop the version suffix from the class name.
30 | 
31 |   from hybridbackend.tensorflow.data.sync.dataset_v2 import \
32 |     SyncReplicasDatasetV2 as SyncReplicasDataset
33 |   SyncReplicasDataset.__module__ = __name__
34 |   SyncReplicasDataset.__name__ = 'SyncReplicasDataset'
35 | except ImportError:  # Fallback: expose the V1 classes under the same two names.
36 |   from hybridbackend.tensorflow.data.sync.dataset_v1 import \
37 |     _SyncReplicasDatasetV1 as _SyncReplicasDataset
38 |   _SyncReplicasDataset.__module__ = __name__
39 |   _SyncReplicasDataset.__name__ = '_SyncReplicasDataset'
40 | 
41 |   from hybridbackend.tensorflow.data.sync.dataset_v1 import \
42 |     SyncReplicasDatasetV1 as SyncReplicasDataset
43 |   SyncReplicasDataset.__module__ = __name__
44 |   SyncReplicasDataset.__name__ = 'SyncReplicasDataset'
45 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/graph/common/relocation.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #ifndef HYBRIDBACKEND_TENSORFLOW_GRAPH_COMMON_RELOCATION_H_
17 | #define HYBRIDBACKEND_TENSORFLOW_GRAPH_COMMON_RELOCATION_H_
18 | 
19 | #if HYBRIDBACKEND_TENSORFLOW
20 | 
21 | #include <string>
22 | #include <vector>
23 | 
24 | #include <tensorflow/core/graph/graph.h>
25 | 
26 | namespace tensorflow {
27 | namespace hybridbackend {
28 | 
29 | class RelocateOutputs {  // Chainable configuration keyed by an op type and applied to a Graph through In().
30 |  public:
31 |   explicit RelocateOutputs(const string& op_type);  // explicit: no implicit string -> RelocateOutputs conversion.
32 |   RelocateOutputs& WithDevice(const string& device);  // Returns a reference so that calls can be chained.
33 |   RelocateOutputs& Force();  // NOTE(review): presumably sets force_ -- confirm in the .cc file.
34 |   Status In(Graph* graph);
35 |   Status In(Graph* graph, int64* poccurrence_count);  // NOTE(review): presumably reports the match count through the pointer -- confirm.
36 | 
37 |  private:
38 |   string op_type_;
39 |   string device_;
40 |   bool force_;
41 | 
42 |   TF_DISALLOW_COPY_AND_ASSIGN(RelocateOutputs);
43 | };
44 | 
45 | class Relocate {  // Chainable configuration keyed by an op type and applied to a Graph through In().
46 |  public:
47 |   explicit Relocate(const string& op_type);  // explicit: no implicit string -> Relocate conversion.
48 |   Relocate& WithDevice(const string& device);  // Returns a reference so that calls can be chained.
49 |   Relocate& WithInput(const int32 input);  // NOTE(review): presumably selects an input index stored in input_ -- confirm.
50 |   Status In(Graph* graph);
51 |   Status In(Graph* graph, int64* poccurrence_count);  // NOTE(review): presumably reports the match count through the pointer -- confirm.
52 | 
53 |  private:
54 |   string op_type_;
55 |   string device_;
56 |   int32 input_;
57 | 
58 |   TF_DISALLOW_COPY_AND_ASSIGN(Relocate);
59 | };
60 | 
61 | }  // namespace hybridbackend
62 | }  // namespace tensorflow
63 | 
64 | #endif  // HYBRIDBACKEND_TENSORFLOW
65 | #endif  // HYBRIDBACKEND_TENSORFLOW_GRAPH_COMMON_RELOCATION_H_
66 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/graph/common/helper.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #ifndef HYBRIDBACKEND_TENSORFLOW_GRAPH_COMMON_HELPER_H_
17 | #define HYBRIDBACKEND_TENSORFLOW_GRAPH_COMMON_HELPER_H_
18 | 
19 | #if HYBRIDBACKEND_TENSORFLOW
20 | 
21 | #include <string>
22 | #include <vector>
23 | 
24 | #include <tensorflow/core/graph/algorithm.h>
25 | #include <tensorflow/core/graph/graph.h>
26 | #include <tensorflow/core/public/version.h>
27 | 
28 | namespace tensorflow {
29 | namespace hybridbackend {
30 | 
31 | #if (TF_MAJOR_VERSION * 1000L + TF_MINOR_VERSION) < 1014L  // Declared only when building against TensorFlow older than 1.14.
32 | template <typename T>
33 | void DFSFromHelper(const Graph& g, gtl::ArraySlice<T> start,  // Declaration only; generic over the node handle type T.
34 |                    const std::function<void(T)>& enter,
35 |                    const std::function<void(T)>& leave,
36 |                    const NodeComparator& stable_comparator,
37 |                    const EdgeFilter& edge_filter);
38 | 
39 | void DFSFrom(const Graph& g, gtl::ArraySlice<Node*> start,  // NOTE(review): presumably a depth-first walk from `start` calling enter/leave per node -- confirm.
40 |              const std::function<void(Node*)>& enter,
41 |              const std::function<void(Node*)>& leave,
42 |              const NodeComparator& stable_comparator = {},  // Optional; defaults to an empty comparator.
43 |              const EdgeFilter& edge_filter = {});  // Optional; defaults to an empty filter.
44 | #endif
45 | 
46 | string NodeJoin(const std::vector<Node*>& nodes, const string& delim);  // NOTE(review): presumably joins a textual form of each node with `delim` -- confirm in the .cc file.
47 | 
48 | }  // namespace hybridbackend
49 | }  // namespace tensorflow
50 | 
51 | #endif  // HYBRIDBACKEND_TENSORFLOW
52 | #endif  // HYBRIDBACKEND_TENSORFLOW_GRAPH_COMMON_HELPER_H_
53 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/data/tabular/orc.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #ifndef HYBRIDBACKEND_TENSORFLOW_DATA_TABULAR_ORC_H_
17 | #define HYBRIDBACKEND_TENSORFLOW_DATA_TABULAR_ORC_H_
18 | 
19 | #include <memory>
20 | #include <vector>
21 | 
22 | #include <tensorflow/core/framework/op_kernel.h>
23 | #include <tensorflow/core/framework/types.h>
24 | 
25 | #include "hybridbackend/tensorflow/data/tabular/table.h"
26 | 
27 | namespace tensorflow {
28 | namespace hybridbackend {
29 | 
30 | class OrcAccess : public TableAccess {  // TableAccess subclass whose state lives behind a private Impl.
31 |  public:
32 |   OrcAccess(OpKernelContext* ctx, const TableFormat& format,
33 |             const string& filename, const int64 batch_size,
34 |             const std::vector<string>& field_names,
35 |             const DataTypeVector& field_dtypes,
36 |             const std::vector<int32>& field_ragged_ranks,
37 |             const std::vector<PartialTensorShape>& field_shapes,
38 |             const bool drop_remainder, const bool skip_corrupted_data);
39 |   // `override` alone is enough below; it already requires a virtual base member.
40 |   int64 Count() const override;
41 | 
42 |   Status Open() override;
43 | 
44 |   Status Open(const int64 start, const int64 end) override;
45 | 
46 |   Status Read(std::vector<Tensor>* output_tensors) override;
47 | 
48 |   virtual ~OrcAccess();
49 | 
50 |  private:
51 |   class Impl;  // Only forward-declared here (pimpl); must be defined where ~OrcAccess() is.
52 |   std::unique_ptr<Impl> pimpl_;
53 | };
54 | 
55 | }  // namespace hybridbackend
56 | }  // namespace tensorflow
57 | 
58 | #endif  // HYBRIDBACKEND_TENSORFLOW_DATA_TABULAR_ORC_H_
59 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/graph/optimize_memory.cc:
--------------------------------------------------------------------------------
 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #if HYBRIDBACKEND_TENSORFLOW
17 | 
18 | #include <map>
19 | #include <set>
20 | #include <vector>
21 | 
22 | #include <stdlib.h>
23 | 
24 | #include <absl/strings/str_cat.h>
25 | #include <tensorflow/core/graph/node_builder.h>
26 | 
27 | #include "hybridbackend/common/env.h"
28 | #include "hybridbackend/tensorflow/graph/common/packing.h"
29 | #include "hybridbackend/tensorflow/graph/common/rewriting.h"
30 | #include "hybridbackend/tensorflow/graph/op_optimization.h"
31 | 
32 | namespace tensorflow {
33 | namespace hybridbackend {
34 | 
35 | // Optimization pass that only prepares allocator-related environment
36 | // variables; the graph passed to Optimize() is not modified.
37 | class OptimizeMemoryReplacingPass : public OpOptimizationPass {
38 |  public:
39 |   // Seeds HB_MEMORY_DECAY_MILLIS (60000 unless already set) and MALLOC_CONF
40 |   // (unless already set). Returns OK in every case and does nothing when
41 |   // `disabled` is true.
42 |   Status Optimize(Graph* graph, const SessionOptions* options,
43 |                   const bool disabled) override {
44 |     if (TF_PREDICT_FALSE(disabled)) {
45 |       return Status::OK();
46 |     }
47 | 
48 |     ::hybridbackend::EnvVarSetIfNotExists("HB_MEMORY_DECAY_MILLIS", 60000);
49 |     const int kMemoryDecayMillis =
50 |         ::hybridbackend::EnvVarGetInt("HB_MEMORY_DECAY_MILLIS", 0);
51 |     // Carry the decay time into the allocator options. Previously the value
52 |     // was only logged, so HB_MEMORY_DECAY_MILLIS never reached MALLOC_CONF.
53 |     // NOTE(review): option names follow jemalloc's MALLOC_CONF syntax.
54 |     const std::string malloc_conf = absl::StrCat(
55 |         "background_thread:true,metadata_thp:auto,dirty_decay_ms:",
56 |         kMemoryDecayMillis, ",muzzy_decay_ms:", kMemoryDecayMillis);
57 |     ::hybridbackend::EnvVarSetIfNotExists("MALLOC_CONF", malloc_conf.c_str());
58 |     VLOG(1) << "Memory decay set to " << kMemoryDecayMillis << "ms";
59 | 
60 |     return Status::OK();
61 |   }
62 | };
63 | 
64 | REGISTER_REPLACING_OPTIMIZATION(OptimizeMemoryReplacingPass);
65 | 
66 | }  // namespace hybridbackend
67 | }  // namespace tensorflow
68 | 
69 | #endif  // HYBRIDBACKEND_TENSORFLOW
70 | 


--------------------------------------------------------------------------------
/.github/workflows/cpu.yaml:
--------------------------------------------------------------------------------
 1 | name: release deploy on cpu
 2 | 
 3 | on: workflow_dispatch  # Runs only when triggered manually.
 4 | 
 5 | env:  # Shared by every step of the job below.
 6 |   IMAGE: registry.cn-shanghai.aliyuncs.com/pai-dlc/hybridbackend:developer-tf1.15-py3.6-manylinux_2_24  # Passed to the helm chart via --set image.
 7 |   JOBNAME: hbci-${{ github.run_id }}  # Helm release name, unique per workflow run.
 8 |   PODNAME: hbci-${{ github.run_id }}-chief-0  # Pod targeted by kubectl exec/cp; NOTE(review): assumes the chart names its pod <release>-chief-0.
 9 | 
10 | jobs:
11 |   deploy:
12 |     runs-on: ubuntu-latest
13 |     environment: tf1.15-py3.6-manylinux_2_24  # GitHub environment the job runs under.
14 |     steps:
15 |     - name: Checkout Code
16 |       uses: actions/checkout@v3
17 |       with:
18 |         submodules: 'true'  # Also check out git submodules.
19 |     - name: Setup Environment  # Configures cluster access from repository secrets.
20 |       uses: aliyun/ack-set-context@v1
21 |       with:
22 |         access-key-id: "${{ secrets.ACCESS_KEY_ID }}"
23 |         access-key-secret: "${{ secrets.ACCESS_KEY_SECRET }}"
24 |         cluster-id: "${{ secrets.ACK_CLUSTER_ID }}"
25 |     - name: Upload  # Installs the helm release (gpus=0), then runs .github/helm/upload against the pod.
26 |       run: |-
27 |         helm install ${JOBNAME} .github/helm/ \
28 |         --set image=${IMAGE} \
29 |         --set gpus=0 && \
30 |         .github/helm/upload ${PODNAME}
31 |     - name: Build & Check  # Runs build/install inside the pod with HB_TEST_LOGDIR=build/reports.
32 |       run: |-
33 |         kubectl exec -it ${PODNAME} -- \
34 |         build/install HB_TEST_LOGDIR=build/reports
35 |     - name: Download  # Archives build/release and build/reports in the pod, copies both out, unpacks into dist/ and reports/.
36 |       run: |-
37 |         kubectl exec -it ${PODNAME} -- \
38 |         sh -c 'tar -czf hybridbackend.tgz -C build/release/ .' && \
39 |         kubectl cp ${PODNAME}:hybridbackend.tgz ./hybridbackend.tgz --retries=3 && \
40 |         mkdir -p dist && tar -xzf ./hybridbackend.tgz -C dist/ && \
41 |         kubectl exec -it ${PODNAME} -- \
42 |         sh -c 'tar -czf reports.tgz -C build/reports/ .' && \
43 |         kubectl cp ${PODNAME}:reports.tgz ./reports.tgz --retries=3 && \
44 |         mkdir -p reports && tar -xzf ./reports.tgz -C reports/
45 |     - name: Report  # Publishes test results read from the XML files unpacked under reports/.
46 |       uses: EnricoMi/publish-unit-test-result-action@v2
47 |       with:
48 |         check_name: Test Results
49 |         files: "reports/**/*.xml"
50 |     - name: Publish  # Uploads the packages with the PyPI API token from repository secrets.
51 |       uses: pypa/gh-action-pypi-publish@release/v1
52 |       with:
53 |         skip-existing: true  # Kebab-case input name; `skip_existing` is the deprecated spelling.
54 |         user: __token__
55 |         password: ${{ secrets.PYPI_API_TOKEN }}
56 |     - name: Cleanup Environment  # Removes the helm release created by the Upload step.
57 |       if: always()  # Run even when an earlier step failed.
58 |       run: |-
59 |         helm uninstall ${JOBNAME}
60 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/data/tabular/parquet.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #ifndef HYBRIDBACKEND_TENSORFLOW_DATA_TABULAR_PARQUET_H_
17 | #define HYBRIDBACKEND_TENSORFLOW_DATA_TABULAR_PARQUET_H_
18 | 
19 | #include <memory>
20 | #include <vector>
21 | 
22 | #include <tensorflow/core/framework/op_kernel.h>
23 | #include <tensorflow/core/framework/types.h>
24 | 
25 | #include "hybridbackend/tensorflow/data/tabular/table.h"
26 | 
27 | namespace tensorflow {
28 | namespace hybridbackend {
29 | 
30 | class ParquetAccess : public TableAccess {  // TableAccess subclass whose state lives behind a private Impl.
31 |  public:
32 |   ParquetAccess(OpKernelContext* ctx, const TableFormat& format,
33 |                 const string& filename, const int64 batch_size,
34 |                 const std::vector<string>& field_names,
35 |                 const DataTypeVector& field_dtypes,
36 |                 const std::vector<int32>& field_ragged_ranks,
37 |                 const std::vector<PartialTensorShape>& field_shapes,
38 |                 const bool drop_remainder, const bool skip_corrupted_data);
39 |   // `override` alone is enough below; it already requires a virtual base member.
40 |   int64 Count() const override;
41 | 
42 |   Status Open() override;
43 | 
44 |   Status Open(const int64 start, const int64 end) override;
45 | 
46 |   Status Read(std::vector<Tensor>* output_tensors) override;
47 | 
48 |   virtual ~ParquetAccess();
49 | 
50 |  private:
51 |   class Impl;  // Only forward-declared here (pimpl); must be defined where ~ParquetAccess() is.
52 |   std::unique_ptr<Impl> pimpl_;
53 | };
54 | 
55 | }  // namespace hybridbackend
56 | }  // namespace tensorflow
57 | 
58 | #endif  // HYBRIDBACKEND_TENSORFLOW_DATA_TABULAR_PARQUET_H_
59 | 


--------------------------------------------------------------------------------
/.github/workflows/gpu.yaml:
--------------------------------------------------------------------------------
 1 | name: release deploy on gpu
 2 | 
 3 | on: workflow_dispatch  # Runs only when triggered manually.
 4 | 
 5 | env:  # Shared by every step of the job below.
 6 |   IMAGE: registry.cn-shanghai.aliyuncs.com/pai-dlc/hybridbackend:developer-tf1.15-py3.8-cu121-ubuntu20.04  # Passed to the helm chart via --set image.
 7 |   JOBNAME: hbci-${{ github.run_id }}  # Helm release name, unique per workflow run.
 8 |   PODNAME: hbci-${{ github.run_id }}-chief-0  # Pod targeted by kubectl exec/cp; NOTE(review): assumes the chart names its pod <release>-chief-0.
 9 | 
10 | jobs:
11 |   deploy:
12 |     runs-on: ubuntu-latest
13 |     environment: tf1.15-py3.8-cu121-ubuntu20.04  # GitHub environment the job runs under.
14 |     steps:
15 |     - name: Checkout Code
16 |       uses: actions/checkout@v3
17 |       with:
18 |         submodules: 'true'  # Also check out git submodules.
19 |     - name: Setup Environment  # Configures cluster access from repository secrets.
20 |       uses: aliyun/ack-set-context@v1
21 |       with:
22 |         access-key-id: "${{ secrets.ACCESS_KEY_ID }}"
23 |         access-key-secret: "${{ secrets.ACCESS_KEY_SECRET }}"
24 |         cluster-id: "${{ secrets.ACK_CLUSTER_ID }}"
25 |     - name: Upload  # Installs the helm release (gpus=2), then runs .github/helm/upload against the pod.
26 |       run: |-
27 |         helm install ${JOBNAME} .github/helm/ \
28 |         --set image=${IMAGE} \
29 |         --set gpus=2 && \
30 |         .github/helm/upload ${PODNAME}
31 |     - name: Build & Check  # Runs build/install inside the pod with HB_TEST_LOGDIR=build/reports.
32 |       run: |-
33 |         kubectl exec -it ${PODNAME} -- \
34 |         build/install HB_TEST_LOGDIR=build/reports
35 |     - name: Download  # Archives build/release and build/reports in the pod, copies both out, unpacks into dist/ and reports/.
36 |       run: |-
37 |         kubectl exec -it ${PODNAME} -- \
38 |         sh -c 'tar -czf hybridbackend.tgz -C build/release/ .' && \
39 |         kubectl cp ${PODNAME}:hybridbackend.tgz ./hybridbackend.tgz --retries=3 && \
40 |         mkdir -p dist && tar -xzf ./hybridbackend.tgz -C dist/ && \
41 |         kubectl exec -it ${PODNAME} -- \
42 |         sh -c 'tar -czf reports.tgz -C build/reports/ .' && \
43 |         kubectl cp ${PODNAME}:reports.tgz ./reports.tgz --retries=3 && \
44 |         mkdir -p reports && tar -xzf ./reports.tgz -C reports/
45 |     - name: Report  # Publishes test results read from the XML files unpacked under reports/.
46 |       uses: EnricoMi/publish-unit-test-result-action@v2
47 |       with:
48 |         check_name: Test Results
49 |         files: "reports/**/*.xml"
50 |     - name: Publish  # Uploads the packages with the PyPI API token from repository secrets.
51 |       uses: pypa/gh-action-pypi-publish@release/v1
52 |       with:
53 |         skip-existing: true  # Kebab-case input name; `skip_existing` is the deprecated spelling.
54 |         user: __token__
55 |         password: ${{ secrets.PYPI_API_TOKEN }}
56 |     - name: Cleanup Environment  # Removes the helm release created by the Upload step.
57 |       if: always()  # Run even when an earlier step failed.
58 |       run: |-
59 |         helm uninstall ${JOBNAME}
60 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/data/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | # isort: skip_file
16 | 
17 | r'''Input pipelines.
18 | '''
19 | 
20 | from __future__ import absolute_import
21 | from __future__ import division
22 | from __future__ import print_function
23 | 
24 | from hybridbackend.tensorflow.data.dataframe import DataFrame
25 | from hybridbackend.tensorflow.data.dataframe import parse
26 | from hybridbackend.tensorflow.data.dataframe import populate_defaults
27 | from hybridbackend.tensorflow.data.dataframe import unbatch_and_to_sparse
28 | from hybridbackend.tensorflow.data.deduplicate.dataset import deduplicate
29 | from hybridbackend.tensorflow.data.prefetch.iterator import Iterator
30 | from hybridbackend.tensorflow.data.rebatch.dataset import RebatchDataset
31 | from hybridbackend.tensorflow.data.rebatch.dataset import rebatch
32 | from hybridbackend.tensorflow.data.sync.dataset import SyncReplicasDataset
33 | from hybridbackend.tensorflow.data.tabular.dataset import Dataset
34 | from hybridbackend.tensorflow.data.tabular.dataset import ParquetDataset
35 | from hybridbackend.tensorflow.data.tabular.dataset import read_parquet
36 | 
37 | # HybridBackend operators must be loaded before TensorFlow operators to
38 | # make AWS SDK implementation correct.
39 | from hybridbackend.tensorflow.data.iterators import make_initializable_iterator
40 | from hybridbackend.tensorflow.data.iterators import make_one_shot_iterator
41 | from hybridbackend.tensorflow.framework.context import Context as _ctx
42 | 
43 | _ = (  # Result is discarded; the statement exists for the chained register() calls.
44 |   _ctx.get().options
45 |   .register('data_batch_count', 1)  # NOTE(review): the second argument is presumably the option's default -- confirm.
46 |   .register('data_sync_drop_remainder', True))
47 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/distribute/partition/dual_modulo_functors.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | #ifndef HYBRIDBACKEND_TENSORFLOW_DISTRIBUTE_PARTITION_DUAL_MODULO_FUNCTORS_H_
16 | #define HYBRIDBACKEND_TENSORFLOW_DISTRIBUTE_PARTITION_DUAL_MODULO_FUNCTORS_H_
17 | 
18 | #if HYBRIDBACKEND_TENSORFLOW
19 | 
20 | #include <vector>
21 | 
22 | #include <tensorflow/core/framework/tensor.h>
23 | #include <tensorflow/core/public/version.h>
24 | 
25 | namespace tensorflow {
26 | 
27 | class OpKernelContext;
28 | 
29 | namespace hybridbackend {
30 | namespace functor {
31 | 
32 | // Tag types, only forward-declared in this header.
33 | // NOTE(review): presumably used as the `Stage` template argument of the
34 | // functors below -- confirm against the definitions.
35 | struct ComputeShardAtStageOne;
36 | struct ComputeShardAtStageTwo;
37 | struct ComputeShardOnGpuAtStageOne;
38 | struct ComputeShardOnGpuAtStageTwo;
39 | 
40 | // Functor interface for partitioning a single tensor, parameterized by a
41 | // partition count and an extra `modulus`. Only the call signature is
42 | // declared here; the definitions are expected elsewhere per Device/T/Stage.
43 | // NOTE(review): how `modulus` and `Stage` combine to pick a partition is not
44 | // visible from this header -- confirm against the definitions.
45 | template <typename Device, typename T, typename Stage>
46 | struct PartitionByDualModulo {
47 |   void operator()(const int32 num_partitions, const int32 modulus,
48 |                   const Tensor& input, Tensor* output, Tensor* sizes,
49 |                   Tensor* indices, OpKernelContext* ctx);
50 | };
51 | 
52 | // Multi-input counterpart of PartitionByDualModulo: takes a list of inputs
53 | // plus one list of tensor pointers each for outputs, sizes and indices.
54 | // NOTE(review): presumably entry i of every output list corresponds to
55 | // inputs[i] -- confirm against the definitions.
56 | template <typename Device, typename T, typename Stage>
57 | struct PartitionByDualModuloN {
58 |   void operator()(const int32 num_partitions, const int32 modulus,
59 |                   const std::vector<Tensor>& inputs,
60 |                   std::vector<Tensor*>& outputs,
61 |                   std::vector<Tensor*>& outputs_sizes,
62 |                   std::vector<Tensor*>& outputs_indices, OpKernelContext* ctx);
63 | };
64 | 
65 | }  // namespace functor
66 | }  // namespace hybridbackend
67 | }  // namespace tensorflow
68 | 
69 | #endif  // HYBRIDBACKEND_TENSORFLOW
70 | #endif  // HYBRIDBACKEND_TENSORFLOW_DISTRIBUTE_PARTITION_DUAL_MODULO_FUNCTORS_H_
71 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/graph/common/rewriting.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #ifndef HYBRIDBACKEND_TENSORFLOW_GRAPH_COMMON_REWRITING_H_
17 | #define HYBRIDBACKEND_TENSORFLOW_GRAPH_COMMON_REWRITING_H_
18 | 
19 | #if HYBRIDBACKEND_TENSORFLOW
20 | 
21 | #include <map>
22 | #include <string>
23 | #include <vector>
24 | 
25 | #include <tensorflow/core/graph/graph.h>
26 | 
27 | namespace tensorflow {
28 | namespace hybridbackend {
29 | 
30 | // Chainable description of a rewrite, built from two op names and a set of
31 | // attribute names with defaults, then applied to a Graph through In().
32 | // Every With*() call returns a Rewrite& so that calls can be chained.
33 | // NOTE(review): the matching and replacement rules live in the .cc file;
34 | // presumably ops resembling `op_like_name` are replaced by `op_name` --
35 | // confirm there.
36 | class Rewrite {
37 |  public:
38 |   Rewrite(const string& op_like_name, const string& op_name);
39 |   Rewrite& WithDevice(const string& device);
40 |   Rewrite& WithTypeAttr(const string& attr_name, const DataType& default_attr);
41 |   Rewrite& WithShapeAttr(const string& attr_name,
42 |                          const TensorShape& default_attr);
43 |   Rewrite& WithIntAttr(const string& attr_name, const int32& default_attr);
44 |   Rewrite& WithStrAttr(const string& attr_name, const string& default_attr);
45 |   Rewrite& WithTypeListAttr(const string& attr_name);
46 |   Status In(Graph* graph);
47 |   // NOTE(review): presumably reports the number of rewritten occurrences
48 |   // through `poccurrence_count` -- confirm in the .cc file.
49 |   Status In(Graph* graph, int64* poccurrence_count);
50 | 
51 |  private:
52 |   string op_like_name_;
53 |   string op_name_;
54 |   string device_;
55 |   int32 num_inputs_;
56 |   // Attribute name -> default value, one map per attribute kind.
57 |   std::map<string, DataType> type_attrs_;
58 |   std::map<string, TensorShape> shape_attrs_;
59 |   std::map<string, int32> int_attrs_;
60 |   std::map<string, string> str_attrs_;
61 |   std::vector<string> type_list_attrs_;
62 | 
63 |   TF_DISALLOW_COPY_AND_ASSIGN(Rewrite);
64 | };
65 | 
66 | }  // namespace hybridbackend
67 | }  // namespace tensorflow
68 | 
69 | #endif  // HYBRIDBACKEND_TENSORFLOW
70 | #endif  // HYBRIDBACKEND_TENSORFLOW_GRAPH_COMMON_REWRITING_H_
71 | 


--------------------------------------------------------------------------------
/.github/workflows/cpu-nightly.yaml:
--------------------------------------------------------------------------------
 1 | name: nightly deploy on cpu
 2 | 
 3 | on: workflow_dispatch  # Runs only when triggered manually; no schedule is configured here.
 4 | 
 5 | env:  # Shared by every step of the job below.
 6 |   IMAGE: registry.cn-shanghai.aliyuncs.com/pai-dlc/hybridbackend:developer-tf1.15-py3.6-manylinux_2_24  # Passed to the helm chart via --set image.
 7 |   JOBNAME: hbci-${{ github.run_id }}  # Helm release name, unique per workflow run.
 8 |   PODNAME: hbci-${{ github.run_id }}-chief-0  # Pod targeted by kubectl exec/cp; NOTE(review): assumes the chart names its pod <release>-chief-0.
 9 | 
10 | jobs:
11 |   deploy:
12 |     runs-on: ubuntu-latest
13 |     environment: tf1.15-py3.6-manylinux_2_24  # GitHub environment the job runs under.
14 |     steps:
15 |     - name: Checkout Code
16 |       uses: actions/checkout@v3
17 |       with:
18 |         submodules: 'true'  # Also check out git submodules.
19 |     - name: Setup Environment  # Configures cluster access from repository secrets.
20 |       uses: aliyun/ack-set-context@v1
21 |       with:
22 |         access-key-id: "${{ secrets.ACCESS_KEY_ID }}"
23 |         access-key-secret: "${{ secrets.ACCESS_KEY_SECRET }}"
24 |         cluster-id: "${{ secrets.ACK_CLUSTER_ID }}"
25 |     - name: Upload  # Installs the helm release (gpus=0, build=.dev<run id>), then runs .github/helm/upload against the pod.
26 |       run: |-
27 |         helm install ${JOBNAME} .github/helm/ \
28 |         --set image=${IMAGE} \
29 |         --set build=.dev${{ github.run_id }} \
30 |         --set gpus=0 && \
31 |         .github/helm/upload ${PODNAME}
32 |     - name: Build & Check  # Runs build/install inside the pod with HB_TEST_LOGDIR=build/reports.
33 |       run: |-
34 |         kubectl exec -it ${PODNAME} -- \
35 |         build/install HB_TEST_LOGDIR=build/reports
36 |     - name: Download  # Archives build/release and build/reports in the pod, copies both out, unpacks into dist/ and reports/.
37 |       run: |-
38 |         kubectl exec -it ${PODNAME} -- \
39 |         sh -c 'tar -czf hybridbackend.tgz -C build/release/ .' && \
40 |         kubectl cp ${PODNAME}:hybridbackend.tgz ./hybridbackend.tgz --retries=3 && \
41 |         mkdir -p dist && tar -xzf ./hybridbackend.tgz -C dist/ && \
42 |         kubectl exec -it ${PODNAME} -- \
43 |         sh -c 'tar -czf reports.tgz -C build/reports/ .' && \
44 |         kubectl cp ${PODNAME}:reports.tgz ./reports.tgz --retries=3 && \
45 |         mkdir -p reports && tar -xzf ./reports.tgz -C reports/
46 |     - name: Report  # Publishes test results read from the XML files unpacked under reports/.
47 |       uses: EnricoMi/publish-unit-test-result-action@v2
48 |       with:
49 |         check_name: Test Results
50 |         files: "reports/**/*.xml"
51 |     - name: Publish
52 |       uses: pypa/gh-action-pypi-publish@release/v1
53 |       with:
54 |         skip_existing: true
55 |         user: __token__
56 |         password: ${{ secrets.TEST_PYPI_API_TOKEN }}
57 |         repository_url: https://test.pypi.org/legacy/
58 |     - name: Cleanup Environment
59 |       if: always()
60 |       run: |-
61 |         helm uninstall ${JOBNAME}
62 | 


--------------------------------------------------------------------------------
/.github/workflows/gpu-nightly.yaml:
--------------------------------------------------------------------------------
 1 | name: nightly deploy on gpu
 2 | 
 3 | on: workflow_dispatch  # Started manually; no schedule is configured in this file.
 4 | 
 5 | env:
 6 |   IMAGE: registry.cn-shanghai.aliyuncs.com/pai-dlc/hybridbackend:developer-tf1.15-py3.8-cu121-ubuntu20.04
 7 |   JOBNAME: hbci-${{ github.run_id }}  # Helm release name, unique per workflow run.
 8 |   PODNAME: hbci-${{ github.run_id }}-chief-0  # Pod targeted by the upload/exec/cp steps.
 9 | 
10 | jobs:
11 |   deploy:
12 |     runs-on: ubuntu-latest
13 |     environment: tf1.15-py3.8-cu121-ubuntu20.04
14 |     steps:
15 |     - name: Checkout Code
16 |       uses: actions/checkout@v3
17 |       with:
18 |         submodules: 'true'
19 |     - name: Setup Environment  # Presumably sets up cluster access for helm/kubectl below; confirm.
20 |       uses: aliyun/ack-set-context@v1
21 |       with:
22 |         access-key-id: "${{ secrets.ACCESS_KEY_ID }}"
23 |         access-key-secret: "${{ secrets.ACCESS_KEY_SECRET }}"
24 |         cluster-id: "${{ secrets.ACK_CLUSTER_ID }}"
25 |     - name: Upload  # Installs the chart in .github/helm/ (gpus=2), then runs .github/helm/upload on the pod.
26 |       run: |-
27 |         helm install ${JOBNAME} .github/helm/ \
28 |         --set image=${IMAGE} \
29 |         --set build=.dev${{ github.run_id }} \
30 |         --set gpus=2 && \
31 |         .github/helm/upload ${PODNAME}
32 |     - name: Build & Check  # Runs build/install inside the pod with HB_TEST_LOGDIR=build/reports.
33 |       run: |-
34 |         kubectl exec -it ${PODNAME} -- \
35 |         build/install HB_TEST_LOGDIR=build/reports
36 |     - name: Download  # Copies build/release/ into dist/ and build/reports/ into reports/ on the runner.
37 |       run: |-
38 |         kubectl exec -it ${PODNAME} -- \
39 |         sh -c 'tar -czf hybridbackend.tgz -C build/release/ .' && \
40 |         kubectl cp ${PODNAME}:hybridbackend.tgz ./hybridbackend.tgz --retries=3 && \
41 |         mkdir -p dist && tar -xzf ./hybridbackend.tgz -C dist/ && \
42 |         kubectl exec -it ${PODNAME} -- \
43 |         sh -c 'tar -czf reports.tgz -C build/reports/ .' && \
44 |         kubectl cp ${PODNAME}:reports.tgz ./reports.tgz --retries=3 && \
45 |         mkdir -p reports && tar -xzf ./reports.tgz -C reports/
46 |     - name: Report
47 |       uses: EnricoMi/publish-unit-test-result-action@v2
48 |       with:
49 |         check_name: Test Results
50 |         files: "reports/**/*.xml"  # XML files extracted by the Download step.
51 |     - name: Publish
52 |       uses: pypa/gh-action-pypi-publish@release/v1
53 |       with:
54 |         skip_existing: true
55 |         user: __token__
56 |         password: ${{ secrets.TEST_PYPI_API_TOKEN }}
57 |         repository_url: https://test.pypi.org/legacy/  # TestPyPI endpoint.
58 |     - name: Cleanup Environment
59 |       if: always()  # Runs even when an earlier step failed.
60 |       run: |-
61 |         helm uninstall ${JOBNAME}
62 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/common/fusion_helper.cu.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #ifndef HYBRIDBACKEND_TENSORFLOW_COMMON_FUSION_HELPER_H_
17 | #define HYBRIDBACKEND_TENSORFLOW_COMMON_FUSION_HELPER_H_
18 | 
19 | #if HYBRIDBACKEND_TENSORFLOW
20 | 
21 | #include <tensorflow/core/framework/op_kernel.h>
22 | #include <tensorflow/core/framework/tensor.h>
23 | #include <tensorflow/core/framework/tensor_reference.h>
24 | #include <tensorflow/core/public/version.h>
25 | 
26 | namespace tensorflow {
27 | 
28 | class OpKernelContext;
29 | 
30 | namespace hybridbackend {
31 | 
32 | // Writes `value` to ptr[row][col] wherever col < counts[row]. Rows range
33 | // over config.virtual_thread_count.y and cols over .x.
34 | template <typename T, typename N>
35 | __global__ void SetToNValue(const Cuda2DLaunchConfig config, const N* counts,
36 |                             T** ptr, T value) {
37 |   CUDA_AXIS_KERNEL_LOOP(row, config.virtual_thread_count.y, Y) {
38 |     CUDA_AXIS_KERNEL_LOOP(col, config.virtual_thread_count.x, X) {
39 |       if (col < counts[row]) ptr[row][col] = value;
40 |     }
41 |   }
42 | }
43 | 
44 | namespace functor {
45 | 
46 | #if GOOGLE_CUDA
47 | 
48 | template <typename T>
49 | struct CopyPtrsNFunctor {  // Declared only; no definition in this header.
50 |   void operator()(OpKernelContext* ctx, int8* head_host, int8* head_device,
51 |                   std::vector<const Tensor*>* inputs, int num_columns);
52 | };  // NOTE(review): presumably stages pointers of `inputs` host->device; confirm.
53 | 
54 | template <typename T>
55 | struct CopySizesNFunctor {  // Declared only; no definition in this header.
56 |   void operator()(OpKernelContext* ctx, T* input_host, T* input_device,
57 |                   int num_columns);
58 | };  // NOTE(review): presumably copies num_columns sizes host->device; confirm.
59 | 
60 | #endif  // GOOGLE_CUDA
61 | 
62 | }  // namespace functor
63 | }  // namespace hybridbackend
64 | }  // namespace tensorflow
65 | 
66 | #endif  // HYBRIDBACKEND_TENSORFLOW
67 | 
68 | #endif  // HYBRIDBACKEND_TENSORFLOW_COMMON_FUSION_HELPER_H_
69 | 


--------------------------------------------------------------------------------
/docs/tutorial/ranking/taobao/data/stats.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | # =============================================================================
17 | 
18 | r'''Calculate statistics of Taobao Click Logs Dataset.
19 | 
20 | See https://tianchi.aliyun.com/dataset/dataDetail?dataId=56 for more
21 | information.
22 | '''
23 | 
24 | from __future__ import absolute_import
25 | from __future__ import division
26 | from __future__ import print_function
27 | 
28 | import argparse
29 | import logging
30 | 
31 | import numpy as np
32 | import pandas as pd
33 | import tqdm
34 | 
35 | 
36 | def main(args):
37 |   users = []
38 |   ads = []
39 |   categories = []
40 |   brands = []
41 |   for day in tqdm.tqdm(range(args.ndays)):
42 |     fname = args.fname_template.format(day)
43 |     click_log = pd.read_parquet(fname)
44 |     users += pd.unique(click_log['user']).tolist()
45 |     ads += pd.unique(click_log['ad']).tolist()
46 |     categories += pd.unique(click_log['item_category']).tolist()
47 |     brands += pd.unique(click_log['item_brand']).tolist()
48 |     del click_log
49 |   users = np.unique(users)
50 |   logging.info('#users = %d', len(users))
51 |   del users
52 |   ads = np.unique(ads)
53 |   logging.info('#ads = %d', len(ads))
54 |   del ads
55 |   categories = np.unique(categories)
56 |   logging.info('#categories = %d', len(categories))
57 |   del categories
58 |   brands = np.unique(brands)
59 |   logging.info('#brands = %d', len(brands))
60 |   del brands
61 | 
62 | 
63 | if __name__ == '__main__':
64 |   logging.basicConfig(level=logging.INFO)  # main() reports via logging.info.
65 | 
66 |   parser = argparse.ArgumentParser()
67 |   parser.add_argument('--ndays', type=int, default=8)  # Days 0..ndays-1 are read.
68 |   parser.add_argument('--fname-template', default='./day_{}.parquet')  # {} <- day index.
69 |   main(parser.parse_args())
70 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/common/cast.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | #ifndef HYBRIDBACKEND_TENSORFLOW_COMMON_CAST_H_
16 | #define HYBRIDBACKEND_TENSORFLOW_COMMON_CAST_H_
17 | 
18 | #if HYBRIDBACKEND_TENSORFLOW
19 | 
20 | #include <tensorflow/core/framework/op_kernel.h>
21 | #include <tensorflow/core/framework/tensor.h>
22 | #include <tensorflow/core/framework/tensor_reference.h>
23 | #include <tensorflow/core/public/version.h>
24 | 
25 | #if GOOGLE_CUDA
26 | #include <cuda.h>
27 | #include <cuda_runtime.h>
28 | #include <tensorflow/core/common_runtime/gpu/gpu_event_mgr.h>
29 | #include <tensorflow/stream_executor/stream_executor.h>
30 | #endif
31 | 
32 | namespace tensorflow {
33 | 
34 | class OpKernelContext;
35 | 
36 | namespace hybridbackend {
37 | namespace functor {
38 | 
39 | #if GOOGLE_CUDA
40 | template <typename Tin, typename Tout>
41 | struct Cast {  // Declared only; no definition in this header.
42 |   void operator()(const Tensor& in, Tensor* out, OpKernelContext* ctx,
43 |                   cudaStream_t* stream);
44 | };  // NOTE(review): presumably converts `in` (Tin) into `out` (Tout); confirm.
45 | 
46 | template <typename Tin, typename Tout>
47 | struct CastN {  // Three overloads differing only in how the tensors are held.
48 |   void operator()(const std::vector<Tensor>& in, std::vector<Tensor>* out,
49 |                   OpKernelContext* ctx, cudaStream_t* stream);
50 |   void operator()(const std::vector<Tensor>& in, std::vector<Tensor*>* out,
51 |                   OpKernelContext* ctx, cudaStream_t* stream);
52 |   void operator()(const std::vector<Tensor*>& in, std::vector<Tensor*>* out,
53 |                   OpKernelContext* ctx, cudaStream_t* stream);
54 | };  // NOTE(review): presumably the multi-tensor form of Cast; confirm.
55 | 
56 | #endif  // GOOGLE_CUDA
57 | 
58 | }  // namespace functor
59 | }  // namespace hybridbackend
60 | }  // namespace tensorflow
61 | 
62 | #endif  // HYBRIDBACKEND_TENSORFLOW
63 | #endif  // HYBRIDBACKEND_TENSORFLOW_COMMON_CAST_H_
64 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/embedding/deeprecev.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''DeepRec EV as embedding tables.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 
23 | from tensorflow.python.ops import variable_scope as vs
24 | 
25 | from hybridbackend.tensorflow.embedding.sharding import \
26 |   ShardedEmbeddingWeightsRewriting
27 | 
28 | 
29 | class ShardedEmbeddingWeightsRewritingForDeepRecEV(
30 |     ShardedEmbeddingWeightsRewriting):  # pylint: disable=useless-object-inheritance
31 |   r'''Embedding lookup decorator for DeepRec EV.
32 |   '''
33 |   def __init__(self):
34 |     super().__init__()
35 |     self._prev_get_embedding_variable = None
36 | 
37 |   @property
38 |   def isdynamic(self):
39 |     r'''Whether embedding weights is dynamic.
40 |     '''
41 |     return True
42 | 
43 |   def begin(self):
44 |     r'''Rewrites API.
45 |     '''
46 |     try:
47 |       self._prev_get_embedding_variable = (
48 |         vs.VariableScope.get_embedding_variable)  # pylint: disable=protected-access
49 |       vs.VariableScope.get_embedding_variable = (  # pylint: disable=protected-access
50 |         self.wraps_build_embedding_weights(self._prev_get_embedding_variable))
51 |     except:  # pylint: disable=bare-except
52 |       pass
53 | 
54 |   def end(self):
55 |     r'''Revert API rewriting.
56 |     '''
57 |     try:
58 |       vs.VariableScope.get_embedding_variable = (  # pylint: disable=protected-access
59 |         self._prev_get_embedding_variable)
60 |     except:  # pylint: disable=bare-except
61 |       pass
62 | 
63 | 
64 | ShardedEmbeddingWeightsRewriting.register(  # Executed when this module is imported.
65 |   ShardedEmbeddingWeightsRewritingForDeepRecEV)
66 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/data/rebatch/dataset.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''Dataset that resizes batches of DataFrame values.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 
23 | import inspect
24 | 
25 | # pylint: disable=ungrouped-imports
26 | from hybridbackend.tensorflow.data.dataframe import input_fields
27 | 
28 | try:  # Prefer the DatasetV2-based implementation when it can be imported.
29 |   from tensorflow.python.data.ops.dataset_ops import DatasetV2 as _dataset  # pylint: disable=unused-import, line-too-long # noqa: F401
30 | 
31 |   from hybridbackend.tensorflow.data.rebatch.dataset_v2 import \
32 |     RebatchDatasetV2 as RebatchDataset
33 |   if inspect.isabstract(RebatchDataset):
34 |     raise ImportError  # Abstract V2 class: take the V1 fallback below.
35 |   RebatchDataset.__module__ = __name__  # Present the class under this module.
36 |   RebatchDataset.__name__ = 'RebatchDataset'
37 | except ImportError:  # Fall back to the V1 implementation.
38 |   from hybridbackend.tensorflow.data.rebatch.dataset_v1 import \
39 |     RebatchDatasetV1 as RebatchDataset
40 |   RebatchDataset.__module__ = __name__
41 |   RebatchDataset.__name__ = 'RebatchDataset'
42 |   assert not inspect.isabstract(RebatchDataset)  # V1 must be instantiable.
43 | # pylint: enable=ungrouped-imports
44 | 
45 | 
46 | def rebatch(
47 |     batch_size,
48 |     drop_remainder=False,
49 |     fields=None):
50 |   r'''Create a `RebatchDataset`.
51 | 
52 |   Args:
53 |     batch_size: Maxium number of samples in an output batch.
54 |     drop_remainder: (Optional.) If True, smaller final batch is dropped.
55 |       `False` by default.
56 |     fields: (Optional.) List of DataFrame fields. Fetched from `input_dataset`
57 |       by default.
58 |   '''
59 |   def _apply_fn(dataset):
60 |     return RebatchDataset(
61 |       dataset, input_fields(dataset, fields), batch_size,
62 |       drop_remainder=drop_remainder)
63 |   return _apply_fn
64 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/data/sync/utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''SyncReplicasDataset that reports the existence of next element.
17 | 
18 | This class is compatible with Tensorflow 1.12.
19 | '''
20 | 
21 | from tensorflow.python.framework import sparse_tensor
22 | from tensorflow.python.framework import tensor_spec
23 | from tensorflow.python.util import nest
24 | 
25 | from hybridbackend.tensorflow.framework.ops import TensorKinds
26 | 
27 | 
28 | def normalize(input_dataset):
29 |   r'''flattent to normalize tensors within the input_dataset.
30 |   '''
31 |   flattened_specs = nest.flatten(input_dataset.element_spec)
32 |   flattened_kinds = []
33 |   for spec in flattened_specs:
34 |     if isinstance(spec, tensor_spec.TensorSpec):
35 |       flattened_kinds.append(TensorKinds.VALUES)
36 |     elif isinstance(spec, sparse_tensor.SparseTensorSpec):
37 |       flattened_kinds.append(
38 |         sparse_tensor.SparseTensorValue(
39 |           TensorKinds.INDICES, TensorKinds.VALUES, TensorKinds.DENSE_SHAPE))
40 |     else:
41 |       raise ValueError(
42 |         'SyncReplicasDataset cannot support input datasets with outputs '
43 |         'other than tensors or sparse tensors')
44 |   return input_dataset.map(TensorKinds.normalize),\
45 |     nest.flatten(flattened_kinds), flattened_kinds
46 | 
47 | 
48 | def denormalize(input_dataset, element_spec, kinds, hook=None):
49 |   r'''denormalize all tensors returned by input_dataset.
50 |   '''
51 |   if hook is None:
52 |     return input_dataset.map(
53 |       lambda *args: TensorKinds.denormalize(
54 |         element_spec, [TensorKinds.VALUES] + kinds, args))
55 |   input_dataset = input_dataset.map(hook.register)
56 |   return input_dataset.map(
57 |     lambda *args: TensorKinds.denormalize(
58 |       element_spec, kinds, args))
59 | 


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''Configuration file for the Sphinx documentation builder.
17 | 
18 | This file only contains a selection of the most common options. For a full
19 | list see the documentation:
20 | https://www.sphinx-doc.org/en/master/usage/configuration.html
21 | '''
22 | 
23 | # -- Project information -----------------------------------------------------
24 | project = 'HybridBackend'
25 | author = 'Alibaba Group Holding Limited'
26 | copyright = '2021 Alibaba Group Holding Limited'  # pylint: disable=redefined-builtin
27 | release = 'latest'
28 | 
29 | # -- General configuration ---------------------------------------------------
30 | 
31 | # Add any Sphinx extension module names here, as strings. They can be
32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
33 | # ones.
34 | extensions = [
35 |   'myst_parser',  # Not a sphinx.ext.* module; presumably installed separately.
36 |   'sphinx.ext.autodoc',
37 |   'sphinx.ext.napoleon',
38 |   'sphinx.ext.autosectionlabel',  # Its warnings are silenced in suppress_warnings.
39 |   'sphinx.ext.autosummary',
40 |   'sphinx.ext.extlinks',
41 |   'sphinx.ext.mathjax',
42 |   'sphinx.ext.todo',
43 |   'sphinx.ext.ifconfig'
44 | ]
45 | 
46 | # Add any paths that contain templates here, relative to this directory.
47 | templates_path = ['_templates']
48 | 
49 | # List of patterns, relative to source directory, that match files and
50 | # directories to ignore when looking for source files.
51 | # This pattern also affects html_static_path and html_extra_path.
52 | exclude_patterns = []
53 | 
54 | # -- Options for HTML output -------------------------------------------------
55 | 
56 | # The theme to use for HTML and HTML Help pages.  See the documentation for
57 | # a list of builtin themes.
58 | #
59 | html_theme = 'sphinx_rtd_theme'
60 | html_static_path = ['images']  # Relative to this directory, i.e. docs/images.
61 | 
62 | suppress_warnings = [
63 |   'autosectionlabel.*'  # All warning types of sphinx.ext.autosectionlabel.
64 | ]
65 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/data/tabular/table.cc:
--------------------------------------------------------------------------------
 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #include <absl/strings/str_cat.h>
17 | #include <memory>
18 | #include <type_traits>
19 | #include <vector>
20 | 
21 | #include <tensorflow/core/framework/op.h>
22 | #include <tensorflow/core/framework/op_def_builder.h>
23 | #include <tensorflow/core/framework/op_kernel.h>
24 | #include <tensorflow/core/framework/types.h>
25 | 
26 | #include <unordered_set>
27 | 
28 | #include "hybridbackend/tensorflow/data/tabular/orc.h"
29 | #include "hybridbackend/tensorflow/data/tabular/parquet.h"
30 | 
31 | namespace tensorflow {
32 | namespace hybridbackend {
33 | 
34 | TableAccess* TableAccess::Create(
35 |     OpKernelContext* ctx, const TableFormat& format, const string& filename,
36 |     const int64 batch_size, const std::vector<string>& field_names,
37 |     const DataTypeVector& field_dtypes,
38 |     const std::vector<int32>& field_ragged_ranks,
39 |     const std::vector<PartialTensorShape>& field_shapes,
40 |     const bool drop_remainder, const bool skip_corrupted_data) {
41 |   // Returns a newly allocated accessor matching `format`, or nullptr (after
42 |   // logging an error) when the format is not one of the cases below.
43 |   switch (format) {
44 |     case kParquetFormat:
45 |       return new ParquetAccess(ctx, format, filename, batch_size, field_names,
46 |                                field_dtypes, field_ragged_ranks, field_shapes,
47 |                                drop_remainder, skip_corrupted_data);
48 |     case kOrcFormat:
49 |       return new OrcAccess(ctx, format, filename, batch_size, field_names,
50 |                            field_dtypes, field_ragged_ranks, field_shapes,
51 |                            drop_remainder, skip_corrupted_data);
52 |     default:
53 |       break;
54 |   }
55 |   LOG(ERROR) << "File format " << format << " is not supported";
56 |   return nullptr;
57 | }
58 | 
59 | }  // namespace hybridbackend
60 | }  // namespace tensorflow
61 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/framework/config.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''ConfigProto related functions.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 
23 | from tensorflow.core.protobuf import config_pb2
24 | from tensorflow.python.distribute import multi_worker_util
25 | 
26 | from hybridbackend.tensorflow.framework.context import Context
27 | 
28 | 
29 | def wraps_session_config(session_config, *args, **kwargs):
30 |   r'''Wraps ConfigProto for distributed training.
31 |   '''
32 |   if not session_config:
33 |     kwargs.setdefault('allow_soft_placement', True)
34 |     session_config = config_pb2.ConfigProto(*args, **kwargs)
35 |   session_config.gpu_options.allow_growth = True
36 |   session_config.gpu_options.force_gpu_compatible = True
37 |   if not session_config.device_filters:
38 |     cluster_spec = Context.get().cluster_spec
39 |     task_type = Context.get().task_type
40 |     task_id = Context.get().task_id
41 |     if cluster_spec is None:
42 |       session_config.isolate_session_state = True
43 |       return session_config
44 |     session_config.isolate_session_state = False
45 |     del session_config.device_filters[:]
46 |     if task_type in ('chief', 'worker'):
47 |       session_config.device_filters.extend([
48 |         '/job:ps', '/job:chief', f'/job:{task_type}/task:{task_id}'])
49 |       session_config.experimental.collective_group_leader = (
50 |         multi_worker_util.collective_leader(cluster_spec, task_type, task_id))
51 |     elif task_type == 'evaluator':
52 |       session_config.device_filters.append(f'/job:{task_type}/task:{task_id}')
53 |   return session_config
54 | 
55 | 
56 | def get_session_config(*args, **kwargs):
57 |   r'''Creates ConfigProto for distributed training from `args` and `kwargs`.
58 |   '''
59 |   return wraps_session_config(None, *args, **kwargs)
60 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/graph/common/packing.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #ifndef HYBRIDBACKEND_TENSORFLOW_GRAPH_COMMON_PACKING_H_
17 | #define HYBRIDBACKEND_TENSORFLOW_GRAPH_COMMON_PACKING_H_
18 | 
19 | #if HYBRIDBACKEND_TENSORFLOW
20 | 
21 | #include <map>
22 | #include <string>
23 | #include <vector>
24 | 
25 | #include <tensorflow/core/graph/graph.h>
26 | 
27 | namespace tensorflow {
28 | namespace hybridbackend {
29 | 
30 | class Pack {  // Builder-style helper: every With* method returns Pack&.
31 |  public:
32 |   Pack(const string& op_type, const string& optimized_op_type);
33 |   Pack& WithDevice(const string& device);
34 |   Pack& WithTypeAttr(const string& attr_name,
35 |                      const std::vector<DataType>& constraints);
36 |   Pack& WithShapeAttr(const string& attr_name);
37 |   Pack& WithIntAttr(const string& attr_name);
38 |   Pack& WithStrAttr(const string& attr_name);
39 |   Pack& WithAggregatedShapeAttr(const string& attr_name);
40 |   Pack& WithAggregatedIntAttr(const string& attr_name);
41 |   Pack& WithAggregatedStrAttr(const string& attr_name);
42 |   Pack& WithHandle(const int32 input);
43 |   Pack& WithBuckets(const int32 num_buckets);
44 |   Status In(Graph* graph);  // Operates on `graph`; definition not in this header.
45 |   Status In(Graph* graph, int64* poccurrence_count);  // NOTE(review): presumably also reports a count; confirm.
46 | 
47 |  private:
48 |   string op_type_;
49 |   string optimized_op_type_;
50 |   string device_;
51 |   std::map<string, std::vector<DataType>> type_attrs_;  // Keyed by attr name.
52 |   std::vector<string> shape_attrs_;
53 |   std::vector<string> int_attrs_;
54 |   std::vector<string> str_attrs_;
55 |   std::vector<string> aggregated_shape_attrs_;
56 |   std::vector<string> aggregated_int_attrs_;
57 |   std::vector<string> aggregated_str_attrs_;
58 |   std::vector<int32> handles_;
59 |   int num_buckets_;  // NOTE(review): WithBuckets takes int32 while this is int.
60 | 
61 |   TF_DISALLOW_COPY_AND_ASSIGN(Pack);
62 | };
63 | 
64 | }  // namespace hybridbackend
65 | }  // namespace tensorflow
66 | 
67 | #endif  // HYBRIDBACKEND_TENSORFLOW
68 | #endif  // HYBRIDBACKEND_TENSORFLOW_GRAPH_COMMON_PACKING_H_
69 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/distribute/nccl/nccl_get_id.cc:
--------------------------------------------------------------------------------
 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #if HYBRIDBACKEND_TENSORFLOW
17 | 
18 | #include <tensorflow/core/framework/common_shape_fns.h>
19 | #include <tensorflow/core/framework/op_kernel.h>
20 | #include <tensorflow/core/framework/shape_inference.h>
21 | 
22 | #include <vector>
23 | 
24 | #include "hybridbackend/tensorflow/distribute/nccl/collective.h"
25 | 
26 | namespace tensorflow {
27 | namespace hybridbackend {
28 | 
29 | #if HYBRIDBACKEND_NCCL
30 | 
31 | namespace {
32 | const int64 kNcclIdElements = NCCL_UNIQUE_ID_BYTES / sizeof(int64);
33 | }  // anonymous namespace
34 | 
35 | REGISTER_OP("HbGetNcclId")
36 |     .Output("id: int64")
37 |     .SetShapeFn([](shape_inference::InferenceContext* c) {
38 |       c->set_output(0, c->Vector(kNcclIdElements));
39 |       return Status::OK();
40 |     })
41 |     .SetIsStateful()
42 |     .Doc(R"doc(
43 | Get ID of the NCCL communciator.
44 | 
45 | id: Unique ID of the NCCL communicator.
46 | )doc");
47 | 
48 | #if GOOGLE_CUDA
49 | class GetNcclIdOp : public OpKernel {
50 |  public:
51 |   GetNcclIdOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
52 | 
53 |   void Compute(OpKernelContext* ctx) override {
54 |     static_assert(NCCL_UNIQUE_ID_BYTES % sizeof(int64) == 0, "Unexpected");
55 |     Tensor* id;
56 |     OP_REQUIRES_OK(
57 |         ctx, ctx->allocate_output(0, TensorShape({kNcclIdElements}), &id));
58 |     ncclUniqueId nccl_id;
59 |     ncclGetUniqueId(&nccl_id);
60 |     std::memcpy(reinterpret_cast<char*>(id->flat<int64>().data()),
61 |                 nccl_id.internal, NCCL_UNIQUE_ID_BYTES);
62 |   }
63 | };
64 | 
65 | REGISTER_KERNEL_BUILDER(Name("HbGetNcclId").Device(DEVICE_GPU).HostMemory("id"),
66 |                         GetNcclIdOp);
67 | REGISTER_KERNEL_BUILDER(Name("HbGetNcclId").Device(DEVICE_CPU), GetNcclIdOp);
68 | #endif
69 | 
70 | #endif
71 | 
72 | }  // namespace hybridbackend
73 | }  // namespace tensorflow
74 | 
75 | #endif  // HYBRIDBACKEND_TENSORFLOW
76 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/distribute/tests/broadcast_test.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''Tests for broadcast collective communication.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 
23 | import os
24 | import unittest
25 | 
26 | import numpy as np
27 | 
28 | import hybridbackend.common.test as hbtest
29 | 
30 | # pylint: disable=missing-docstring,import-outside-toplevel
31 | 
32 | 
def _test_broadcast(rank, a, b):
  r'''Broadcast from rank 0 and return the value this rank receives.

  Rank 0 contributes `a`; every other rank contributes `b`.
  '''
  import tensorflow as tf

  import hybridbackend.tensorflow as hb

  graph = tf.Graph()
  with graph.as_default(), hb.scope():
    if rank == 0:
      local_value = tf.constant(a)
    else:
      local_value = tf.constant(b)
    received = hb.distribute.broadcast(local_value, root_rank=0)
    with tf.train.MonitoredTrainingSession('') as session:
      return session.run(received)
46 | 
47 | 
@unittest.skipUnless(
  os.getenv('HYBRIDBACKEND_WITH_CUDA') == 'ON', 'GPU required')
@unittest.skipUnless(
  os.getenv('HYBRIDBACKEND_WITH_NCCL') == 'ON', 'NCCL required')
class BroadcastTest(unittest.TestCase):
  r'''Checks that a value broadcast from rank 0 reaches both ranks.
  '''
  # Environment variables overridden while a test of this class runs.
  _TEST_ENVIRON = {
    'CUDA_VISIBLE_DEVICES': '0,1',
    'NCCL_DEBUG': 'INFO',
    'NCCL_DEBUG_SUBSYS': 'ALL',
    'TF_CPP_VMODULE': 'nccl_comm=1,nccl_create=1,nccl_broadcast=1'}

  def setUp(self):  # pylint: disable=invalid-name
    # Remember what was there before so tearDown can put it back.
    self._saved_environ = {
      name: os.environ.get(name) for name in self._TEST_ENVIRON}
    os.environ.update(self._TEST_ENVIRON)

  def tearDown(self):  # pylint: disable=invalid-name
    # Restore every variable touched by setUp, including the NCCL ones, so
    # that nothing leaks into tests running later in the same process.
    for name, value in self._saved_environ.items():
      if value is None:
        os.environ.pop(name, None)
      else:
        os.environ[name] = value

  def test_broadcast(self):
    a = 13
    b = 22
    results = hbtest.Spawn(2)(lambda rank: _test_broadcast(rank, a, b))
    # Both ranks must end up with the value contributed by rank 0.
    np.testing.assert_allclose(results[0], results[1], rtol=1e-6)
    np.testing.assert_allclose(results[0], a, rtol=1e-6)
72 | 
73 | 
74 | if __name__ == '__main__':
75 |   hbtest.main(f'{__file__}.xml')
76 | 


--------------------------------------------------------------------------------
/hybridbackend/common/atomic.cu.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #ifndef HYBRIDBACKEND_COMMON_ATOMIC_CU_H_
17 | #define HYBRIDBACKEND_COMMON_ATOMIC_CU_H_
18 | 
19 | __forceinline__ __device__ long atomicAdd(long* address, long val) {
20 |   return (long)atomicAdd((unsigned long long*)address, (unsigned long long)val);
21 | }
22 | 
23 | __forceinline__ __device__ long long atomicAdd(long long* address,
24 |                                                long long val) {
25 |   return (long long)atomicAdd((unsigned long long*)address,
26 |                               (unsigned long long)val);
27 | }
28 | 
29 | __forceinline__ __device__ unsigned long atomicAdd(unsigned long* address,
30 |                                                    unsigned long val) {
31 |   return (unsigned long)atomicAdd((unsigned long long*)address,
32 |                                   (unsigned long long)val);
33 | }
34 | 
35 | __forceinline__ __device__ long atomicCAS(long* address, long compare,
36 |                                           long val) {
37 |   return (long)atomicCAS((unsigned long long*)address,
38 |                          (unsigned long long)compare, (unsigned long long)val);
39 | }
40 | 
41 | __forceinline__ __device__ long long atomicCAS(long long* address,
42 |                                                long long compare,
43 |                                                long long val) {
44 |   return (long long)atomicCAS((unsigned long long*)address,
45 |                               (unsigned long long)compare,
46 |                               (unsigned long long)val);
47 | }
48 | 
49 | __forceinline__ __device__ unsigned long atomicCAS(unsigned long* address,
50 |                                                    unsigned long compare,
51 |                                                    unsigned long val) {
52 |   return (unsigned long)atomicCAS((unsigned long long*)address,
53 |                                   (unsigned long long)compare,
54 |                                   (unsigned long long)val);
55 | }
56 | 
57 | #endif  // HYBRIDBACKEND_COMMON_ATOMIC_CU_H_
58 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/data/deduplicate/dataset.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''Dataset that compresses DataFrame values.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 
23 | from tensorflow.python.util import nest
24 | 
25 | from hybridbackend.tensorflow.data.dataframe import input_fields
26 | 
27 | 
def deduplicate(
    key_idx_field_names,
    value_field_names,
    fields=None):
  r'''Deduplicate fields specified in `value_field_names`
    by using specified fields in `key_idx_field_names`.

  Args:
    key_idx_field_names: A list of string as names of fields utilized to
      recover the key fields.
    value_field_names: A List of list of string as fields to be
      deduplicated by key fields. The i-th list belongs to the i-th name in
      `key_idx_field_names`.
    fields: (Optional) fields of dataset.

  Returns:
    A function that takes a dataset, registers on every value field the key
    idx field it is restored from, and returns the same dataset.

  Raises:
    ValueError: (when the returned function is applied) if a key idx field or
      a value field is not among the dataset fields, or if the two arguments
      differ in length.
  '''
  def _apply_fn(dataset):
    all_fields = input_fields(dataset, fields=fields)
    all_field_names = set(
      nest.flatten({f.name: f.name for f in all_fields}))
    map_name_to_fields = {f.name: f for f in all_fields}

    for key_idx_field_name in key_idx_field_names:
      if key_idx_field_name not in all_field_names:
        raise ValueError(
          f'Key idx Field {key_idx_field_name} must be within the Fields')

    if len(value_field_names) != len(key_idx_field_names):
      raise ValueError(
        'Value field names must have the same length as key idx field names')

    # Validate everything before touching any field so that a bad name does
    # not leave the fields half-updated.
    for v_list in value_field_names:
      for v in v_list:
        if v not in all_field_names:
          raise ValueError(
            f'Value Field {v} must be within the Fields')

    # Pair keys and value lists positionally; going through a dict keyed by
    # name would silently drop value lists when a key name is repeated.
    for k, v_list in zip(key_idx_field_names, value_field_names):
      for v in v_list:
        map_name_to_fields[v].set_restore_idx_field(map_name_to_fields[k])
    return dataset
  return _apply_fn
68 | 


--------------------------------------------------------------------------------
/hybridbackend/common/murmur3.cu.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | 
15 | MurmurHash3 was written by Austin Appleby, and is placed in the public
16 | domain. The author hereby disclaims copyright to this source code.
17 | Note - The x86 and x64 versions do _not_ produce the same results, as the
18 | algorithms are optimized for their respective platforms. You can still
19 | compile and run any of them on any platform, but your performance with the
20 | non-native version will be less than optimal.
21 | 
22 | See https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp.
23 | ==============================================================================*/
24 | 
25 | #ifndef HYBRIDBACKEND_COMMON_MURMUR3_CU_H_
26 | #define HYBRIDBACKEND_COMMON_MURMUR3_CU_H_
27 | 
28 | inline __host__ __device__ uint32_t _rotl32(uint32_t x, int8_t r) {
29 |   return (x << r) | (x >> (32 - r));
30 | }
31 | 
32 | template <typename T, uint32_t seed = 0>
33 | inline __host__ __device__ uint32_t murmur3_hash32(const T& input) {
34 |   constexpr int len = sizeof(T);
35 |   const uint8_t* const data = (const uint8_t*)&input;
36 |   constexpr int nblocks = len / 4;
37 |   uint32_t h1 = seed;
38 |   constexpr uint32_t c1 = 0xcc9e2d51;
39 |   constexpr uint32_t c2 = 0x1b873593;
40 | 
41 |   // body
42 |   const uint32_t* const blocks = (const uint32_t*)(data + nblocks * 4);
43 |   for (int i = -nblocks; i; i++) {
44 |     uint32_t k1 = blocks[i];
45 |     k1 *= c1;
46 |     k1 = _rotl32(k1, 15);
47 |     k1 *= c2;
48 |     h1 ^= k1;
49 |     h1 = _rotl32(h1, 13);
50 |     h1 = h1 * 5 + 0xe6546b64;
51 |   }
52 | 
53 |   // tail
54 |   const uint8_t* tail = (const uint8_t*)(data + nblocks * 4);
55 |   uint32_t k1 = 0;
56 |   switch (len & 3) {
57 |     case 3:
58 |       k1 ^= tail[2] << 16;
59 |     case 2:
60 |       k1 ^= tail[1] << 8;
61 |     case 1:
62 |       k1 ^= tail[0];
63 |       k1 *= c1;
64 |       k1 = _rotl32(k1, 15);
65 |       k1 *= c2;
66 |       h1 ^= k1;
67 |   }
68 | 
69 |   // finalization
70 |   h1 ^= len;
71 |   h1 ^= h1 >> 16;
72 |   h1 *= 0x85ebca6b;
73 |   h1 ^= h1 >> 13;
74 |   h1 *= 0xc2b2ae35;
75 |   h1 ^= h1 >> 16;
76 |   return h1;
77 | }
78 | 
79 | #endif  // HYBRIDBACKEND_COMMON_MURMUR3_CU_H_


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/data/tests/rebatch_dataset_seq_test.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''Parquet batch dataset rebatching test.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 
23 | import os
24 | import random
25 | from six.moves import xrange  # pylint: disable=redefined-builtin
26 | import tempfile
27 | import unittest
28 | 
29 | import numpy as np
30 | import pyarrow as pa
31 | import pyarrow.parquet as pq
32 | import tensorflow as tf
33 | 
34 | import hybridbackend.common.test as hbtest
35 | import hybridbackend.tensorflow as hb
36 | 
37 | 
38 | # pylint: disable=missing-docstring
39 | class ParquetDatasetSequenceRebatchTest(unittest.TestCase):
40 |   def setUp(self):  # pylint: disable=invalid-name
41 |     os.environ['CUDA_VISIBLE_DEVICES'] = ''
42 |     self._workspace = tempfile.mkdtemp()
43 |     self._filename = os.path.join(self._workspace, 'seqtest.parquet')
44 |     self._nrows = 1000
45 |     self._ncols = 10
46 |     self._data = {
47 |       'clicks': [
48 |         [random.randint(0, 100) for col in range(self._ncols)]
49 |         for row in range(self._nrows)]}
50 |     pq.write_table(pa.Table.from_pydict(self._data), self._filename)
51 | 
52 |   def tearDown(self):  # pylint: disable=invalid-name
53 |     os.remove(self._filename)
54 |     del os.environ['CUDA_VISIBLE_DEVICES']
55 | 
56 |   def test_ragged(self):
57 |     batch_size = 8
58 |     with tf.Graph().as_default() as graph:
59 |       ds = hb.data.ParquetDataset(self._filename, batch_size=batch_size)
60 |       ds = ds.apply(hb.data.rebatch(batch_size))
61 |       batch = tf.data.make_one_shot_iterator(ds).get_next()
62 | 
63 |     clicks = self._data['clicks']
64 |     with tf.Session(graph=graph) as sess:
65 |       for i in xrange(3):
66 |         actual = sess.run(batch['clicks'])
67 |         start_row = i * batch_size
68 |         end_row = (i + 1) * batch_size
69 |         expected = clicks[start_row:end_row]
70 |         expected_values = [v for sublist in expected for v in sublist]
71 |         np.testing.assert_equal(actual.values, expected_values)
72 | 
73 | 
74 | if __name__ == '__main__':
75 |   hbtest.main(f'{__file__}.xml')
76 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/graph/common/linearization.cc:
--------------------------------------------------------------------------------
 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #if HYBRIDBACKEND_TENSORFLOW
17 | 
18 | #include <algorithm>
19 | #include <deque>
20 | #include <set>
21 | #include <unordered_map>
22 | #include <vector>
23 | 
24 | #include <absl/strings/str_cat.h>
25 | #include <absl/strings/str_join.h>
26 | #include <absl/strings/str_split.h>
27 | 
28 | #include <tensorflow/core/graph/node_builder.h>
29 | #include <tensorflow/core/public/version.h>
30 | #include <tensorflow/core/util/device_name_utils.h>
31 | 
32 | #include "hybridbackend/common/env.h"
33 | #include "hybridbackend/tensorflow/graph/common/helper.h"
34 | #include "hybridbackend/tensorflow/graph/common/linearization.h"
35 | 
36 | namespace tensorflow {
37 | namespace hybridbackend {
// Stores the op type to match and the index of the output whose consumers
// `In` will chain together.
LinearizeOutputs::LinearizeOutputs(const string& op_type,
                                   const int32& op_output)
    : op_type_(op_type), op_output_(op_output) {}
41 | 
42 | Status LinearizeOutputs::In(Graph* graph) {
43 |   std::unordered_map<Node*, int> candidates;
44 |   std::vector<bool> dependencies;
45 | 
46 |   std::vector<Node*> sorted;
47 |   GetReversePostOrder(*graph, &sorted, NodeComparatorName{});
48 | 
49 |   for (Node* node : graph->op_nodes()) {
50 |     if (!node->IsOp()) {
51 |       continue;
52 |     }
53 | 
54 |     if (node->type_string() != op_type_) {
55 |       continue;
56 |     }
57 | 
58 |     std::vector<Node*> linear_ops;
59 |     for (Node* n : sorted) {
60 |       for (const auto& edge : n->in_edges()) {
61 |         if (edge && !edge->IsControlEdge() && edge->src() == node &&
62 |             edge->src_output() == op_output_) {
63 |           linear_ops.push_back(edge->dst());
64 |           break;
65 |         }
66 |       }
67 |     }
68 | 
69 |     if (linear_ops.size() < 2) {
70 |       continue;
71 |     }
72 | 
73 |     for (size_t idx = 1; idx < linear_ops.size(); ++idx) {
74 |       graph->AddControlEdge(linear_ops[idx - 1], linear_ops[idx]);
75 |     }
76 | 
77 |     VLOG(1) << "Linearized " << linear_ops.size() << " outputs of "
78 |             << node->name() << " in graph " << static_cast<void*>(graph);
79 |   }
80 | 
81 |   return Status::OK();
82 | }
83 | 
84 | }  // namespace hybridbackend
85 | }  // namespace tensorflow
86 | 
87 | #endif  // HYBRIDBACKEND_TENSORFLOW
88 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/common/stream.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #ifndef HYBRIDBACKEND_TENSORFLOW_COMMON_STREAM_H_
17 | #define HYBRIDBACKEND_TENSORFLOW_COMMON_STREAM_H_
18 | 
19 | #if HYBRIDBACKEND_TENSORFLOW
20 | #if GOOGLE_CUDA
21 | 
22 | #include <absl/strings/str_cat.h>
23 | 
24 | #include <tensorflow/core/framework/op_kernel.h>
25 | #include <tensorflow/core/framework/register_types.h>
26 | #include <tensorflow/core/framework/resource_mgr.h>
27 | #include <tensorflow/core/framework/tensor.h>
28 | #include <tensorflow/core/lib/core/errors.h>
29 | #include <tensorflow/core/lib/core/status.h>
30 | #include <tensorflow/core/public/version.h>
31 | 
32 | #include "hybridbackend/tensorflow/common/host_functions.h"
33 | 
34 | namespace tensorflow {
35 | namespace hybridbackend {
36 | 
// Wrapper around a StreamExecutor stream and its `cudaStream_t`, together
// with an optional thread pool and a mutex.
// NOTE(review): only declarations are visible in this header; the notes on
// individual methods are inferred from names and signatures -- confirm
// against the implementation file.
class Stream {
 public:
  // Starts with no underlying stream; see Initialize().
  Stream() : se_stream_(nullptr), stream_(nullptr) {}
  virtual ~Stream() {}
  // Returns the stored `cudaStream_t` pointer (nullptr after construction).
  cudaStream_t* get() const { return stream_; }

  // Presumably creates an event on the tensor (compute) stream of `ctx` --
  // TODO confirm.
  static se::Event* TensorStreamCreateEvent(OpKernelContext* ctx);

  // Set up the stream from `ctx`; the second overload additionally takes a
  // name and a thread count, presumably for `threads_` -- TODO confirm.
  void Initialize(OpKernelContext* ctx);
  void Initialize(OpKernelContext* ctx, const string& name,
                  const int64 num_threads);
  // Run `fn` in relation to this stream; the exact ordering guarantees of
  // each variant are defined by the implementation.
  void Launch(OpKernelContext* ctx, std::function<void()> fn);
  void LaunchUntilComputeDone(OpKernelContext* ctx, std::function<void()> fn);

  // Synchronization helpers between this stream, the compute stream of `ctx`
  // and the host (as suggested by the names).
  void BlockComputeUntilDone(OpKernelContext* ctx);
  void BlockComputeUntilDone(OpKernelContext* ctx, std::function<void()> fn);
  void BlockHostUntilDone();

  // Chainable operations; each returns a `Stream&`.
  Stream& ThenWaitUntilComputeDone(OpKernelContext* ctx);
  Stream& ThenExecute(OpKernelContext* ctx, std::function<void()> fn);
  // Copies of `size` bytes: device-to-host, host-to-device and
  // device-to-device respectively, judging by the argument types.
  Stream& ThenMemcpy(void* dst, const se::DeviceMemoryBase& src, uint64 size);
  Stream& ThenMemcpy(se::DeviceMemoryBase* dst, const void* src, uint64 size);
  Stream& ThenMemcpy(se::DeviceMemoryBase* dst, const se::DeviceMemoryBase& src,
                     uint64 size);

 private:
  std::unique_ptr<thread::ThreadPool> threads_;
  se::Stream* se_stream_;
  cudaStream_t* stream_;
  std::mutex mu_;
};
68 | }  // namespace hybridbackend
69 | }  // namespace tensorflow
70 | 
71 | #endif  // GOOGLE_CUDA
72 | #endif  // HYBRIDBACKEND_TENSORFLOW
73 | 
74 | #endif  // HYBRIDBACKEND_TENSORFLOW_COMMON_STREAM_H_
75 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/common/fusion_helper.cu.cc:
--------------------------------------------------------------------------------
 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #if HYBRIDBACKEND_TENSORFLOW
17 | 
18 | #if GOOGLE_CUDA
19 | #define EIGEN_USE_GPU
20 | 
21 | #include <thrust/device_ptr.h>
22 | #include <thrust/device_vector.h>
23 | 
24 | #include <tensorflow/core/framework/register_types.h>
25 | #include <tensorflow/core/framework/tensor.h>
26 | #include <tensorflow/core/public/version.h>
27 | 
28 | #include "hybridbackend/tensorflow/common/device_functions.h"
29 | #include "hybridbackend/tensorflow/common/fusion_helper.cu.h"
30 | 
31 | namespace tensorflow {
32 | namespace hybridbackend {
33 | namespace functor {
34 | 
35 | template <typename T>
36 | void CopyPtrsNFunctor<T>::operator()(OpKernelContext* ctx, int8* head_host,
37 |                                      int8* head_device,
38 |                                      std::vector<const Tensor*>* inputs,
39 |                                      int num_columns) {
40 |   T** head_host_ptr = reinterpret_cast<T**>(head_host);
41 |   for (int i = 0; i < num_columns; ++i) {
42 |     head_host_ptr[i] =
43 |         const_cast<T*>((*inputs)[i]->flat_outer_dims<T>().data());
44 |   }
45 |   auto* stream = ctx->op_device_context()->stream();
46 |   se::DeviceMemoryBase dst_ptr(head_device, num_columns * sizeof(T*));
47 |   stream->ThenMemcpy(&dst_ptr, head_host, num_columns * sizeof(T*));
48 |   stream->BlockHostUntilDone();
49 | }
50 | 
51 | template <typename T>
52 | void CopySizesNFunctor<T>::operator()(OpKernelContext* ctx, T* input_host,
53 |                                       T* input_device, int num_columns) {
54 |   auto* stream = ctx->op_device_context()->stream();
55 |   se::DeviceMemoryBase dst_ptr(input_device, num_columns * sizeof(T));
56 |   stream->ThenMemcpy(&dst_ptr, input_host, num_columns * sizeof(T));
57 |   stream->BlockHostUntilDone();
58 | }
59 | 
// Explicit instantiation of each functor for a single element type T.
#define DEFINE_COPY_PTRS(T) template struct CopyPtrsNFunctor<T>;
#define DEFINE_COPY_SIZES(T) template struct CopySizesNFunctor<T>;

// Applies `m` to uint32, uint64 and every type covered by
// TF_CALL_REAL_NUMBER_TYPES.
#define TF_CALL_HELPER_TYPES(m) \
  TF_CALL_uint32(m) TF_CALL_uint64(m) TF_CALL_REAL_NUMBER_TYPES(m)

// Instantiate both functors for all helper types.
TF_CALL_HELPER_TYPES(DEFINE_COPY_PTRS);
TF_CALL_HELPER_TYPES(DEFINE_COPY_SIZES);
68 | 
69 | }  // namespace functor
70 | }  // namespace hybridbackend
71 | }  // namespace tensorflow
72 | 
73 | #endif  // GOOGLE_CUDA
74 | #endif  // HYBRIDBACKEND_TENSORFLOW
75 | 


--------------------------------------------------------------------------------
/hybridbackend/common/arrow.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #ifndef HYBRIDBACKEND_COMMON_ARROW_H_
17 | #define HYBRIDBACKEND_COMMON_ARROW_H_
18 | 
19 | #include <deque>
20 | #include <string>
21 | 
22 | #if HYBRIDBACKEND_ARROW
23 | #include <arrow/adapters/orc/adapter.h>
24 | #include <arrow/dataset/api.h>
25 | #include <arrow/record_batch.h>
26 | #include <parquet/arrow/reader.h>
27 | #include <parquet/properties.h>
28 | 
29 | #if HYBRIDBACKEND_ARROW_HDFS
30 | #include <arrow/filesystem/hdfs.h>
31 | #endif
32 | #if HYBRIDBACKEND_ARROW_S3
33 | #include <arrow/filesystem/s3fs.h>
34 | #endif
35 | #include <arrow/filesystem/localfs.h>
36 | 
// Helpers for opening Arrow files and inspecting Parquet / ORC metadata.
// NOTE(review): only declarations are visible in this header; the notes below
// are inferred from names and signatures -- confirm against the
// implementation file.
namespace hybridbackend {
// Opens `filename`; the filesystem and the random-access file are handed
// back through the `fs` and `file` output pointers.
::arrow::Status OpenArrowFile(
    std::shared_ptr<::arrow::fs::FileSystem>* fs,
    std::shared_ptr<::arrow::io::RandomAccessFile>* file,
    const std::string& filename);

// Counterpart of OpenArrowFile for the same `fs` / `file` / `filename`.
void CloseArrowFile(std::shared_ptr<::arrow::fs::FileSystem>& fs,
                    std::shared_ptr<::arrow::io::RandomAccessFile>& file,
                    const std::string& filename);

// Creates a Parquet reader over an already opened `file` and stores it in
// `reader`. The meaning of `initialized_from_env` is not visible here;
// presumably it selects environment-driven reader settings -- TODO confirm.
::arrow::Status OpenParquetReader(
    std::unique_ptr<::parquet::arrow::FileReader>* reader,
    const std::shared_ptr<::arrow::io::RandomAccessFile>& file,
    const bool initialized_from_env);

// Fills the three output vectors with per-field names, dtypes (as strings)
// and ragged ranks for the Parquet file `filename`.
::arrow::Status GetParquetDataFrameFields(
    std::vector<std::string>* field_names,
    std::vector<std::string>* field_dtypes,
    std::vector<int>* field_ragged_ranks, const std::string& filename);

// Stores the number of row groups of the Parquet file `filename` in
// `row_group_count`.
::arrow::Status GetParquetRowGroupCount(int* row_group_count,
                                        const std::string& filename);

// ORC counterpart of OpenParquetReader.
::arrow::Status OpenOrcReader(
    std::unique_ptr<::arrow::adapters::orc::ORCFileReader>* reader,
    const std::shared_ptr<::arrow::io::RandomAccessFile>& file,
    const bool initialized_from_env);

// ORC counterpart of GetParquetDataFrameFields.
::arrow::Status GetOrcDataFrameFields(std::vector<std::string>* field_names,
                                      std::vector<std::string>* field_dtypes,
                                      std::vector<int>* field_ragged_ranks,
                                      const std::string& filename);

// Stores the number of rows of the ORC file `filename` in `row_count`.
::arrow::Status GetOrcRowCount(int* row_count, const std::string& filename);

}  // namespace hybridbackend
73 | 
74 | #endif  // HYBRIDBACKEND_ARROW
75 | #endif  // HYBRIDBACKEND_COMMON_ARROW_H_
76 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/metrics/gauc.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''A data-parallel gAUC metric.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 
23 | from tensorflow.python.framework import dtypes
24 | from tensorflow.python.ops import array_ops
25 | from tensorflow.python.ops import math_ops
26 | from tensorflow.python.ops import variable_scope as vs
27 | 
28 | from hybridbackend.tensorflow.common import oplib as _ops
29 | from hybridbackend.tensorflow.metrics.mean import mean
30 | 
31 | 
32 | def gauc(labels,
33 |          predictions,
34 |          indicators=None,
35 |          metrics_collections=None,
36 |          updates_collections=None,
37 |          name=None):
38 |   r'''Computes the approximate gAUC (group AUC).
39 | 
40 |   Per-group AUCs and counts come from the `hb_gauc_calc` op and are combined
41 |   by `hybridbackend.tensorflow.metrics.mean.mean`.
42 | 
43 |   Args:
44 |     labels: A `Tensor` whose shape matches `predictions`.
45 |     predictions: A floating point `Tensor` of arbitrary shape and whose values
46 |       are in the range `[0, 1]`.
47 |     indicators: A `Tensor` whose shape matches `predictions`; presumably the
48 |       group id of each sample (TODO confirm against the op). If `None`, it
49 |       defaults to `range(N)` where `N` is the number of elements in `labels`.
50 |     metrics_collections: An optional list of collections that the metric
51 |       value should be added to. Forwarded to `mean`.
52 |     updates_collections: An optional list of collections that `update_op`
53 |       should be added to. Forwarded to `mean`.
54 |     name: An optional variable_scope name. Also forwarded to `mean`.
55 | 
56 |   Returns:
57 |     Whatever `mean(aucs, counts, ...)` returns; presumably a
58 |       `(gauc, update_op)` tuple holding the current gAUC and the op that
59 |       updates it (TODO confirm against `mean`).
60 | 
61 |   Raises:
62 |     Nothing directly; errors raised by `hb_gauc_calc` or `mean` (e.g. for
63 |       mismatched shapes or bad collection arguments) propagate unchanged.
64 |   '''
65 |   if indicators is None:
66 |     indicators = math_ops.range(
67 |       0, array_ops.shape(array_ops.reshape(labels, [-1]))[0],
68 |       dtype=dtypes.int32)
69 |   with vs.variable_scope(name, 'gauc', (labels, predictions, indicators)):
70 |     aucs, counts = _ops.hb_gauc_calc(labels, predictions, indicators)
71 |     return mean(aucs, counts, metrics_collections, updates_collections, name)
72 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/training/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''Support for training models in hybridbackend.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 
23 | from tensorflow.python.training import training as _training
24 | 
25 | from hybridbackend.tensorflow.framework.context import Context as _ctx
26 | from hybridbackend.tensorflow.framework.ops import ModeKeys as _mode_keys
27 | from hybridbackend.tensorflow.framework.rewriting import GraphRewriting
28 | from hybridbackend.tensorflow.framework.rewriting import SessionRunRewriting
29 | from hybridbackend.tensorflow.training.evaluation import EvaluationHook
30 | from hybridbackend.tensorflow.training.evaluation import EvaluationSpec
31 | from hybridbackend.tensorflow.training.hooks import Policy
32 | from hybridbackend.tensorflow.training.hooks import StepStatHook
33 | from hybridbackend.tensorflow.training.optimizer import SyncReplicasOptimizer
34 | from hybridbackend.tensorflow.training.optimizer import \
35 |   wraps_optimizer as _wraps
36 | from hybridbackend.tensorflow.training.saved_model import export
37 | from hybridbackend.tensorflow.training.saved_model import export_all
38 | from hybridbackend.tensorflow.training.saver import replace_default_saver
39 | from hybridbackend.tensorflow.training.saver import Saver
40 | from hybridbackend.tensorflow.training.server import monitored_session
41 | from hybridbackend.tensorflow.training.server import Server
42 | from hybridbackend.tensorflow.training.server import target
43 | from hybridbackend.tensorflow.training.server import wraps_server
44 | from hybridbackend.tensorflow.training.session import \
45 |   wraps_monitored_training_session
46 | 
47 | _ = (  # Registers training options: presumably (name, default[, env=VAR]).
48 |   _ctx.get().options
49 |   .register('grad_lazy_sync', False, env='HB_GRAD_LAZY_SYNC')
50 |   .register('sharding', False)
51 |   .register(
52 |     'use_hierarchical_embedding_lookup', True,
53 |     env='HB_USE_HIERARCHICAL_EMBEDDING_LOOKUP')
54 |   .register('batch_size', -1)
55 |   .register('model_dir', None)
56 |   .register('keep_checkpoint_max', None)
57 |   .register('keep_checkpoint_every_n_hours', None)
58 |   .register('mode', _mode_keys.TRAIN))
59 | 
60 | 
61 | for c in _training.__dict__.values():
62 |   if (isinstance(c, type)
63 |       and issubclass(c, _training.Optimizer)
64 |       and c not in (_training.Optimizer, _training.SyncReplicasOptimizer)):
65 |     globals()[c.__name__] = _wraps(c)
66 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/embedding/tests/deeprecev_test.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''Tests for embedding columns upon DeepRec EV.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 
23 | import os
24 | import unittest
25 | 
26 | import hybridbackend.common.test as hbtest
27 | 
28 | # pylint: disable=missing-docstring
29 | # pylint: disable=import-outside-toplevel
30 | 
31 | 
32 | def _test_get_embedding_variable(_):
33 |   import tensorflow as tf
34 | 
35 |   import hybridbackend.tensorflow as hb
36 | 
37 |   with tf.Graph().as_default():
38 |     with hb.scope():
39 |       with hb.embedding_scope():
40 |         with tf.device('/cpu:0'):
41 |           var = tf.get_embedding_variable(
42 |             'var_1',
43 |             embedding_dim=3,
44 |             initializer=tf.ones_initializer(tf.float32),
45 |             partitioner=tf.fixed_size_partitioner(num_shards=4))
46 |         emb = tf.nn.embedding_lookup(
47 |           var, tf.cast([0, 1, 2, 5, 6, -7], tf.int64))
48 |       fun = tf.multiply(emb, 2.0, name='multiply')
49 |       loss = tf.reduce_sum(fun, name='reduce_sum')
50 |       opt = tf.train.FtrlOptimizer(
51 |         0.1,
52 |         l1_regularization_strength=2.0,
53 |         l2_regularization_strength=0.00001)
54 |       g_v = opt.compute_gradients(loss)
55 |       train_op = opt.apply_gradients(g_v)
56 |       with tf.train.MonitoredTrainingSession('') as sess:
57 |         emb_result, loss_result, _ = sess.run([emb, loss, train_op])
58 |         return (emb_result, loss_result)
59 | 
60 | 
61 | @unittest.skipUnless(
62 |   (os.getenv('HYBRIDBACKEND_WITH_CUDA') == 'ON'
63 |    and os.getenv('HYBRIDBACKEND_WITH_TENSORFLOW_DISTRO') == '99881015'),
64 |   'DeepRec on GPU required')
65 | @unittest.skipUnless(
66 |   os.getenv('HYBRIDBACKEND_WITH_NCCL') == 'ON', 'NCCL required')
67 | class DeepRecEVTest(unittest.TestCase):
68 |   '''Runs the embedding variable case via `hbtest.Spawn()` and `Spawn(2)`.
69 |   '''
70 |   def setUp(self):  # pylint: disable=invalid-name
71 |     os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'  # expose GPUs 0 and 1 only
72 | 
73 |   def test_get_embedding_variable(self):
74 |     results = hbtest.Spawn()(_test_get_embedding_variable)
75 |     print(results)  # no assertions: the results are only printed
76 | 
77 |   def test_get_embedding_variable_2g(self):
78 |     results = hbtest.Spawn(2)(_test_get_embedding_variable)
79 |     print(results)  # no assertions: the results are only printed
80 | 
81 | 
82 | # pylint: enable=missing-docstring
83 | if __name__ == '__main__':
84 |   hbtest.main(f'{__file__}.xml')  # passes '<this file>.xml' to the runner
85 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/benchmarks/data_benchmark_csv.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''Data reading for CSV files benchmark.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 
23 | import argparse
24 | import os
25 | from six.moves import xrange  # pylint: disable=redefined-builtin
26 | import tempfile
27 | import time
28 | 
29 | import numpy as np
30 | import pandas as pd
31 | import tensorflow as tf
32 | 
33 | 
34 | # pylint: disable=missing-docstring
35 | def benchmark(params):
36 |   if not params.filenames:
37 |     tf.logging.info('Started generating mock file ...')
38 |     workspace = tempfile.mkdtemp()
39 |     params.filenames = [os.path.join(workspace, 'benchmark.csv')]
40 |     df = pd.DataFrame(
41 |       np.random.randint(
42 |         0, 100,
43 |         size=(params.batch_size * 100, len(params.fields)),
44 |         dtype=np.int64),
45 |       columns=params.fields)
46 |     df.to_csv(params.filenames[0], header=False, index=False)
47 |     tf.logging.info(f'Mock file {params.filenames[0]} generated.')
48 |   with tf.Graph().as_default():
49 |     step = tf.train.get_or_create_global_step()
50 |     ds = tf.data.TextLineDataset(params.filenames)
51 |     ds = ds.batch(params.batch_size, drop_remainder=True)
52 |     ds = ds.map(
53 |       lambda line: tf.io.decode_csv(
54 |         line, [[1 << 32] for f in params.fields]))
55 |     batch = tf.data.make_one_shot_iterator(ds).get_next()
56 |     train_op = tf.group(batch + [step.assign_add(1)])
57 |     with tf.train.MonitoredTrainingSession('') as sess:
58 |       count = 0
59 |       prev_ts = time.time()
60 |       try:
61 |         while not sess.should_stop():
62 |           sess.run(train_op)
63 |           count += 1
64 |       except tf.errors.OutOfRangeError:
65 |         pass
66 |       duration = time.time() - prev_ts
67 |       if count <= 0:
68 |         print('Reading CSV files stopped unexpectedly')
69 |         return
70 |       print(
71 |         'Reading CSV files elapsed in '
72 |         f'{params.batch_size * count / duration:.2f} samples/sec ('
73 |         f'{1000. * duration / count:.2f} msec/step)')
74 | 
75 | 
76 | if __name__ == '__main__':
77 |   os.environ['CUDA_VISIBLE_DEVICES'] = ''  # no CUDA device is made visible
78 |   tf.logging.set_verbosity(tf.logging.INFO)
79 |   parser = argparse.ArgumentParser()
80 |   parser.add_argument('--batch-size', type=int, default=64000)
81 |   parser.add_argument(
82 |     '--fields', nargs='+', default=[f'f{c}' for c in xrange(200)])
83 |   parser.add_argument('filenames', nargs='*')  # empty: a mock file is made
84 |   benchmark(parser.parse_args())
85 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/ops/transfer/functors.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #ifndef HYBRIDBACKEND_TENSORFLOW_OPS_TRANSFER_FUNCTORS_H_
17 | #define HYBRIDBACKEND_TENSORFLOW_OPS_TRANSFER_FUNCTORS_H_
18 | 
19 | #if HYBRIDBACKEND_TENSORFLOW
20 | 
21 | #include <tensorflow/core/framework/op_kernel.h>
22 | #include <tensorflow/core/framework/tensor.h>
23 | #include <tensorflow/core/framework/tensor_reference.h>
24 | #include <tensorflow/core/public/version.h>
25 | 
26 | namespace tensorflow {
27 | 
28 | class OpKernelContext;
29 | 
30 | namespace hybridbackend {
31 | namespace functor {
32 | 
33 | #define TF_CALL_TRANSFER_TYPES(m)                                         \
34 |   TF_CALL_int8(m) TF_CALL_uint8(m) TF_CALL_int32(m) TF_CALL_uint32(m)     \
35 |       TF_CALL_int64(m) TF_CALL_uint64(m) TF_CALL_half(m) TF_CALL_float(m) \
36 |           TF_CALL_double(m)
37 | #define TF_OP_TRANSFER_DTYPE_LIST \
38 |   "int8, uint8, int32, uint32, int64, uint64, half, float, double"
39 | 
40 | #if GOOGLE_CUDA
41 | // Declaration only. Presumably copies N host inputs to device (TODO confirm).
42 | template <typename T>
43 | struct TransferH2DNFunctor {
44 |  public:
45 |   TransferH2DNFunctor(const OpInputList& inputs, OpOutputList& outputs,
46 |                       OpKernelContext* ctx);
47 |   virtual ~TransferH2DNFunctor();
48 |   // Read-only accessors for the counters declared in the private section.
49 |   int64 num_pinned_inputs() const { return num_pinned_inputs_; }
50 |   int64 num_unpinned_inputs() const { return num_unpinned_inputs_; }
51 | 
52 |   int64 pinned_input_bytes() const { return pinned_input_bytes_; }
53 |   int64 unpinned_input_bytes() const { return unpinned_input_bytes_; }
54 |   // Takes a CUDA stream pointer; defined out of line (not visible here).
55 |   Status Copy(cudaStream_t* stream);
56 | 
57 |  private:
58 |   int64 num_unpinned_inputs_;
59 |   int64 unpinned_input_bytes_;
60 |   std::vector<char*> unpinned_outputs_;
61 |   std::vector<const void*> unpinned_inputs_;
62 |   std::vector<size_t> unpinned_bytes_;
63 |   Tensor* h_unpinned_fusion_buffer_tensor_;
64 |   std::vector<T*> unpinned_fusion_outputs_;
65 |   std::vector<const T*> unpinned_fusion_inputs_;
66 |   std::vector<size_t> unpinned_fusion_bytes_;
67 |   // Pinned-input bookkeeping; `h_`/`d_` presumably mean host/device (TODO).
68 |   int64 num_pinned_inputs_;
69 |   int64 pinned_input_bytes_;
70 |   int64 pinned_buffer_bytes_;
71 |   Tensor* h_pinned_buffer_tensor_;
72 |   Tensor* d_pinned_buffer_tensor_;
73 |   int8* h_pinned_buffer_;
74 |   int8* d_pinned_buffer_;
75 |   int8* d_pinned_input_raw_ptrs_;
76 |   int8* d_pinned_output_raw_ptrs_;
77 |   int64* d_pinned_input_sizes_;
78 |   int64* d_pinned_output_sizes_;
79 |   int64 max_pinned_output_size_;
80 |   int pinned_copy_block_size_;
81 | };
82 | 
83 | #endif  // GOOGLE_CUDA
84 | }  // namespace functor
85 | }  // namespace hybridbackend
86 | }  // namespace tensorflow
87 | 
88 | #endif  // HYBRIDBACKEND_TENSORFLOW
89 | #endif  // HYBRIDBACKEND_TENSORFLOW_OPS_TRANSFER_FUNCTORS_H_
90 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/data/tests/parquet_dataset_reshape_test.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''Parquet batch dataset ragged tensors test.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 
23 | import os
24 | from six.moves import xrange  # pylint: disable=redefined-builtin
25 | import tempfile
26 | import unittest
27 | 
28 | import numpy as np
29 | import pandas as pd
30 | import tensorflow as tf
31 | 
32 | import hybridbackend.common.test as hbtest
33 | import hybridbackend.tensorflow as hb
34 | 
35 | 
36 | # pylint: disable=missing-docstring
37 | class ParquetDatasetReshapeTest(unittest.TestCase):
38 |   def setUp(self):  # pylint: disable=invalid-name
39 |     os.environ['CUDA_VISIBLE_DEVICES'] = ''
40 |     self._workspace = tempfile.mkdtemp()
41 |     self._filename = os.path.join(self._workspace, 'reshape_test.parquet')
42 |     num_cols = 3
43 |     self._df = pd.DataFrame(
44 |       np.array([
45 |         [
46 |           np.random.randint(
47 |             0, 100,
48 |             size=(4,) if icol == 0 else (np.random.randint(1, 5),),
49 |             dtype=np.int64)
50 |           for icol in xrange(num_cols)]
51 |         for _ in xrange(100)], dtype=object),
52 |       columns=[f'col{c}' for c in xrange(num_cols)])
53 |     self._df.to_parquet(self._filename)
54 | 
55 |   def tearDown(self):  # pylint: disable=invalid-name
56 |     os.remove(self._filename)
57 |     del os.environ['CUDA_VISIBLE_DEVICES']
58 | 
59 |   def test_reshape(self):
60 |     batch_size = 32
61 |     with tf.Graph().as_default() as graph:
62 |       ds = hb.data.Dataset.from_parquet(
63 |         [self._filename],
64 |         fields=[
65 |           hb.data.DataFrame.Field('col2'),
66 |           hb.data.DataFrame.Field('col0', shape=[4])])
67 |       ds = ds.batch(batch_size)
68 |       ds = ds.prefetch(4)
69 |       batch = tf.data.make_one_shot_iterator(ds).get_next()
70 | 
71 |     c = self._df['col0']
72 |     with tf.Session(graph=graph) as sess:
73 |       for i in xrange(3):
74 |         result = sess.run(batch)
75 |         start_row = i * batch_size
76 |         end_row = (i + 1) * batch_size
77 |         expected_items = c[start_row:end_row].to_numpy().tolist()
78 |         expected_values = []
79 |         expected_splits = [0]
80 |         for item in expected_items:
81 |           expected_values.extend(item)
82 |           expected_splits.append(expected_splits[-1] + len(item))
83 |         expected = np.array(expected_values)
84 |         expected = np.reshape(expected, (batch_size, 4))
85 |         actual = result['col0']
86 |         np.testing.assert_allclose(actual, expected)
87 | 
88 | 
89 | if __name__ == '__main__':
90 |   hbtest.main(f'{__file__}.xml')  # passes '<this file>.xml' to the runner
91 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/training/server.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''Servers using hybrid parallelism.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 
23 | from tensorflow.python.training import monitored_session as _monitored_session
24 | from tensorflow.python.training import server_lib
25 | 
26 | from hybridbackend.tensorflow.framework.config import wraps_session_config
27 | from hybridbackend.tensorflow.framework.context import Context
28 | from hybridbackend.tensorflow.framework.rewriting import scope
29 | 
30 | 
31 | class HybridBackendServerBase(object):  # pylint: disable=useless-object-inheritance
32 |   r'''Marker base class; `wraps_server` checks it to avoid double wrapping.
33 |   '''
34 | 
35 | 
36 | def wraps_server(cls):
37 |   r'''Wraps server class `cls`; returns `cls` unchanged if already wrapped.
38 |   '''
39 |   if issubclass(cls, HybridBackendServerBase):
40 |     return cls
41 | 
42 |   class HybridBackendServer(cls, HybridBackendServerBase):
43 |     r'''An in-process TensorFlow server, for use in distributed training.
44 |     '''
45 |     _default = None  # instance cached by `get()`
46 | 
47 |     @classmethod
48 |     def get(class_):  # lazily builds the default server from the context
49 |       if class_._default is None:
50 |         class_._default = class_(None)
51 |       return class_._default
52 | 
53 |     def __init__(self, server_or_cluster_def, **kwargs):
54 |       r'''Creates a new server; `None` means "take it from `Context`".
55 |       '''
56 |       if server_or_cluster_def is None:
57 |         server_or_cluster_def = Context.get().cluster_spec
58 |         kwargs['job_name'] = Context.get().task_type
59 |         kwargs['task_index'] = Context.get().task_id
60 |       if server_or_cluster_def is None:  # context has no cluster: local mode
61 |         self._is_local = True
62 |         return  # the parent `__init__` is not called in local mode
63 |       self._is_local = False
64 |       kwargs['config'] = wraps_session_config(kwargs.pop('config', None))
65 |       super().__init__(server_or_cluster_def, **kwargs)
66 | 
67 |     @property
68 |     def target(self):
69 |       r'''Returns the target for a session to connect to this server.
70 |       '''
71 |       if self._is_local:
72 |         return ''
73 |       return super().target
74 | 
75 |     def monitored_session(self, **kwargs):
76 |       r'''Creates a `MonitoredTrainingSession` on `self.target` in `scope()`.
77 |       '''
78 |       with scope():
79 |         return _monitored_session.MonitoredTrainingSession(
80 |           master=self.target, **kwargs)
81 | 
82 |   return HybridBackendServer
83 | 
84 | 
85 | Server = wraps_server(server_lib.Server)  # wrapped `server_lib.Server`
86 | 
87 | 
88 | def monitored_session(**kwargs):
89 |   r'''Creates a `MonitoredSession` for training with default server.
90 |   '''
91 |   return Server.get().monitored_session(**kwargs)
92 | 
93 | 
94 | def target():
95 |   r'''HybridBackend server target.
96 |   '''
97 |   return Server.get().target
98 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/graph/optimize_floormod_shuffle.cc:
--------------------------------------------------------------------------------
 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #if HYBRIDBACKEND_TENSORFLOW
17 | 
18 | #include <vector>
19 | 
20 | #include "hybridbackend/common/env.h"
21 | #include "hybridbackend/tensorflow/graph/common/packing.h"
22 | #include "hybridbackend/tensorflow/graph/common/relocation.h"
23 | #include "hybridbackend/tensorflow/graph/common/replacing.h"
24 | #include "hybridbackend/tensorflow/graph/common/rewriting.h"
25 | #include "hybridbackend/tensorflow/graph/op_optimization.h"
26 | 
27 | namespace tensorflow {
28 | namespace hybridbackend {
29 | 
30 | namespace {
31 | // Env flag lookup, evaluated on the first call only and then cached.
32 | inline bool FloormodShuffleOptimizationDisabled() {
33 |   static const bool disabled = ::hybridbackend::EnvVarGetBool(
34 |       "HB_OP_FLOORMOD_SHUFFLE_OPTIMIZATION_DISABLED", false);
35 |   return disabled;
36 | }
37 | 
38 | // Env flag lookup, evaluated on the first call only and then cached.
39 | inline bool FloormodShufflePackingDisabled() {
40 |   static const bool disabled = ::hybridbackend::EnvVarGetBool(
41 |       "HB_OP_FLOORMOD_SHUFFLE_PACKING_DISABLED", false);
42 |   return disabled;
43 | }
44 | 
45 | }  // namespace
46 | 
47 | // Replacing pass built on Rewrite("FloormodShuffle", "HbFloormodShuffle").
48 | class OptimizeFloormodShuffleReplacingPass : public OpOptimizationPass {
49 |  public:
50 |   Status Optimize(Graph* graph, const SessionOptions* options,
51 |                   const bool disabled) override {
52 |     const bool skip = disabled || FloormodShuffleOptimizationDisabled();
53 |     if (TF_PREDICT_FALSE(skip)) {
54 |       return Status::OK();
55 |     }
56 |     TF_RETURN_IF_ERROR(Rewrite("FloormodShuffle", "HbFloormodShuffle")
57 |                            .WithIntAttr("num_partitions", 1)
58 |                            .In(graph));
59 |     return Status::OK();
60 |   }
61 | };
62 | 
63 | REGISTER_REPLACING_OPTIMIZATION(OptimizeFloormodShuffleReplacingPass);
64 | 
65 | // Reduction pass built on Pack("HbFloormodShuffle", "HbFloormodShuffleN").
66 | class OptimizeFloormodShuffleReductionPass : public OpOptimizationPass {
67 |  public:
68 |   Status Optimize(Graph* graph, const SessionOptions* options,
69 |                   const bool disabled) override {
70 |     // Nothing to do when this pass or the optimization is switched off.
71 |     if (TF_PREDICT_FALSE(disabled || FloormodShuffleOptimizationDisabled())) {
72 |       return Status::OK();
73 |     }
74 |     // Packing has its own switch; without it the graph is left untouched.
75 |     if (TF_PREDICT_FALSE(FloormodShufflePackingDisabled())) {
76 |       return Status::OK();
77 |     }
78 |     TF_RETURN_IF_ERROR(
79 |         Pack("HbFloormodShuffle", "HbFloormodShuffleN")
80 |             .WithTypeAttr("T", {DT_INT32, DT_INT64, DT_UINT32, DT_UINT64})
81 |             .WithIntAttr("num_partitions")
82 |             .In(graph));
83 |     return Status::OK();
84 |   }
85 | };
86 | 
87 | REGISTER_REDUCTION_OPTIMIZATION(OptimizeFloormodShuffleReductionPass);
88 | }  // namespace hybridbackend
89 | }  // namespace tensorflow
90 | 
91 | #endif  // HYBRIDBACKEND_TENSORFLOW
92 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/graph/optimize_partition_by_modulo.cc:
--------------------------------------------------------------------------------
 1 | /* Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #if HYBRIDBACKEND_TENSORFLOW
17 | 
18 | #include <vector>
19 | 
20 | #include "hybridbackend/common/env.h"
21 | #include "hybridbackend/tensorflow/graph/common/packing.h"
22 | #include "hybridbackend/tensorflow/graph/common/relocation.h"
23 | #include "hybridbackend/tensorflow/graph/common/replacing.h"
24 | #include "hybridbackend/tensorflow/graph/common/rewriting.h"
25 | #include "hybridbackend/tensorflow/graph/op_optimization.h"
26 | 
27 | namespace tensorflow {
28 | namespace hybridbackend {
29 | 
30 | namespace {
31 | // Env flag lookup, evaluated on the first call only and then cached.
32 | inline bool PartitionByModuloOptimizationDisabled() {
33 |   static const bool disabled = ::hybridbackend::EnvVarGetBool(
34 |       "HB_OP_PARTITION_BY_MODULO_OPTIMIZATION_DISABLED", false);
35 |   return disabled;
36 | }
37 | 
38 | // Env flag lookup, evaluated on the first call only and then cached.
39 | inline bool PartitionByModuloPackingDisabled() {
40 |   static const bool disabled = ::hybridbackend::EnvVarGetBool(
41 |       "HB_OP_PARTITION_BY_MODULO_PACKING_DISABLED", false);
42 |   return disabled;
43 | }
44 | 
45 | }  // namespace
46 | 
47 | // Replacing pass built on Rewrite("PartitionByModulo", "HbPartitionByModulo").
48 | class OptimizePartitionByModuloReplacingPass : public OpOptimizationPass {
49 |  public:
50 |   Status Optimize(Graph* graph, const SessionOptions* options,
51 |                   const bool disabled) override {
52 |     const bool skip = disabled || PartitionByModuloOptimizationDisabled();
53 |     if (TF_PREDICT_FALSE(skip)) {
54 |       return Status::OK();
55 |     }
56 |     TF_RETURN_IF_ERROR(Rewrite("PartitionByModulo", "HbPartitionByModulo")
57 |                            .WithIntAttr("num_partitions", 1)
58 |                            .In(graph));
59 |     return Status::OK();
60 |   }
61 | };
62 | 
63 | REGISTER_REPLACING_OPTIMIZATION(OptimizePartitionByModuloReplacingPass);
64 | 
65 | // Reduction pass built on Pack("HbPartitionByModulo", "HbPartitionByModuloN").
66 | class OptimizePartitionByModuloReductionPass : public OpOptimizationPass {
67 |  public:
68 |   Status Optimize(Graph* graph, const SessionOptions* options,
69 |                   const bool disabled) override {
70 |     // Nothing to do when this pass or the optimization is switched off.
71 |     if (TF_PREDICT_FALSE(disabled || PartitionByModuloOptimizationDisabled())) {
72 |       return Status::OK();
73 |     }
74 |     // Packing has its own switch; without it the graph is left untouched.
75 |     if (TF_PREDICT_FALSE(PartitionByModuloPackingDisabled())) {
76 |       return Status::OK();
77 |     }
78 |     TF_RETURN_IF_ERROR(
79 |         Pack("HbPartitionByModulo", "HbPartitionByModuloN")
80 |             .WithTypeAttr("T", {DT_INT32, DT_INT64, DT_UINT32, DT_UINT64})
81 |             .WithIntAttr("num_partitions")
82 |             .In(graph));
83 |     return Status::OK();
84 |   }
85 | };
86 | 
87 | REGISTER_REDUCTION_OPTIMIZATION(OptimizePartitionByModuloReductionPass);
88 | }  // namespace hybridbackend
89 | }  // namespace tensorflow
90 | 
91 | #endif  // HYBRIDBACKEND_TENSORFLOW
92 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/data/tabular/dataset.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''Dataset that reads tabular data.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 
23 | from tensorflow.python.util.deprecation import deprecated
24 | 
25 | # pylint: disable=ungrouped-imports
26 | try:
27 |   from hybridbackend.tensorflow.data.tabular.dataset_v2 import \
28 |     TabularDatasetV2 as Dataset
29 |   Dataset.__module__ = __name__
30 |   Dataset.__name__ = 'TabularDataset'
31 | except ImportError:
32 |   from hybridbackend.tensorflow.data.tabular.dataset_v1 import \
33 |     TabularDatasetV1 as Dataset
34 |   Dataset.__module__ = __name__
35 |   Dataset.__name__ = 'TabularDataset'
36 | 
37 | try:
38 |   from tensorflow.python.data.ops.dataset_ops import DatasetV2 as _dataset  # pylint: disable=unused-import, line-too-long # noqa: F401
39 | 
40 |   from hybridbackend.tensorflow.data.tabular.dataset_v2 import \
41 |     ParquetDatasetV2 as ParquetDataset
42 |   ParquetDataset.__module__ = __name__
43 |   ParquetDataset.__name__ = 'ParquetDataset'
44 | except ImportError:
45 |   from hybridbackend.tensorflow.data.tabular.dataset_v1 import \
46 |     ParquetDatasetV1 as ParquetDataset
47 |   ParquetDataset.__module__ = __name__
48 |   ParquetDataset.__name__ = 'ParquetDataset'
49 | # pylint: enable=ungrouped-imports
50 | 
51 | 
@deprecated(None, 'Prefer hb.data.Dataset.from_parquet instead.')
def read_parquet(
    batch_size,
    fields=None,
    partition_count=1,
    partition_index=0,
    drop_remainder=False,
    num_parallel_reads=None,
    num_sequential_reads=1):
  r'''Build a function that creates a `ParquetDataset` from filenames.

    The returned function takes the filenames as its only argument and
    forwards them, together with the options given here, to `ParquetDataset`.

    Args:
      batch_size: Maximum number of samples in an output batch.
      fields: (Optional.) List of DataFrame fields.
      partition_count: (Optional.) Count of row group partitions.
      partition_index: (Optional.) Index of row group partitions.
      drop_remainder: (Optional.) If True, only keep batches with exactly
        `batch_size` samples.
      num_parallel_reads: (Optional.) A `tf.int64` scalar representing the
        number of files to read in parallel. Defaults to reading files
        sequentially.
      num_sequential_reads: (Optional.) A `tf.int64` scalar representing the
        number of batches to read sequentially. Defaults to 1.

    Returns:
      A function mapping filenames to a `ParquetDataset`.
    '''
  # Everything except the filenames is fixed at this point.
  reader_options = {
    'batch_size': batch_size,
    'fields': fields,
    'partition_count': partition_count,
    'partition_index': partition_index,
    'drop_remainder': drop_remainder,
    'num_parallel_reads': num_parallel_reads,
    'num_sequential_reads': num_sequential_reads,
  }

  def _apply_fn(filenames):
    return ParquetDataset(filenames, **reader_options)

  return _apply_fn
87 | 


--------------------------------------------------------------------------------
/docs/tutorial/ranking/optimization.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''Functions for optimization
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 
23 | import tensorflow as tf
24 | 
25 | 
def lr_with_linear_warmup_and_polynomial_decay(
    global_step,
    initial_value=24.,
    scaling_factor=1.,
    warmup_steps=None,
    decay_steps=None,
    decay_start_step=None,
    decay_exp=2,
    epsilon=1.e-7):
  r'''Calculates learning rate with linear warmup and polynomial decay.

  The base rate is `initial_value * scaling_factor`. Depending on which
  arguments are given, the result is:

  * `warmup_steps` is None: the constant base rate.
  * only `warmup_steps` is given: the rate grows linearly from 0 to the base
    rate over `warmup_steps` steps and then stays at the base rate.
  * `warmup_steps`, `decay_steps` and `decay_start_step` are given: linear
    warmup while `global_step < warmup_steps`, the base rate until
    `decay_start_step`, and polynomial decay over `decay_steps` steps after
    `global_step > decay_start_step`, never going below `epsilon`.
    If `decay_start_step < warmup_steps` the warmup and decay terms overlap
    and are added together.

  Args:
    global_step: Variable representing the current step.
    initial_value: Initial value of learning rates.
    scaling_factor: Factor for scaling.
    warmup_steps: Steps of warmup.
    decay_steps: Steps of decay.
    decay_start_step: Start step of decay.
    decay_exp: Exponent part of decay.
    epsilon: Lower bound of the decayed learning rate. Also used as the
      threshold for deciding that neither warmup nor decay is active.

  Returns:
    New learning rate tensor.
  '''
  initial_lr = tf.constant(initial_value * scaling_factor, tf.float32)

  if warmup_steps is None:
    return initial_lr

  global_step = tf.cast(global_step, tf.float32)
  warmup_steps = tf.constant(warmup_steps, tf.float32)
  warmup_rate = initial_lr / warmup_steps
  # Equivalent to initial_lr * global_step / warmup_steps.
  warmup_lr = initial_lr - (warmup_steps - global_step) * warmup_rate

  if decay_steps is None or decay_start_step is None:
    # Without this clamp the rate would keep growing linearly after warmup
    # has finished; hold it at the base rate instead, as the full schedule
    # below does between warmup and decay.
    return tf.minimum(warmup_lr, initial_lr)

  decay_start_step = tf.constant(decay_start_step, tf.float32)
  steps_since_decay_start = global_step - decay_start_step
  decay_steps = tf.constant(decay_steps, tf.float32)
  # Stop decaying once `decay_steps` steps of decay have been applied.
  decayed_steps = tf.minimum(steps_since_decay_start, decay_steps)
  to_decay_rate = (decay_steps - decayed_steps) / decay_steps
  decay_lr = initial_lr * to_decay_rate**decay_exp
  decay_lr = tf.maximum(decay_lr, tf.constant(epsilon))

  # 0/1 masks selecting the phase that applies to the current step.
  warmup_lambda = tf.cast(global_step < warmup_steps, tf.float32)
  decay_lambda = tf.cast(global_step > decay_start_step, tf.float32)
  # Active only when both masks above are zero.
  initial_lambda = tf.cast(
    tf.math.abs(warmup_lambda + decay_lambda) < epsilon, tf.float32)

  lr = warmup_lambda * warmup_lr
  lr += decay_lambda * decay_lr
  lr += initial_lambda * initial_lr
  return lr
79 | 
80 | 
def sgd_decay_optimize(
    loss,
    lr_initial_value,
    lr_warmup_steps,
    lr_decay_start_step,
    lr_decay_steps):
  r'''Minimize `loss` with SGD using a warmup/decay learning rate schedule.

  Args:
    loss: Loss to minimize.
    lr_initial_value: Initial value of the learning rate.
    lr_warmup_steps: Steps of warmup.
    lr_decay_start_step: Start step of decay.
    lr_decay_steps: Steps of decay.

  Returns:
    Result of the optimizer's `minimize` call; the global step is passed to
    it as `global_step`.
  '''
  global_step = tf.train.get_or_create_global_step()
  schedule_options = {
    'initial_value': lr_initial_value,
    'warmup_steps': lr_warmup_steps,
    'decay_start_step': lr_decay_start_step,
    'decay_steps': lr_decay_steps,
  }
  learning_rate = lr_with_linear_warmup_and_polynomial_decay(
    global_step, **schedule_options)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
  return optimizer.minimize(loss, global_step=global_step)
98 | 


--------------------------------------------------------------------------------
/docs/tutorial/ranking/criteo/data/prep.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | # =============================================================================
17 | 
18 | r'''Prepare Criteo 1TB Click Logs Dataset.
19 | 
20 | See https://ailab.criteo.com/download-criteo-1tb-click-logs-dataset/ for more
21 | information.
22 | '''
23 | 
24 | from __future__ import absolute_import
25 | from __future__ import division
26 | from __future__ import print_function
27 | 
28 | import argparse
29 | import os
30 | import warnings
31 | 
32 | import numpy as np
33 | import pandas as pd
34 | import pyarrow as pa
35 | import pyarrow.parquet as pq
36 | import tqdm
37 | 
38 | 
def main(args):
  r'''Convert a tab-separated click log file into a Parquet file.

  The input `args.fname` is read in chunks of `args.row_group_size` rows and
  each chunk is written as one table to `<fname without extension>.parquet`.
  Columns are one label, 13 integer features (decimal text) and 26
  categorical features (hexadecimal text); empty feature fields are replaced
  by `args.null_value`.

  Args:
    args: Parsed command line arguments.

  Raises:
    Exception: Any failure during conversion is re-raised after a
      `RuntimeWarning` has been issued.
  '''
  def _decimal_or_null(text):
    return int(text) if text else args.null_value

  def _hex_or_null(text):
    return int(text, 16) if text else args.null_value

  label_names = [args.label_prefix]
  if_names = [f'{args.integer_features_prefix}{i}' for i in range(13)]
  cf_names = [f'{args.categorical_features_prefix}{i}' for i in range(26)]
  column_names = label_names + if_names + cf_names

  # Labels and integer features are 32-bit, categorical features 64-bit.
  pa_schema = pa.schema(
    [(name, pa.int32()) for name in label_names + if_names]
    + [(name, pa.int64()) for name in cf_names])
  pd_schema = dict.fromkeys(label_names + if_names, np.int32)
  pd_schema.update(dict.fromkeys(cf_names, np.int64))

  converters = dict.fromkeys(label_names, np.int32)
  converters.update(dict.fromkeys(if_names, _decimal_or_null))
  converters.update(dict.fromkeys(cf_names, _hex_or_null))

  parquet_fname = f'{os.path.splitext(args.fname)[0]}.parquet'
  try:
    with pq.ParquetWriter(
        parquet_fname, pa_schema,
        use_dictionary=not args.no_use_dictionary,
        compression=args.compression,
        flavor=args.flavor) as writer:
      chunks = pd.read_csv(
        args.fname,
        sep='\t',
        names=column_names,
        converters=converters,
        chunksize=args.row_group_size)
      progress = tqdm.tqdm(
        chunks,
        desc=f'Prepare dataset from {args.fname}',
        unit='blocks')
      for chunk in progress:
        writer.write_table(
          pa.Table.from_pandas(chunk.astype(pd_schema), preserve_index=False))
  except Exception:
    warnings.warn(
      f'Failed to prepare dataset from {args.fname}',
      RuntimeWarning)
    raise
84 | 
85 | 
if __name__ == '__main__':
  # Command line definition, in declaration order: (name, add_argument kwargs).
  argument_table = [
    ('--label-prefix', {'default': 'label'}),
    ('--integer-features-prefix', {'default': 'if'}),
    ('--categorical-features-prefix', {'default': 'cf'}),
    ('--compression', {'default': 'zstd'}),
    ('--flavor', {'default': 'spark'}),
    ('--no-use-dictionary', {'default': False, 'action': 'store_true'}),
    ('--row-group-size', {'type': int, 'default': 1000000}),
    ('--null-value', {'type': int, 'default': -1 << 16}),
    ('fname', {}),
  ]
  cli = argparse.ArgumentParser()
  for argument_name, argument_options in argument_table:
    cli.add_argument(argument_name, **argument_options)
  main(cli.parse_args())
98 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/data/tests/parquet_dataset_ragged_nested_test.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''Parquet batch dataset nested ragged tensors test.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 
23 | import os
24 | import tempfile
25 | import unittest
26 | 
27 | import numpy as np
28 | import pyarrow as pa
29 | import pyarrow.parquet as pq
30 | import tensorflow as tf
31 | 
32 | import hybridbackend.common.test as hbtest
33 | import hybridbackend.tensorflow as hb
34 | 
35 | 
36 | # pylint: disable=missing-docstring
class ParquetDatasetRaggedNestedTest(unittest.TestCase):
  r'''Reads a Parquet file holding one nested list column named `A`.

  Each test gets a fresh temporary directory containing the file; the
  directory and the `CUDA_VISIBLE_DEVICES` setting are restored afterwards.
  '''
  def setUp(self):  # pylint: disable=invalid-name
    # Remember the caller's setting so that tearDown can put it back.
    self._prev_cuda_visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES')
    os.environ['CUDA_VISIBLE_DEVICES'] = ''
    self._workspace = tempfile.mkdtemp()
    self._filename = os.path.join(
      self._workspace, 'ragged_test_pyarrow.parquet')
    self._data = pa.array(
      [[[1], [2, 3]], [[4], [5]]], pa.list_(pa.list_(pa.int64())))
    table = pa.Table.from_arrays([self._data], ['A'])
    pq.write_table(table, self._filename, compression='ZSTD')

  def tearDown(self):  # pylint: disable=invalid-name
    os.remove(self._filename)
    # mkdtemp() leaves removal of the directory to its caller; the Parquet
    # file was its only content.
    os.rmdir(self._workspace)
    if self._prev_cuda_visible_devices is None:
      os.environ.pop('CUDA_VISIBLE_DEVICES', None)
    else:
      os.environ['CUDA_VISIBLE_DEVICES'] = self._prev_cuda_visible_devices

  def test_read(self):
    r'''Batches read through `ParquetDataset` match the written data.
    '''
    with tf.Graph().as_default() as graph:
      ds = hb.data.ParquetDataset(
        [self._filename],
        batch_size=2)
      ds = ds.apply(hb.data.rebatch(2))
      ds = ds.prefetch(4)
      batch = tf.data.make_one_shot_iterator(ds).get_next()

    with tf.Session(graph=graph) as sess:
      actual = sess.run(batch)['A'].to_list()
      expected = self._data.to_pylist()
      np.testing.assert_equal(actual, expected)

  def test_apply_to_sparse(self):
    r'''`Dataset.from_parquet` output matches the sparse form of the data.
    '''
    with tf.Graph().as_default() as graph:
      ds = hb.data.Dataset.from_parquet([self._filename])
      ds = ds.batch(2)
      batch = tf.data.make_one_shot_iterator(ds).get_next()['A']
      baseline = tf.ragged.constant(self._data.to_pylist()).to_sparse()

    with tf.Session(graph=graph) as sess:
      actual, expected = sess.run([batch, baseline])
      np.testing.assert_equal(actual.indices, expected.indices)
      np.testing.assert_equal(actual.values, expected.values)
      np.testing.assert_equal(actual.dense_shape, expected.dense_shape)

  def test_apply_to_tensor(self):
    r'''With `to_dense=True` the output matches the dense form of the data.
    '''
    with tf.Graph().as_default() as graph:
      ds = hb.data.Dataset.from_parquet([self._filename], to_dense=True)
      ds = ds.batch(2)
      batch = tf.data.make_one_shot_iterator(ds).get_next()['A']
      baseline = tf.ragged.constant(self._data.to_pylist()).to_tensor()

    with tf.Session(graph=graph) as sess:
      actual, expected = sess.run([batch, baseline])
      np.testing.assert_equal(actual, expected)
89 | 
90 | 
91 | if __name__ == '__main__':
92 |   hbtest.main(f'{__file__}.xml')
93 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/data/tests/sync_replicas_dataset_test.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # =============================================================================
 15 | 
 16 | r'''Test for out-of-range detect.
 17 | '''
 18 | 
 19 | from __future__ import absolute_import
 20 | from __future__ import division
 21 | from __future__ import print_function
 22 | 
 23 | import os
 24 | import unittest
 25 | 
 26 | import numpy as np
 27 | 
 28 | import hybridbackend.common.test as hbtest
 29 | 
 30 | 
 31 | # pylint: disable=missing-docstring
 32 | def _test_single(_):
 33 |   r'''Testing on a single worker
 34 |   '''
 35 |   # pylint: disable=import-outside-toplevel
 36 |   import tensorflow as tf
 37 | 
 38 |   import hybridbackend.tensorflow as hb
 39 | 
 40 |   batch_size = 10
 41 | 
 42 |   with tf.Graph().as_default():
 43 |     with hb.scope(mode=tf.estimator.ModeKeys.TRAIN):
 44 |       with tf.device('/cpu:0'):
 45 |         ds = tf.data.Dataset.range(100)
 46 |         ds = ds.batch(batch_size=batch_size)
 47 |         iterator = tf.data.make_one_shot_iterator(ds)
 48 |         batch = iterator.get_next()
 49 |       with tf.train.MonitoredTrainingSession('') as sess:
 50 |         final_result = None
 51 |         while not sess.should_stop():
 52 |           final_result = sess.run(batch)
 53 |         return final_result
 54 | 
 55 | 
 56 | def _test_distributed(rank):
 57 |   r'''Testing on multiple distributed workers
 58 |   '''
 59 |   # pylint: disable=import-outside-toplevel
 60 |   import tensorflow as tf
 61 | 
 62 |   import hybridbackend.tensorflow as hb
 63 | 
 64 |   batch_size = 10
 65 | 
 66 |   with tf.Graph().as_default():
 67 |     with hb.scope(
 68 |         data_sync_drop_remainder=False, mode=tf.estimator.ModeKeys.TRAIN):
 69 |       with tf.device('/cpu:0'):
 70 |         ds = tf.data.Dataset.range(100 + rank * 50)
 71 |         ds = ds.batch(batch_size=batch_size)
 72 |         iterator = tf.data.make_one_shot_iterator(ds)
 73 |         batch = iterator.get_next()
 74 |       with tf.train.MonitoredTrainingSession('') as sess:
 75 |         final_result = None
 76 |         while not sess.should_stop():
 77 |           final_result = sess.run(batch)
 78 |         return final_result
 79 | 
 80 | 
 81 | @unittest.skipUnless(
 82 |   os.getenv('HYBRIDBACKEND_WITH_CUDA') == 'ON', 'GPU required')
 83 | @unittest.skipUnless(
 84 |   os.getenv('HYBRIDBACKEND_WITH_NCCL') == 'ON', 'NCCL required')
 85 | class DetectEndTest(unittest.TestCase):
 86 |   r'''Tests for the out-of-range sync.
 87 |   '''
 88 |   def setUp(self):  # pylint: disable=invalid-name
 89 |     os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
 90 | 
 91 |   def test_single(self):
 92 |     results = hbtest.Spawn()(_test_single)
 93 |     np.testing.assert_equal(
 94 |       results[0], [90, 91, 92, 93, 94, 95, 96, 97, 98, 99])
 95 | 
 96 |   def test_parallel(self):
 97 |     results = hbtest.Spawn(2)(_test_distributed)
 98 |     np.testing.assert_equal(results[0], [])
 99 |     np.testing.assert_equal(
100 |       results[1], [140, 141, 142, 143, 144, 145, 146, 147, 148, 149])
101 | 
102 | 
103 | if __name__ == '__main__':
104 |   hbtest.main(f'{__file__}.xml')
105 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/training/variables.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Alibaba Group Holding Limited. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 | 
16 | r'''Variable utilities for training.
17 | '''
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 
23 | import contextlib
24 | 
25 | from tensorflow.python.framework import ops
26 | from tensorflow.python.keras.backend import reset_uids as reset_keras_uids
27 | from tensorflow.python.ops import math_ops
28 | from tensorflow.python.ops import state_ops
29 | from tensorflow.python.ops import variable_scope as vs
30 | 
31 | 
class ReuseVariables(object):  # pylint: disable=useless-object-inheritance
  r'''Callable that sets the reuse flag of the current variable scope.
  '''
  def __call__(self, reuse):
    r'''Reset naming counters and set the reuse flag.

    Args:
      reuse: Value stored as the reuse flag of the current variable scope.
    '''
    reset_keras_uids()
    default_graph = ops.get_default_graph()
    scope_stores = default_graph.get_collection_ref(('__varscope',))
    if scope_stores:
      # Forget how often each scope name was used so far.
      scope_stores[0].variable_scopes_count.clear()
    current_scope = vs.get_variable_scope()
    current_scope._reuse = reuse  # pylint: disable=protected-access
41 | 
42 | 
@contextlib.contextmanager
def reuse_variables(reuse=None):
  r'''Context manager that reuses variables.

  Args:
    reuse: (Optional.) Reuse flag applied on entry. If None, the flag is left
      as it is on entry.

  Yields:
    A `ReuseVariables` callable that can be used inside the context to change
    the reuse flag again.

  On exit the reuse flag of the current variable scope is set back to the
  value it had on entry.
  '''
  # Capture state before entering the try block: if this setup failed inside
  # it, the finally clause would hit an unbound `prev_reuse` and hide the
  # original error.
  fn = ReuseVariables()
  prev_reuse = vs.get_variable_scope()._reuse  # pylint: disable=protected-access
  try:
    if reuse is not None:
      fn(reuse)
    yield fn
  finally:
    vs.get_variable_scope()._reuse = prev_reuse  # pylint: disable=protected-access
55 | 
56 | 
@contextlib.contextmanager
def disable_variable_update():
  r'''Context manager that disable update in state_ops's assign operations

  While the context is active, the `assign`, `assign_sub` and `assign_add`
  attributes of the `state_ops` module are replaced by functions that compute
  the would-be result without assigning it:

  * `assign` returns `value`.
  * `assign_sub` returns `math_ops.subtract(ref, value)`.
  * `assign_add` returns `math_ops.add(ref, value)`.

  The original functions are put back on exit, also when the body raises.
  '''
  def wrapped_assign(
      ref, value, validate_shape=None, use_locking=None, name=None):  # pylint: disable=unused-argument
    r'''Disable the assign op
    '''
    return value

  def wrapped_assign_sub(ref, value, use_locking=None, name=None):  # pylint: disable=unused-argument
    r'''Disable the assign_sub op
    '''
    return math_ops.subtract(ref, value)

  def wrapped_assign_add(ref, value, use_locking=None, name=None):  # pylint: disable=unused-argument
    r'''Disable the assign_add op
    '''
    return math_ops.add(ref, value)

  # Save the originals before entering the try block so that the finally
  # clause can always refer to them.
  prev_assign = state_ops.assign
  prev_assign_sub = state_ops.assign_sub
  prev_assign_add = state_ops.assign_add
  try:
    state_ops.assign = wrapped_assign
    state_ops.assign_sub = wrapped_assign_sub
    state_ops.assign_add = wrapped_assign_add

    yield

  finally:
    state_ops.assign = prev_assign
    state_ops.assign_sub = prev_assign_sub
    state_ops.assign_add = prev_assign_add
97 | 


--------------------------------------------------------------------------------
/hybridbackend/tensorflow/Makefile:
--------------------------------------------------------------------------------
# Directory searched for the TensorFlow backend sources.
TENSORFLOW_SRC := hybridbackend/tensorflow/

# With HYBRIDBACKEND_WITH_BUILDINFO=ON, ask the TensorFlow installation of
# $(PYTHON) for "tf<version>-<git version>" and pass it to the compiler as the
# string macro HYBRIDBACKEND_BUILD_FRAMEWORK.
# The probe's stderr is discarded, so a failing probe yields an empty string.
ifeq ($(HYBRIDBACKEND_WITH_BUILDINFO),ON)
HYBRIDBACKEND_BUILD_FRAMEWORK := $(shell \
	$(PYTHON) -c \
	"import tensorflow as tf; print('tf{}-{}'.format(tf.__version__, tf.__git_version__))" \
	2>/dev/null)
CFLAGS := $(CFLAGS) \
	-DHYBRIDBACKEND_BUILD_FRAMEWORK="\"$(HYBRIDBACKEND_BUILD_FRAMEWORK)\""
endif
 11 | 
# Compile flags for TensorFlow-dependent sources: fixed Eigen defines followed
# by the flags reported by tf.sysconfig.get_compile_flags(), with every
# leading "-I" rewritten to "-isystem ".
# NOTE(review): presumably -isystem is used to silence warnings coming from
# TensorFlow headers -- confirm.
# The probe's stderr is discarded, so a failing probe adds no flags.
TENSORFLOW_CFLAGS := \
	-DEIGEN_MPL2_ONLY \
	-DEIGEN_MAX_ALIGN_BYTES=64 \
	-DEIGEN_HAS_TYPE_TRAITS=0 \
	$(shell \
	$(PYTHON) -c \
	"import tensorflow as tf; cflags=tf.sysconfig.get_compile_flags(); print(' '.join([c.replace('-I', '-isystem ', 1) if c.startswith('-I') else c for c in cflags]))" 2>/dev/null)

# CUDA builds additionally define GOOGLE_CUDA=1.
ifeq ($(HYBRIDBACKEND_WITH_CUDA),ON)
TENSORFLOW_CFLAGS := $(TENSORFLOW_CFLAGS) -DGOOGLE_CUDA=1
endif
 23 | 
 24 | ifeq ($(OS),Darwin)
 25 | TENSORFLOW_LDFLAGS := \
 26 | 	$(shell \
 27 | 	$(PYTHON) -c \
 28 | 	"import tensorflow as tf; ldflags=tf.sysconfig.get_link_flags(); print(' '.join(ldflags))" 2>/dev/null)
 29 | TENSORFLOW_LDFLAGS := $(subst -l:libtensorflow_framework.1.dylib,-ltensorflow_framework,$(TENSORFLOW_LDFLAGS))
 30 | else
 31 |  TENSORFLOW_LDFLAGS := \
 32 | 	-Wl,-rpath='$$ORIGIN/..:$$ORIGIN/../../tensorflow' \
 33 |  	$(shell \
 34 | 	$(PYTHON) -c \
 35 |  	"import tensorflow as tf; ldflags=tf.sysconfig.get_link_flags(); print(' '.join(ldflags))" 2>/dev/null)
 36 | endif
 37 | 
 38 | TENSORFLOW_CC_SOURCES := $(shell \
 39 | 	find $(TENSORFLOW_SRC) -type f \
 40 | 	\( -name "*.cc" ! -name "*.cu*" \) \
 41 | 	-exec realpath {} --relative-to . \;)
 42 | 
 43 | TENSORFLOW_OBJS := $(TENSORFLOW_CC_SOURCES:.cc=.o)
 44 | ifeq ($(OS),Darwin)
 45 | $(TENSORFLOW_OBJS): %.o:%.cc $(THIRDPARTY_DEPS)
 46 | 	mkdir -p $(dir $@)
 47 | 	$(CXX) $(CFLAGS) $(TENSORFLOW_CFLAGS) $(CXX_CFLAGS) \
 48 | 	-MMD -MP -MF $<.d -o $@ -c $< -fpic
 49 | else
 50 | $(TENSORFLOW_OBJS): %.o:%.cc $(THIRDPARTY_DEPS)
 51 | 	mkdir -p $(dir $@)
 52 | 	$(CXX) $(CFLAGS) $(TENSORFLOW_CFLAGS) $(CXX_CFLAGS) \
 53 | 	-MMD -MP -MF $<.d -o $@ -c $< -fpic
 54 | 	sed -i '/site-packages/d' $<.d
 55 | 	sed -i '/^$$/N;/^\n$$/D' $<.d
 56 | endif
 57 | 
# CUDA objects are only built with HYBRIDBACKEND_WITH_CUDA=ON; otherwise
# TENSORFLOW_ALL_OBJS consists of the C++ objects alone.
ifeq ($(HYBRIDBACKEND_WITH_CUDA),ON)
# CUDA sources: every *.cu.cc file under $(TENSORFLOW_SRC), as paths relative
# to the current directory.
TENSORFLOW_CU_SOURCES := $(shell \
	find $(TENSORFLOW_SRC) -type f \
	\( -name '*.cu.cc' \) \
	-exec realpath {} --relative-to . \;)

TENSORFLOW_CU_OBJS := $(TENSORFLOW_CU_SOURCES:.cc=.o)
# NOTE(review): unlike the rule for $(TENSORFLOW_OBJS), the rules below do not
# list $(THIRDPARTY_DEPS) as a prerequisite -- confirm that this is intended.
ifeq ($(OS),Darwin)
# Darwin: compile with nvcc only; no dependency file is written.
$(TENSORFLOW_CU_OBJS): %.o:%.cc
	mkdir -p $(dir $@)
	$(NVCC) $(NVCC_CFLAGS) \
		-o $@ -c $< $(CFLAGS) $(TENSORFLOW_CFLAGS) -x cu \
		-Xcompiler -fPIC
else
# Elsewhere: first write a dependency file to <source>.d from the output of
# "nvcc -M" (lines mentioning /usr/ or site-packages are dropped, the object
# name is replaced by its full path and a leading "./" is stripped), squeeze
# repeated blank lines in it, then compile.
$(TENSORFLOW_CU_OBJS): %.o:%.cc
	mkdir -p $(dir $@)
	@$(NVCC) -M $< $(CFLAGS) $(TENSORFLOW_CFLAGS) -x cu \
	 | grep -v '/usr/' \
	 | grep -v 'site-packages' \
	 | sed 's|$(notdir $@)|$@|g' \
	 | sed 's|^\./||g' \
	 > $<.d
	sed -i '/^$$/N;/^\n$$/D' $<.d
	$(NVCC) $(NVCC_CFLAGS) \
		-o $@ -c $< $(CFLAGS) $(TENSORFLOW_CFLAGS) -x cu \
		-Xcompiler -fPIC
endif
TENSORFLOW_ALL_OBJS := $(TENSORFLOW_OBJS) $(TENSORFLOW_CU_OBJS)
else
TENSORFLOW_ALL_OBJS := $(TENSORFLOW_OBJS)
endif
 89 | 
 90 | ifeq ($(OS),Darwin)
 91 | $(TENSORFLOW_LIB): $(TENSORFLOW_ALL_OBJS) $(COMMON_LIB)
 92 | 	mkdir -p $(dir $@)
 93 | 	$(CXX) $(CFLAGS) -std=c++11 \
 94 | 	-install_name @rpath/lib$(LIBNAME)_tensorflow.so \
 95 | 	-o $@ $(TENSORFLOW_ALL_OBJS) \
 96 | 	$(LDFLAGS) \
 97 | 	$(TENSORFLOW_LDFLAGS) \
 98 | 	-L$(LIBNAME)/ -l$(LIBNAME)
 99 | else
100 | $(TENSORFLOW_LIB): $(TENSORFLOW_ALL_OBJS) $(COMMON_LIB)
101 | 	mkdir -p $(dir $@)
102 | 	$(CXX) $(CFLAGS) -std=c++11 \
103 | 	-o $@ $(TENSORFLOW_ALL_OBJS) \
104 | 	$(LDFLAGS) \
105 | 	$(TENSORFLOW_LDFLAGS) \
106 | 	-L$(LIBNAME)/ -l$(LIBNAME)
107 | endif
108 | 


--------------------------------------------------------------------------------