├── .gitignore ├── LICENSE ├── README.md ├── bench ├── dlrm_s_benchmark.sh ├── dlrm_s_criteo_kaggle.sh ├── dlrm_s_criteo_kaggle_C1.sh ├── dlrm_s_criteo_kaggle_C1_C2.sh ├── dlrm_s_criteo_kaggle_C1_C2_C3.sh ├── dlrm_s_criteo_kaggle_lock_gpu_C1.sh ├── dlrm_s_criteo_terabyte.sh └── run_and_time.sh ├── cache_algo ├── EvLFU_C1.py ├── EvLFU_C1_Cython │ ├── EvLFU.cpp │ ├── EvLFU.cpython-36m-x86_64-linux-gnu.so │ ├── EvLFU.pyx │ ├── evlfu.hpp │ ├── script.sh │ ├── setup_EvLFU.py │ └── test.py ├── LFU.py ├── LRU.py ├── cpp_socket_client.py └── old_versions │ ├── EvLFU4DLRM_C2.py │ ├── EvLFU_C1_apprx_emb.py │ ├── EvLFU_C1_sets.py │ ├── EvLFU_C1_v0.py │ ├── EvLFU_C1_v1.py │ ├── LFU_v0.py │ ├── LFU_v1.py │ ├── LFU_v2.py │ └── LRU_v0.py ├── cython ├── cython_compile.py └── cython_criteo.py ├── data_utils.py ├── dlrm_data_pytorch.py ├── dlrm_s_pytorch.py ├── dlrm_s_pytorch_C1.py ├── dlrm_s_pytorch_C1_C2.py ├── dlrm_s_pytorch_C1_C2_C3.py ├── dlrm_s_pytorch_lock_gpu_C1.py ├── emb_storage ├── file_read.py ├── mmap_file_read.py ├── multi_storage_dummy │ └── socket-server.py ├── storage_dummy.py ├── storage_manager.py ├── storage_rocksdb.py ├── storage_rocksdb_26_tabs.py ├── storage_sqlite.py └── storage_sqlite_26_tabs.py ├── evstore_utils.py ├── experiments.md ├── extend_distributed.py ├── input ├── .gitignore └── readme.txt ├── logs ├── .gitignore ├── sample-inference-criteo_kaggle_5mil.txt ├── sample-inference-criteo_kaggle_all.txt └── sample-train-criteo_kaggle_5mil.txt ├── misc ├── README.txt ├── dlrm_data_caffe2.py ├── dlrm_s_caffe2.py ├── mixed_precs_caching_v0 │ ├── .gitignore │ ├── cache_manager.cpp │ ├── cache_manager.hpp │ ├── dlrm_client.py │ ├── evlfu_16.cpp │ ├── evlfu_16.hpp │ ├── evlfu_32.cpp │ ├── evlfu_32.hpp │ ├── evlfu_4.cpp │ ├── evlfu_4.hpp │ ├── evlfu_8.cpp │ ├── evlfu_8.hpp │ ├── readme.txt │ └── test.cpp └── testing_tensor_cpp │ ├── CMakeLists.txt │ ├── evlfu_tensor.cpp │ ├── evlfu_tensor.hpp │ └── sample_client.py ├── mixed_precs_caching ├── .gitignore ├── aprx_embedding.cpp ├── aprx_embedding.hpp ├── cache_manager.cpp ├── cache_manager.hpp ├── dlrm_client.py ├── evlfu_16.cpp ├── evlfu_16.hpp ├── evlfu_32.cpp ├── evlfu_32.hpp ├── evlfu_4.cpp ├── evlfu_4.hpp ├── evlfu_8.cpp ├── evlfu_8.hpp ├── lib │ └── .gitignore ├── readme.txt ├── test.cpp └── test.py ├── mlperf_logger.py ├── optim └── rwsadagrad.py ├── script ├── apply_ev_preconditioning.py ├── approximate_embedding │ └── phase2_similarity_analysis │ │ ├── README.txt │ │ ├── csvReader.py │ │ ├── get_neighbors_CPU_slow.ipynb │ │ ├── get_neighbors_GPU.ipynb │ │ ├── most_popular_neighbor.ipynb │ │ └── rankedWorkload.csv ├── compress_folder_for_github.sh ├── convert_altkeys_to_binary.py ├── convert_ev_to_binary.py ├── data_loader_terabyte.py ├── dissectingmodel.py ├── free_page_cache.sh ├── gnuplot_cdf_direct_io.plt ├── gnuplot_cdf_evlfu_lru.plt ├── gnuplot_cdf_multi_line.plt ├── gnuplot_graph │ └── cdf_2_line.plt ├── modify_param.py ├── mount_cham_obj_stor.sh ├── plot_cdf.py ├── read_cham_obj_stor.sh ├── reduce_precision.py ├── uncompress_folder_for_github.sh └── wget_evstore_dataset.sh ├── stored_model └── .gitignore ├── test └── dlrm_s_test.sh ├── tools └── visualize.py └── tricks ├── md_embedding_bag.py └── qr_embedding_bag.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.log 2 | logs-old/ 3 | !logs 4 | *__pycache__ 5 | */__pycache__ 6 | */*__pycache__ 7 | */*/__pycache__ 8 | *.out 9 | run_kaggle_pt 10 | model.pth 11 | *.DS_Store 12 | */.DS_Store 13 | */*.DS_Store 14 | 
*/*/.DS_Store 15 | *.ipynb_checkpoints 16 | */.ipynb_checkpoints 17 | */*.ipynb_checkpoints 18 | */*/.ipynb_checkpoints 19 | file_to_download.txt 20 | index.html 21 | test.txt 22 | out.txt 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![License: GPL v3](https://img.shields.io/badge/License-GPL%20v3-blue.svg)](https://www.gnu.org/licenses/old-licenses/gpl-3.0.en.html) 2 | [![Platform](https://img.shields.io/badge/Platform-x86--64-brightgreen)](https://shields.io/) 3 | 4 | ``` 5 | _______ ______ _ 6 | | ____\ \ / / ___|| |_ ___ _ __ ___ 7 | | _| \ \ / /\___ \| __/ _ \| '__/ _ \ 8 | | |___ \ V / ___) | || (_) | | | __/ 9 | |_____| \_/ |____/ \__\___/|_| \___| -- Groupability-aware caching systems for DRS 10 | 11 | ``` 12 | 13 | This repository contains the implementation code for paper:
14 | **EVSTORE: Storage and Caching Capabilities for Scaling 15 | Embedding Tables in Deep Recommendation Systems**
16 | 17 | Contact Information 18 | -------------------- 19 | 20 | **Maintainer**: [Daniar H. Kurniawan](https://people.cs.uchicago.edu/~daniar/), Email: ``daniar@uchicago.edu`` 21 | 22 | [//]: <> (**Daniar is on the job market.** Please contact him if you have an opening for an AIOps and ML-Sys engineer role!) 23 | 24 | Feel free to contact Daniar for any suggestions/feedback, bug 25 | reports, or general discussions. 26 | 27 | Please consider citing our EVStore paper at ASPLOS 2023 if you use EVStore. The bib 28 | entry is 29 | 30 | ``` 31 | @InProceedings{Daniar-EVStore, 32 | Author = {Daniar H. Kurniawan and Ruipu Wang and Kahfi S. Zulkifli and Fandi A. Wiranata and John Bent and Ymir Vigfusson and Haryadi S. Gunawi}, 33 | Title = "EVSTORE: Storage and Caching Capabilities for Scaling 34 | Embedding Tables in Deep Recommendation Systems", 35 | Booktitle = {Proceedings of the 28th International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS)}, 36 | Address = {Vancouver, Canada}, 37 | Month = {MARCH}, 38 | Year = {2023} 39 | } 40 | ``` 41 | 42 | Run EVStore 43 | ----------- 44 | 45 | Please follow the experiments detailed in [Experiments.md](experiments.md). 46 | 47 | 48 | ### Acknowledgement ### 49 | 50 | The DLRM code in this repository is based on [Facebook DLRM](https://github.com/facebookresearch/dlrm). 51 | The cache benchmark repository is based on [Cache2k](https://github.com/cache2k/cache2k) and [Cacheus](https://github.com/sylab/cacheus/). 52 | -------------------------------------------------------------------------------- /bench/dlrm_s_benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | #check if extra argument is passed to the test 8 | if [[ $# == 1 ]]; then 9 | dlrm_extra_option=$1 10 | else 11 | dlrm_extra_option="" 12 | fi 13 | #echo $dlrm_extra_option 14 | 15 | cpu=1 16 | gpu=1 17 | pt=1 18 | c2=1 19 | 20 | ncores=28 #12 #6 21 | nsockets="0" 22 | 23 | ngpus="1 2 4 8" 24 | 25 | numa_cmd="numactl --physcpubind=0-$((ncores-1)) -m $nsockets" #run on one socket, without HT 26 | dlrm_pt_bin="python dlrm_s_pytorch.py" 27 | dlrm_c2_bin="python dlrm_s_caffe2.py" 28 | 29 | data=random #synthetic 30 | print_freq=100 31 | rand_seed=727 32 | 33 | c2_net="async_scheduling" 34 | 35 | #Model param 36 | mb_size=2048 #1024 #512 #256 37 | nbatches=1000 #500 #100 38 | bot_mlp="512-512-64" 39 | top_mlp="1024-1024-1024-1" 40 | emb_size=64 41 | nindices=100 42 | emb="1000000-1000000-1000000-1000000-1000000-1000000-1000000-1000000" 43 | interaction="dot" 44 | tnworkers=0 45 | tmb_size=16384 46 | 47 | #_args="--mini-batch-size="${mb_size}\ 48 | _args=" --num-batches="${nbatches}\ 49 | " --data-generation="${data}\ 50 | " --arch-mlp-bot="${bot_mlp}\ 51 | " --arch-mlp-top="${top_mlp}\ 52 | " --arch-sparse-feature-size="${emb_size}\ 53 | " --arch-embedding-size="${emb}\ 54 | " --num-indices-per-lookup="${nindices}\ 55 | " --arch-interaction-op="${interaction}\ 56 | " --numpy-rand-seed="${rand_seed}\ 57 | " --print-freq="${print_freq}\ 58 | " --print-time"\ 59 | " --enable-profiling " 60 | 61 | c2_args=" --caffe2-net-type="${c2_net} 62 | 63 | 64 | # CPU Benchmarking 65 | if [ $cpu = 1 ]; then 66 | echo "--------------------------------------------" 67 | echo "CPU Benchmarking - running on $ncores cores" 68 | echo "--------------------------------------------" 69 | if [ $pt = 1 ]; then 70 | outf="model1_CPU_PT_$ncores.log" 71 | outp="dlrm_s_pytorch.prof" 72 | echo "-------------------------------" 73 | echo "Running PT (log file: $outf)" 74 | echo "-------------------------------" 75 | cmd="$numa_cmd $dlrm_pt_bin --mini-batch-size=$mb_size --test-mini-batch-size=$tmb_size --test-num-workers=$tnworkers $_args $dlrm_extra_option > $outf" 76 | echo $cmd 77 | eval $cmd 78 | min=$(grep "iteration" $outf | awk 'BEGIN{best=999999} {if (best > $7) best=$7} END{print best}') 79 | echo "Min time per iteration = $min" 80 | # move profiling file(s) 81 | mv $outp ${outf//".log"/".prof"} 82 | mv ${outp//".prof"/".json"} ${outf//".log"/".json"} 83 | 84 | fi 85 | if [ $c2 = 1 ]; then 86 | outf="model1_CPU_C2_$ncores.log" 87 | outp="dlrm_s_caffe2.prof" 88 | echo "-------------------------------" 89 | echo "Running C2 (log file: $outf)" 90 | echo "-------------------------------" 91 | cmd="$numa_cmd $dlrm_c2_bin --mini-batch-size=$mb_size $_args $c2_args $dlrm_extra_option 1> $outf 2> $outp" 92 | echo $cmd 93 | eval $cmd 94 | min=$(grep "iteration" $outf | awk 'BEGIN{best=999999} {if (best > $7) best=$7} END{print best}') 95 | echo "Min time per iteration = $min" 96 | # move profiling file (collected from stderr above) 97 | mv $outp ${outf//".log"/".prof"} 98 | fi 99 | fi 100 | 101 | # GPU Benchmarking 102 | if [ $gpu = 1 ]; then 103 | echo "--------------------------------------------" 104 | echo "GPU Benchmarking - running on $ngpus GPUs" 105 | echo "--------------------------------------------" 106 | for _ng in $ngpus 107 | do 108 | # weak scaling 109 | # _mb_size=$((mb_size*_ng)) 110 | # strong scaling 111 | _mb_size=$((mb_size*1)) 112 | _gpus=$(seq -s, 0 $((_ng-1))) 113 | cuda_arg="CUDA_VISIBLE_DEVICES=$_gpus" 114 | echo "-------------------" 115 | echo "Using GPUS: "$_gpus 116 | echo 
"-------------------" 117 | if [ $pt = 1 ]; then 118 | outf="model1_GPU_PT_$_ng.log" 119 | outp="dlrm_s_pytorch.prof" 120 | echo "-------------------------------" 121 | echo "Running PT (log file: $outf)" 122 | echo "-------------------------------" 123 | cmd="$cuda_arg $dlrm_pt_bin --mini-batch-size=$_mb_size --test-mini-batch-size=$tmb_size --test-num-workers=$tnworkers $_args --use-gpu $dlrm_extra_option > $outf" 124 | echo $cmd 125 | eval $cmd 126 | min=$(grep "iteration" $outf | awk 'BEGIN{best=999999} {if (best > $7) best=$7} END{print best}') 127 | echo "Min time per iteration = $min" 128 | # move profiling file(s) 129 | mv $outp ${outf//".log"/".prof"} 130 | mv ${outp//".prof"/".json"} ${outf//".log"/".json"} 131 | fi 132 | if [ $c2 = 1 ]; then 133 | outf="model1_GPU_C2_$_ng.log" 134 | outp="dlrm_s_caffe2.prof" 135 | echo "-------------------------------" 136 | echo "Running C2 (log file: $outf)" 137 | echo "-------------------------------" 138 | cmd="$cuda_arg $dlrm_c2_bin --mini-batch-size=$_mb_size $_args $c2_args --use-gpu $dlrm_extra_option 1> $outf 2> $outp" 139 | echo $cmd 140 | eval $cmd 141 | min=$(grep "iteration" $outf | awk 'BEGIN{best=999999} {if (best > $7) best=$7} END{print best}') 142 | echo "Min time per iteration = $min" 143 | # move profiling file (collected from stderr above) 144 | mv $outp ${outf//".log"/".prof"} 145 | fi 146 | done 147 | fi 148 | -------------------------------------------------------------------------------- /bench/dlrm_s_criteo_kaggle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | #WARNING: must have compiled PyTorch and caffe2 8 | 9 | #check if extra argument is passed to the test 10 | if [[ $# == 1 ]]; then 11 | dlrm_extra_option=$1 12 | else 13 | dlrm_extra_option="" 14 | fi 15 | #echo $dlrm_extra_option 16 | 17 | dlrm_pt_bin="python3 dlrm_s_pytorch.py" # python -u : so that the tqdm output will be on terminal 18 | # dlrm_c2_bin="python3 dlrm_s_caffe2.py" 19 | 20 | echo "run pytorch ..." 21 | # WARNING: the following parameters will be set based on the data set 22 | # --arch-embedding-size=... (sparse feature sizes) 23 | # --arch-mlp-bot=... (the input to the first layer of bottom mlp) 24 | $dlrm_pt_bin --arch-sparse-feature-size=36 --arch-mlp-bot="13-512-256-64-36" --arch-mlp-top="512-256-1" --data-generation=dataset --data-set=kaggle --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=128 --print-freq=1024 --print-time --test-mini-batch-size=1 --test-num-workers=0 $dlrm_extra_option 2>&1 25 | 26 | # echo "run caffe2 ..." 27 | # WARNING: the following parameters will be set based on the data set 28 | # --arch-embedding-size=... (sparse feature sizes) 29 | # --arch-mlp-bot=... (the input to the first layer of bottom mlp) 30 | # $dlrm_c2_bin --arch-sparse-feature-size=16 --arch-mlp-bot="13-512-256-64-16" --arch-mlp-top="512-256-1" --data-generation=dataset --data-set=kaggle --raw-data-file=./input/train.txt --processed-data-file=./input/kaggleAdDisplayChallenge_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=128 --print-freq=1024 --print-time $dlrm_extra_option 2>&1 | tee run_kaggle_c2.log 31 | 32 | echo "finished!" 
33 | -------------------------------------------------------------------------------- /bench/dlrm_s_criteo_kaggle_C1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | #WARNING: must have compiled PyTorch and caffe2 8 | 9 | #check if extra argument is passed to the test 10 | if [[ $# == 1 ]]; then 11 | dlrm_extra_option=$1 12 | else 13 | dlrm_extra_option="" 14 | fi 15 | # echo $dlrm_extra_option 16 | 17 | dlrm_pt_bin="python3 dlrm_s_pytorch_C1.py" 18 | # dlrm_c2_bin="python3 dlrm_s_caffe2.py" 19 | 20 | echo "run pytorch C1 ..." 21 | # WARNING: the following parameters will be set based on the data set 22 | # --arch-embedding-size=... (sparse feature sizes) 23 | # --arch-mlp-bot=... (the input to the first layer of bottom mlp) 24 | $dlrm_pt_bin --arch-sparse-feature-size=36 --arch-mlp-bot="13-512-256-64-36" --arch-mlp-top="512-256-1" --data-generation=dataset --data-set=kaggle --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=128 --print-freq=1024 --print-time --test-mini-batch-size=1 --test-num-workers=0 $dlrm_extra_option 2>&1 25 | 26 | # echo "run caffe2 ..." 27 | # WARNING: the following parameters will be set based on the data set 28 | # --arch-embedding-size=... (sparse feature sizes) 29 | # --arch-mlp-bot=... (the input to the first layer of bottom mlp) 30 | # $dlrm_c2_bin --arch-sparse-feature-size=16 --arch-mlp-bot="13-512-256-64-16" --arch-mlp-top="512-256-1" --data-generation=dataset --data-set=kaggle --raw-data-file=./input/train.txt --processed-data-file=./input/kaggleAdDisplayChallenge_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=128 --print-freq=1024 --print-time $dlrm_extra_option 2>&1 | tee run_kaggle_c2.log 31 | 32 | echo "finished!" 33 | -------------------------------------------------------------------------------- /bench/dlrm_s_criteo_kaggle_C1_C2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | #WARNING: must have compiled PyTorch and caffe2 8 | 9 | CURR_DIR=`pwd` 10 | 11 | # check if the command contains "cpp_algo_socket" 12 | if [[ $1 == *"cpp_algo_socket"* ]]; then 13 | # using socket interface 14 | echo "The CPP caching layer is started by the script below ..." 15 | echo "Will use SOCKET as the interface" 16 | 17 | cd /mnt/extra/ev-store-dlrm/mixed_precs_caching 18 | g++ -O3 evlfu_4.cpp evlfu_8.cpp evlfu_16.cpp evlfu_32.cpp aprx_embedding.cpp cache_manager.cpp -pthread; ./a.out & 19 | else 20 | # Each experiment might have different cacheSize, thus we recompile it 21 | echo "Compile the C++ shared library ... " 22 | echo "Will use Ctypes as the interface" 23 | cd /mnt/extra/ev-store-dlrm/mixed_precs_caching 24 | g++ -shared -o libcachemanager.so -fPIC -O3 evlfu_4.cpp evlfu_8.cpp evlfu_16.cpp evlfu_32.cpp aprx_embedding.cpp cache_manager.cpp -pthread; mv *.so lib/ 25 | echo "C++ shared library (*.so) is updated!" 
26 | 27 | # check if this DLRM deployment wants to use specific libcachemanager naming [To enable multi DLRM deployment] 28 | if [ -z "$2" ]; then 29 | echo "No need to rename the .so" 30 | else 31 | echo "COPY lib/libcachemanager.so -> lib/$2" # will be used by Ctypes! 32 | cp lib/libcachemanager.so lib/$2 33 | fi 34 | fi 35 | 36 | cd $CURR_DIR 37 | #check if extra argument is passed to the test 38 | if [ -z "$1" ]; then 39 | dlrm_extra_option="" 40 | else 41 | dlrm_extra_option=$1 42 | fi 43 | # echo $dlrm_extra_option 44 | 45 | dlrm_pt_bin="python3 dlrm_s_pytorch_C1_C2.py" 46 | # dlrm_c2_bin="python3 dlrm_s_caffe2.py" 47 | 48 | echo "run pytorch C1_C2 ..." 49 | # WARNING: the following parameters will be set based on the data set 50 | # --arch-embedding-size=... (sparse feature sizes) 51 | # --arch-mlp-bot=... (the input to the first layer of bottom mlp) 52 | $dlrm_pt_bin --arch-sparse-feature-size=36 --arch-mlp-bot="13-512-256-64-36" --arch-mlp-top="512-256-1" --data-generation=dataset --data-set=kaggle --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=128 --print-freq=1024 --print-time --test-mini-batch-size=1 --test-num-workers=0 $dlrm_extra_option 2>&1 53 | 54 | # echo "run caffe2 ..." 55 | # WARNING: the following parameters will be set based on the data set 56 | # --arch-embedding-size=... (sparse feature sizes) 57 | # --arch-mlp-bot=... (the input to the first layer of bottom mlp) 58 | # $dlrm_c2_bin --arch-sparse-feature-size=16 --arch-mlp-bot="13-512-256-64-16" --arch-mlp-top="512-256-1" --data-generation=dataset --data-set=kaggle --raw-data-file=./input/train.txt --processed-data-file=./input/kaggleAdDisplayChallenge_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=128 --print-freq=1024 --print-time $dlrm_extra_option 2>&1 | tee run_kaggle_c2.log 59 | 60 | echo "finished!" 61 | -------------------------------------------------------------------------------- /bench/dlrm_s_criteo_kaggle_C1_C2_C3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | #WARNING: must have compiled PyTorch and caffe2 8 | 9 | CURR_DIR=`pwd` 10 | 11 | # check if the command contains "cpp_algo_socket" 12 | if [[ $1 == *"cpp_algo_socket"* ]]; then 13 | # using socket interface 14 | echo "The CPP caching layer is started by the script below ..." 15 | echo "Will use SOCKET as the interface" 16 | 17 | cd /mnt/extra/ev-store-dlrm/mixed_precs_caching 18 | g++ -O3 evlfu_4.cpp evlfu_8.cpp evlfu_16.cpp evlfu_32.cpp aprx_embedding.cpp cache_manager.cpp -pthread; ./a.out & 19 | else 20 | # Each experiment might have different cacheSize, thus we recompile it 21 | echo "Compile the C++ shared library ... " 22 | echo "Will use Ctypes as the interface" 23 | cd /mnt/extra/ev-store-dlrm/mixed_precs_caching 24 | g++ -shared -o libcachemanager.so -fPIC -O3 evlfu_4.cpp evlfu_8.cpp evlfu_16.cpp evlfu_32.cpp aprx_embedding.cpp cache_manager.cpp -pthread; mv *.so lib/ 25 | echo "C++ shared library (*.so) is updated!" 26 | 27 | # check if this DLRM deployment wants to use specific libcachemanager naming [To enable multi DLRM deployment] 28 | if [ -z "$2" ]; then 29 | echo "No need to rename the .so" 30 | else 31 | echo "COPY lib/libcachemanager.so -> lib/$2" # will be used by Ctypes!
32 | cp lib/libcachemanager.so lib/$2 33 | fi 34 | fi 35 | 36 | cd $CURR_DIR 37 | #check if extra argument is passed to the test 38 | if [ -z "$1" ]; then 39 | dlrm_extra_option="" 40 | else 41 | dlrm_extra_option=$1 42 | fi 43 | 44 | dlrm_pt_bin="python3 dlrm_s_pytorch_C1_C2_C3.py" 45 | # dlrm_c2_bin="python3 dlrm_s_caffe2.py" 46 | 47 | echo "run pytorch C1_C2_C3 ..." 48 | # WARNING: the following parameters will be set based on the data set 49 | # --arch-embedding-size=... (sparse feature sizes) 50 | # --arch-mlp-bot=... (the input to the first layer of bottom mlp) 51 | $dlrm_pt_bin --arch-sparse-feature-size=36 --arch-mlp-bot="13-512-256-64-36" --arch-mlp-top="512-256-1" --data-generation=dataset --data-set=kaggle --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=128 --print-freq=1024 --print-time --test-mini-batch-size=1 --test-num-workers=0 $dlrm_extra_option 2>&1 52 | 53 | # echo "run caffe2 ..." 54 | # WARNING: the following parameters will be set based on the data set 55 | # --arch-embedding-size=... (sparse feature sizes) 56 | # --arch-mlp-bot=... (the input to the first layer of bottom mlp) 57 | # $dlrm_c2_bin --arch-sparse-feature-size=16 --arch-mlp-bot="13-512-256-64-16" --arch-mlp-top="512-256-1" --data-generation=dataset --data-set=kaggle --raw-data-file=./input/train.txt --processed-data-file=./input/kaggleAdDisplayChallenge_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=128 --print-freq=1024 --print-time $dlrm_extra_option 2>&1 | tee run_kaggle_c2.log 58 | 59 | echo "finished!" 60 | -------------------------------------------------------------------------------- /bench/dlrm_s_criteo_kaggle_lock_gpu_C1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | #WARNING: must have compiled PyTorch and caffe2 8 | 9 | #check if extra argument is passed to the test 10 | if [[ $# == 1 ]]; then 11 | dlrm_extra_option=$1 12 | else 13 | dlrm_extra_option="" 14 | fi 15 | # echo $dlrm_extra_option 16 | 17 | dlrm_pt_bin="python3 dlrm_s_pytorch_lock_gpu_C1.py" 18 | # dlrm_c2_bin="python3 dlrm_s_caffe2.py" 19 | 20 | echo "run pytorch C1 ..." 21 | # WARNING: the following parameters will be set based on the data set 22 | # --arch-embedding-size=... (sparse feature sizes) 23 | # --arch-mlp-bot=... (the input to the first layer of bottom mlp) 24 | $dlrm_pt_bin --arch-sparse-feature-size=36 --arch-mlp-bot="13-512-256-64-36" --arch-mlp-top="512-256-1" --data-generation=dataset --data-set=kaggle --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=128 --print-freq=1024 --print-time --test-mini-batch-size=1 --test-num-workers=0 $dlrm_extra_option 2>&1 25 | 26 | # echo "run caffe2 ..." 27 | # WARNING: the following parameters will be set based on the data set 28 | # --arch-embedding-size=... (sparse feature sizes) 29 | # --arch-mlp-bot=... 
(the input to the first layer of bottom mlp) 30 | # $dlrm_c2_bin --arch-sparse-feature-size=16 --arch-mlp-bot="13-512-256-64-16" --arch-mlp-top="512-256-1" --data-generation=dataset --data-set=kaggle --raw-data-file=./input/train.txt --processed-data-file=./input/kaggleAdDisplayChallenge_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=128 --print-freq=1024 --print-time $dlrm_extra_option 2>&1 | tee run_kaggle_c2.log 31 | 32 | echo "finished!" 33 | -------------------------------------------------------------------------------- /bench/dlrm_s_criteo_terabyte.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | #WARNING: must have compiled PyTorch and caffe2 8 | 9 | #check if extra argument is passed to the test 10 | if [[ $# == 1 ]]; then 11 | dlrm_extra_option=$1 12 | else 13 | dlrm_extra_option="" 14 | fi 15 | #echo $dlrm_extra_option 16 | 17 | dlrm_pt_bin="python dlrm_s_pytorch.py" 18 | dlrm_c2_bin="python dlrm_s_caffe2.py" 19 | 20 | echo "run pytorch ..." 21 | # WARNING: the following parameters will be set based on the data set 22 | # --arch-embedding-size=... (sparse feature sizes) 23 | # --arch-mlp-bot=... (the input to the first layer of bottom mlp) 24 | $dlrm_pt_bin --arch-sparse-feature-size=64 --arch-mlp-bot="13-512-256-64" --arch-mlp-top="512-512-256-1" --max-ind-range=10000000 --data-generation=dataset --data-set=terabyte --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=2048 --print-freq=1024 --print-time --test-mini-batch-size=16384 --test-num-workers=16 $dlrm_extra_option 2>&1 | tee run_terabyte_pt.log 25 | 26 | echo "run caffe2 ..." 27 | # WARNING: the following parameters will be set based on the data set 28 | # --arch-embedding-size=... (sparse feature sizes) 29 | # --arch-mlp-bot=... (the input to the first layer of bottom mlp) 30 | $dlrm_c2_bin --arch-sparse-feature-size=64 --arch-mlp-bot="13-512-256-64" --arch-mlp-top="512-512-256-1" --max-ind-range=10000000 --data-generation=dataset --data-set=terabyte --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=2048 --print-freq=1024 --print-time $dlrm_extra_option 2>&1 | tee run_terabyte_c2.log 31 | 32 | echo "done" 33 | -------------------------------------------------------------------------------- /bench/run_and_time.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | #WARNING: must have compiled PyTorch and caffe2 8 | 9 | #check if extra argument is passed to the test 10 | if [[ $# == 1 ]]; then 11 | dlrm_extra_option=$1 12 | else 13 | dlrm_extra_option="" 14 | fi 15 | #echo $dlrm_extra_option 16 | 17 | python dlrm_s_pytorch.py --arch-sparse-feature-size=128 --arch-mlp-bot="13-512-256-128" --arch-mlp-top="1024-1024-512-256-1" --max-ind-range=40000000 --data-generation=dataset --data-set=terabyte --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz --loss-function=bce --round-targets=True --learning-rate=1.0 --mini-batch-size=2048 --print-freq=2048 --print-time --test-freq=102400 --test-mini-batch-size=16384 --test-num-workers=16 --memory-map --mlperf-logging --mlperf-auc-threshold=0.8025 --mlperf-bin-loader --mlperf-bin-shuffle $dlrm_extra_option 2>&1 | tee run_terabyte_mlperf_pt.log 18 | 19 | echo "done" 20 | -------------------------------------------------------------------------------- /cache_algo/EvLFU_C1_Cython/EvLFU.cpython-36m-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucare-uchicago/ev-store-dlrm/0954b2cb26a7e4ad1dddcdc3f98480e7d7e16ab5/cache_algo/EvLFU_C1_Cython/EvLFU.cpython-36m-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /cache_algo/EvLFU_C1_Cython/EvLFU.pyx: -------------------------------------------------------------------------------- 1 | from libcpp.vector cimport vector 2 | from libcpp cimport bool 3 | from libcpp.string cimport string 4 | 5 | cdef extern from "evlfu.hpp": 6 | void init(int capacity) 7 | void request_to_ev_lfu(vector[int] &group_keys, vector[bool] &arr_record_hit, vector[vector[float]] &arr_emb_weights, bool use_gpu) 8 | void load_ev_tables() 9 | void close_ev_tables() 10 | 11 | def cinit(int capacity): 12 | init(capacity) 13 | 14 | def crequest(vector[int] group_keys, use_gpu): 15 | cdef vector[bool] arr_record_hit = [True] * 26 16 | cdef vector[vector[float]] arr_emb_weights = [[0.0]*36]*26 17 | request_to_ev_lfu(group_keys, arr_record_hit, arr_emb_weights, use_gpu) 18 | return arr_record_hit, arr_emb_weights 19 | 20 | 21 | def cload_ev_tables(): 22 | load_ev_tables() 23 | 24 | def cclose_ev_tables(): 25 | close_ev_tables() -------------------------------------------------------------------------------- /cache_algo/EvLFU_C1_Cython/evlfu.hpp: -------------------------------------------------------------------------------- 1 | 2 | #ifndef EVLFU_H_INCLUDED 3 | #define EVLFU_H_INCLUDED 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | using namespace std; 14 | 15 | struct Cache_data 16 | { 17 | Cache_data(vector ev = vector(0), int agg_hit = 0) 18 | { 19 | this->embedding_value = ev; 20 | this->agg_hit = agg_hit; 21 | } 22 | vector embedding_value; 23 | int agg_hit; 24 | }; 25 | 26 | void init(int capacity); 27 | void request_to_ev_lfu(vector &group_keys, vector &arr_record_hit, vector> &arr_emb_weights, bool use_gpu); 28 | void load_ev_tables(); 29 | void close_ev_tables(); 30 | 31 | #endif -------------------------------------------------------------------------------- /cache_algo/EvLFU_C1_Cython/script.sh: -------------------------------------------------------------------------------- 1 | python setup_EvLFU.py build_ext --inplace 2 | python test.py -------------------------------------------------------------------------------- /cache_algo/EvLFU_C1_Cython/setup_EvLFU.py: 
-------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from Cython.Build import cythonize 3 | from distutils.extension import Extension 4 | from Cython.Distutils import build_ext 5 | # extensions = [ 6 | # Extension('EvLFU', ['EvLFU.pyx', 'evlfu_v2.cpp'], 7 | # extra_compile_args=['-std=c++11'], 8 | # language='c++' 9 | # ), 10 | # ] 11 | 12 | # setup( 13 | # ext_modules=cythonize(extensions), 14 | # # extra_compile_args=["-w", '-g'], 15 | # # extra_compile_args=["-O3"], 16 | # ) 17 | 18 | ext_modules = [Extension("EvLFU", ["EvLFU.pyx", "evlfu.cpp"], language='c++',)] 19 | 20 | setup(cmdclass = {'build_ext': build_ext}, ext_modules = ext_modules) -------------------------------------------------------------------------------- /cache_algo/EvLFU_C1_Cython/test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | import numpy as np 4 | import pandas as pd 5 | import time 6 | import random 7 | import EvLFU 8 | 9 | 10 | workload_dir = "/home/cc/workload/Archive-new-1.0M/" 11 | workload_files = [] 12 | for i in range(1, 27): 13 | workload_files.append("workload-group-" + str(i) + ".csv") 14 | arrRawWorkload = [] 15 | # read all workloads: 16 | for workload_file in workload_files: 17 | workload = np.asarray(pd.read_csv(workload_dir + workload_file).values[:, 0]) 18 | arrRawWorkload.append(workload) 19 | 20 | arrRawWorkload = np.asarray(arrRawWorkload) 21 | # print(arrRawWorkload.shape) 22 | # merge the workloads 23 | arrMergedWorkload = np.stack(arrRawWorkload, axis=1) 24 | groupedWorkloadKeys = arrMergedWorkload 25 | print(arrMergedWorkload.shape) 26 | print("Done merging ALL workloads: total = ", arrRawWorkload.shape[0], 'rows') 27 | 28 | # Run the alg: 29 | EvLFU.cinit(768) 30 | prefectHit = 0 31 | 32 | groupedWorkloadIds = [] 33 | 34 | for groupKeys in groupedWorkloadKeys: 35 | groupKeys = groupKeys.tolist() 36 | for i in range(26): 37 | groupKeys[i] = int(groupKeys[i].split('-')[1]) 38 | 39 | groupedWorkloadIds.append(groupKeys) 40 | 41 | EvLFU.cload_ev_tables() 42 | start_time = time.time() 43 | 44 | for group_row_ids in groupedWorkloadIds: 45 | # print(type(groupKeys)) 46 | # print(type(groupKeys[0])) 47 | aggHitMissRecord, x = EvLFU.crequest(group_row_ids, False) 48 | # print(aggHitMissRecord) 49 | # print(x) 50 | # exit(0) 51 | flag = True 52 | for isHit in aggHitMissRecord: 53 | if not isHit: 54 | flag = False 55 | break 56 | if flag: 57 | prefectHit += 1 58 | print("perfect hit:", prefectHit) 59 | print(time.time() - start_time) 60 | EvLFU.cclose_ev_tables() -------------------------------------------------------------------------------- /cache_algo/LFU.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import torch 3 | import sys 4 | sys.path.append('emb_storage') 5 | import storage_manager 6 | cap = -1 7 | least_freq = 1 8 | # to store the frequency of the keys 9 | node_for_freq = [] 10 | # to search the key within all the cached keys 11 | node_for_key = dict() 12 | 13 | def init(capacity): 14 | global cap 15 | cap = capacity 16 | node_for_freq.append(0)# For the frequency == 0 : USELESS 17 | node_for_freq.append([]) # For the frequency == 1 18 | 19 | def _update( key, value, freq): 20 | # increment the frequency 21 | global node_for_key, node_for_freq, least_freq 22 | # remove the key from the old frequency 23 | node_for_freq[freq].remove(key) 24 | 25 | if len(node_for_freq[least_freq]) == 
0: 26 | # update the least_freq if there is no more item in this frequency list 27 | # node_for_freq.pop(least_freq) # remove this empty freq list, to save memory 28 | least_freq += 1 29 | 30 | # update frequency 31 | node_for_key[key][1] = freq + 1 32 | # use a +1 because the idx 0 is not used 33 | if ((freq + 1) == len(node_for_freq)): 34 | node_for_freq.append([]) 35 | node_for_freq[freq + 1].append(key) 36 | 37 | def set( key, value): 38 | global node_for_key, node_for_freq, cap, least_freq 39 | 40 | # check if full 41 | if (len(node_for_key) >= cap): 42 | # evict 1 item 43 | key_to_remove = node_for_freq[least_freq].pop(0) 44 | node_for_key.pop(key_to_remove) 45 | 46 | # Insert the new item 47 | node_for_key[key] = [value, 1] 48 | node_for_freq[1].append(key) 49 | 50 | # update least freq 51 | least_freq = 1 52 | 53 | def request(key, table_id, row_id): 54 | global node_for_key, node_for_freq 55 | # check if the key is cached 56 | if key in node_for_key: 57 | # Yes, get the value 58 | value, freq = node_for_key[key] 59 | # Update item's frequency 60 | _update(key, value, freq) 61 | return value, True 62 | else: 63 | # MISS: get value from secondary storage 64 | value = storage_manager.get_val_from_storage(table_id, row_id) 65 | set(key, value) 66 | return value, False 67 | 68 | # Multi keys request 69 | def request_to_lfu( group_row_ids, use_gpu = False): 70 | arr_record_hit = [] 71 | arr_emb_weights = [] 72 | agg_hit = 0 73 | if (use_gpu): 74 | # This code assume that we only run this on a single GPU node 75 | device = torch.device("cuda:0") 76 | 77 | for i, row_id in enumerate(group_row_ids): 78 | # Table_id is started at 1 79 | # Key for row3 of table1 is 1-3 80 | key = str(i+1) + "-" + str(row_id) 81 | val, is_hit = request(key, i+1, row_id) 82 | # convert list of embedding values to tensor 83 | ev_tensor = torch.FloatTensor([val]) # val is a python list 84 | ev_tensor.requires_grad = True 85 | if (use_gpu): 86 | # This code assume that we only run this on a single GPU node 87 | ev_tensor = ev_tensor.to(device) 88 | arr_emb_weights.append(ev_tensor) 89 | if is_hit: 90 | arr_record_hit.append(True) 91 | agg_hit += 1 92 | else: 93 | arr_record_hit.append(False) 94 | 95 | return arr_record_hit, arr_emb_weights 96 | 97 | -------------------------------------------------------------------------------- /cache_algo/LRU.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import torch 3 | import sys 4 | sys.path.append('emb_storage') 5 | import storage_manager 6 | 7 | cap = -1 8 | LRUCache = collections.OrderedDict()# each item is a dictionary embedding value 9 | 10 | def init(capacity): 11 | global cap 12 | cap = capacity 13 | 14 | # Inserting the NEW key 15 | def set(key, value): 16 | global LRUCache, cap 17 | if (len(LRUCache) >= cap): 18 | # evicting the first key in LRU list 19 | evict_key, evict_val = LRUCache.popitem(last=False) 20 | # inserting new key 21 | LRUCache[key] = value 22 | 23 | # single key request 24 | def request(key, table_id, row_id): 25 | global LRUCache 26 | if (key in LRUCache): 27 | value = LRUCache[key] 28 | # Update position of the hit item to first. Optional. 
29 | LRUCache.move_to_end(key, last=True) 30 | return value, True 31 | else: 32 | # MISS: get value from secondary storage 33 | value = storage_manager.get_val_from_storage(table_id, row_id) 34 | set(key, value) 35 | return value, False 36 | 37 | # Multi keys request 38 | def request_to_lru( group_row_ids, use_gpu = False): 39 | arr_record_hit = [] 40 | arr_emb_weights = [] 41 | agg_hit = 0 42 | if (use_gpu): 43 | # This code assume that we only run this on a single GPU node 44 | device = torch.device("cuda:0") 45 | 46 | for i, row_id in enumerate(group_row_ids): 47 | # Table_id is started at 1 48 | # Key for row3 of table1 is 1-3 49 | key = str(i+1) + "-" + str(row_id) 50 | val, is_hit = request(key, i+1, row_id) 51 | # convert list of embedding values to tensor 52 | ev_tensor = torch.FloatTensor([val]) # val is a python list 53 | ev_tensor.requires_grad = True 54 | if (use_gpu): 55 | # This code assume that we only run this on a single GPU node 56 | ev_tensor = ev_tensor.to(device) 57 | arr_emb_weights.append(ev_tensor) 58 | if is_hit: 59 | arr_record_hit.append(True) 60 | agg_hit += 1 61 | else: 62 | arr_record_hit.append(False) 63 | 64 | return arr_record_hit, arr_emb_weights 65 | 66 | -------------------------------------------------------------------------------- /cache_algo/old_versions/EvLFU_C1_v0.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import torch 3 | import sys 4 | sys.path.append('emb_storage') 5 | import storage_manager 6 | 7 | ###########################################EvLFU########################################## 8 | cap_C1 = 500 9 | min_C1 = 0 10 | vals_C1 = dict() 11 | counts_C1 = dict() 12 | lists_C1 = dict() 13 | lists_C1[0] = [] 14 | # flushing part: 15 | nPerfectItem_C1 = 0 16 | flushRate_C1 = 0.4 17 | perfectItemCapacity_C1 = 1.0 18 | 19 | def init(): 20 | pass 21 | 22 | def set(key, value, aggHit): 23 | global cap_C1, min_C1, vals_C1, counts_C1, lists_C1, nPerfectItem_C1, flushRate_C1, perfectItemCapacity_C1 24 | if cap_C1 <= 0: 25 | return 26 | if vals_C1.get(key) is not None: 27 | vals_C1[key] = value 28 | get_val_from_mem(key, aggHit) 29 | return 30 | 31 | # Flushing: 32 | if nPerfectItem_C1 >= int(cap_C1 * perfectItemCapacity_C1): 33 | # print("flushing!") 34 | for i in range(0, int(flushRate_C1 * cap_C1) + 1): 35 | evictKey = lists_C1.get(26)[0] 36 | lists_C1.get(26).remove(evictKey) 37 | vals_C1.pop(evictKey) 38 | counts_C1.pop(evictKey) 39 | 40 | nPerfectItem_C1 = len(lists_C1.get(26)) 41 | if len(vals_C1) < cap_C1: 42 | min_C1 = aggHit 43 | 44 | # key allows to insert in the cache: 45 | if len(vals_C1) >= cap_C1: 46 | evictKey = lists_C1.get(min_C1)[0] # TODO: Use pop!! 
47 | # print("lists_C1.get(min_C1 = " + str(min_C1) + ") = " + str(lists_C1.get(min_C1))) 48 | lists_C1.get(min_C1).remove(evictKey) 49 | try: 50 | vals_C1.pop(evictKey) 51 | except: 52 | print("KeyError when vals_C1.pop key =" + str(evictKey)) 53 | print(vals_C1.keys()) 54 | print("cap_C1 " + str(cap_C1)) 55 | print(lists_C1.get(min_C1)) 56 | exit(-1) 57 | try: 58 | counts_C1.pop(evictKey) 59 | except: 60 | print("KeyError when counts_C1.pop key =" + str(evictKey)) 61 | exit(-1) 62 | 63 | # If the key is new, insert the value: 64 | vals_C1[key] = value 65 | counts_C1[key] = aggHit 66 | 67 | if lists_C1.get(aggHit) is None: 68 | lists_C1[aggHit] = [] 69 | lists_C1 = dict(sorted(lists_C1.items())) 70 | # if (key in lists_C1[aggHit]): 71 | # print("aggHit = " + str(aggHit)) 72 | # print(lists_C1[aggHit]) 73 | # print("ERROR 1: key already in lists, no need to append " + key) 74 | # exit(-1) 75 | lists_C1.get(aggHit).append(key) # ======== 76 | 77 | # Update minimum agghit 78 | if aggHit < min_C1: 79 | min_C1 = aggHit 80 | while (lists_C1.get(min_C1) is None) or len(lists_C1.get(min_C1)) == 0: 81 | min_C1 += 1 82 | 83 | def get_val_from_mem(key, aggHit): # Get From Mem 84 | global cap_C1, min_C1, vals_C1, counts_C1, lists_C1, nPerfectItem_C1, flushRate_C1, perfectItemCapacity_C1 85 | if vals_C1.get(key) is None: 86 | return None 87 | count = counts_C1.get(key) 88 | newCount = count 89 | if count < aggHit: 90 | newCount = aggHit 91 | counts_C1[key] = newCount 92 | lists_C1.get(count).remove(key) 93 | 94 | if count == min_C1: 95 | while (lists_C1.get(min_C1) is None) or len(lists_C1.get(min_C1)) == 0: 96 | min_C1 += 1 97 | if lists_C1.get(newCount) is None: 98 | lists_C1[newCount] = [] 99 | lists_C1 = dict(sorted(lists_C1.items())) 100 | # if (key in lists_C1[newCount]): 101 | # print("newCount = " + str(newCount)) 102 | # print(lists_C1[newCount]) 103 | # print("ERROR 3: key already in lists, no need to append " + key) 104 | # exit(-1) 105 | lists_C1.get(newCount).append(key) # ======== 106 | return vals_C1[key] 107 | 108 | def update(key, tableId, rowId, aggHit, nGroup): 109 | # Get value from EV-LFU cache 110 | val = get_val_from_mem(key, aggHit) 111 | if val is None: 112 | # On MISS 113 | # Get value from secondary storage 114 | # ADDING IF CONDITION HERE IS SLOW! 
115 | # if (storage_manager.storage_type == storage_manager.EmbStorage.DUMMY): 116 | # Dummy storage will always use tableid + rowId because the data are stored in 26 tables 117 | val = storage_manager.get_val_from_storage(tableId, rowId) 118 | # else : 119 | # faster for rocksdb 120 | # val = storage_manager.get_val_from_storage_by_key(key) #only on rocksdb 121 | set(key, val, aggHit) 122 | return val 123 | 124 | def request_to_ev_lfu( group_rowIds, use_gpu = False): 125 | recordHitOrMiss = [] 126 | group_keys = [] 127 | missing_keys = [] 128 | emb_weights = [] 129 | aggHit = 0 130 | global cap_C1, min_C1, vals_C1, counts_C1, lists_C1, nPerfectItem_C1, flushRate_C1, perfectItemCapacity_C1 131 | for i, rowId in enumerate(group_rowIds): 132 | # TableId is started at 1 133 | # Key for row3 of table1 is 1-3 134 | key = str(i+1) + "-" + str(rowId) 135 | group_keys.append(key) 136 | if vals_C1.get(key) is not None: 137 | recordHitOrMiss.append(True) 138 | aggHit += 1 139 | else: 140 | missing_keys.append(key) 141 | recordHitOrMiss.append(False) 142 | 143 | # TODO: Get the missing keys from storage 144 | # missing_keys 145 | 146 | if (use_gpu): 147 | # This code assume that we only run this on a single GPU node 148 | device = torch.device("cuda:0") 149 | 150 | for i, rowId in enumerate(group_rowIds): 151 | # The tableId is started at 1 instead of 0 152 | val = update(group_keys[i], i + 1, rowId, aggHit, len(recordHitOrMiss)) # the data could either come from EV-LFU or MemStor or PyrocksDB 153 | # convert list of embedding values to tensor 154 | ev_tensor = torch.FloatTensor([val]) # val is a python list 155 | ev_tensor.requires_grad = True 156 | if (use_gpu): 157 | ev_tensor = ev_tensor.to(device) 158 | emb_weights.append(ev_tensor) 159 | 160 | if lists_C1.get(26) and not len(lists_C1.get(26)) == 0: 161 | nPerfectItem_C1 = len(lists_C1.get(26)) 162 | 163 | return recordHitOrMiss, emb_weights 164 | -------------------------------------------------------------------------------- /cache_algo/old_versions/EvLFU_C1_v1.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | sys.path.append('emb_storage') 4 | import storage_manager 5 | import random 6 | 7 | ###########################################EvLFU########################################## 8 | cap_C1 = -1 9 | min_C1 = 0 10 | vals_C1 = dict() # each value is [embedding value, agg_hit] 11 | lists_C1 = dict() # will group the keys based on the agg_hit (count) 12 | # flushing part: 13 | n_perfect_item_C1 = 0 14 | flush_rate_C1 = 0.4 15 | perfect_item_cap_C1 = 1.0 16 | max_perfect_item_C1 = 0 17 | 18 | def init(capacity): 19 | global lists_C1, max_perfect_item_C1, perfect_item_cap_C1, cap_C1 20 | cap_C1 = capacity 21 | # initializing the dict 22 | i = 0 23 | while (i <= 26): 24 | lists_C1[i] = [] 25 | i += 1 26 | max_perfect_item_C1 = int(cap_C1 * perfect_item_cap_C1) 27 | 28 | # Inserting the NEW key 29 | def set(key, value, agg_hit): 30 | global cap_C1, min_C1, vals_C1, lists_C1, n_perfect_item_C1, max_perfect_item_C1, flush_rate_C1 31 | 32 | # Flushing: 33 | if n_perfect_item_C1 >= max_perfect_item_C1: 34 | print("flushing!") 35 | print("n_perfect_item_C1 = " + str(n_perfect_item_C1)) 36 | print("max_perfect_item_C1 = " + str(max_perfect_item_C1)) 37 | for i in range(0, int(flush_rate_C1 * cap_C1) + 1): 38 | key_to_evict = lists_C1[26].pop(0) 39 | vals_C1.pop(key_to_evict) 40 | # adjust the n_perfect_item counter 41 | n_perfect_item_C1 = len(lists_C1[26]) 42 | else: 43 | # cache is 
full 44 | if len(vals_C1) >= cap_C1: 45 | # make a space for the new key 46 | while(lists_C1[min_C1] == []): 47 | # find the right key to pop 48 | # Update minimum agg_hit 49 | min_C1 += 1 50 | if (min_C1 > 26): 51 | min_C1 = 1 52 | key_to_evict = lists_C1[min_C1].pop(0) 53 | vals_C1.pop(key_to_evict) 54 | 55 | # insert the new value: 56 | vals_C1[key] = [value, agg_hit] 57 | lists_C1[agg_hit].append(key) # ======== 58 | 59 | if agg_hit < min_C1: 60 | min_C1 = agg_hit 61 | 62 | def update_agg_hit(key, agg_hit): # Get From Mem 63 | global vals_C1, lists_C1 64 | ev_vals = vals_C1.get(key) 65 | if ev_vals is None: 66 | return None 67 | # old_agg_hit = ev_vals[1] 68 | if ev_vals[1] < agg_hit: 69 | # update the old agg_hit 70 | lists_C1[ev_vals[1]].remove(key) 71 | lists_C1[agg_hit].append(key) # ======== 72 | vals_C1[key][1] = agg_hit 73 | # Increase the min_freq if the current lists freq is [] 74 | # Nope, the new aggHit can jump, No need to do anything 75 | return ev_vals[0] 76 | 77 | # Updating the existing keys and inserting the missing keys 78 | def update(key, table_id, row_id, agg_hit, missing_value = None): 79 | # TODO: This can be done in multi threaded way (on Java and C++) 80 | # Get value from EV-LFU cache 81 | val = update_agg_hit(key, agg_hit) 82 | if val: 83 | return val 84 | else: 85 | # On MISS: Get value from secondary storage 86 | # DON't put "IF CONDITION" HERE! IT IS SLOW! 87 | if missing_value is None: 88 | # this key might be kicked out while inserting previous key 89 | missing_value = storage_manager.get_val_from_storage(table_id, row_id) 90 | # missing_value = storage_manager.get_val_from_storage_by_key(key) #only on rocksdb 91 | set(key, missing_value, agg_hit) 92 | return missing_value 93 | 94 | def request_to_ev_lfu( group_row_ids, use_gpu = False, approx_emb_thres = -1, ev_dim = 36): 95 | arr_record_hit = [] 96 | arr_group_keys = [] 97 | arr_missing_keys = [] 98 | arr_missing_values = [] 99 | arr_emb_weights = [] 100 | pick_random_ev = False 101 | agg_hit = 0 102 | global vals_C1, lists_C1, n_perfect_item_C1 103 | for i, row_id in enumerate(group_row_ids): 104 | # Table_id is started at 1 105 | # Key for row3 of table1 is 1-3 106 | key = str(i+1) + "-" + str(row_id) 107 | arr_group_keys.append(key) 108 | if key in vals_C1.keys(): 109 | arr_record_hit.append(True) 110 | agg_hit += 1 111 | else: 112 | arr_missing_keys.append([i+1, row_id]) 113 | arr_record_hit.append(False) 114 | 115 | # Get all missing keys from storage at once 116 | arr_missing_values = storage_manager.get_arr_val_from_storage(arr_missing_keys) 117 | 118 | if (use_gpu): 119 | # This code assume that we only run this on a single GPU node 120 | device = torch.device("cuda:0") 121 | 122 | # Update 123 | for i, row_id in enumerate(group_row_ids): 124 | # TODO: C++ and java code should do this in multithreaded way 125 | # The table_id is started at 1 instead of 0 126 | if (arr_record_hit[i]): 127 | val = update(arr_group_keys[i], i + 1, row_id, agg_hit) 128 | else: 129 | # plug the missing values that we get from secondary storage 130 | val = update(arr_group_keys[i], i + 1, row_id, agg_hit, arr_missing_values.pop(0)) 131 | # convert list of embedding values to tensor 132 | ev_tensor = torch.FloatTensor([val]) # val is a python list 133 | ev_tensor.requires_grad = True 134 | if (use_gpu): 135 | ev_tensor = ev_tensor.to(device) 136 | arr_emb_weights.append(ev_tensor) 137 | 138 | if agg_hit == 26: 139 | # update the number of perfect item 140 | n_perfect_item_C1 = len(lists_C1[26]) 141 | return 
arr_record_hit, arr_emb_weights 142 | -------------------------------------------------------------------------------- /cache_algo/old_versions/LFU_v0.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import torch 3 | import sys 4 | sys.path.append('emb_storage') 5 | import storage_manager 6 | cap = -1 7 | least_freq = 1 8 | node_for_freq = collections.defaultdict(collections.OrderedDict) 9 | node_for_key = dict() 10 | 11 | def init(capacity): 12 | global cap 13 | cap = capacity 14 | 15 | def _update( key, value): 16 | global node_for_key, node_for_freq, least_freq 17 | _, freq = node_for_key[key] 18 | node_for_freq[freq].pop(key) 19 | if len(node_for_freq[least_freq]) == 0: 20 | least_freq += 1 21 | node_for_freq[freq+1][key] = (value, freq+1) 22 | node_for_key[key] = (value, freq+1) 23 | 24 | def set( key, value): 25 | global node_for_key, node_for_freq, cap, least_freq 26 | if (len(node_for_key) >= cap): 27 | # evict 1 item 28 | removed = node_for_freq[least_freq].popitem(last=False) 29 | node_for_key.pop(removed[0]) 30 | # Insert the new item 31 | node_for_key[key] = (value,1) 32 | node_for_freq[1][key] = (value,1) 33 | 34 | def request(key, table_id, row_id): 35 | global node_for_key, node_for_freq 36 | if key in node_for_key: 37 | value = node_for_key[key][0] 38 | # Update item's frequency 39 | _update(key, value) 40 | return value, True 41 | else: 42 | # MISS: get value from secondary storage 43 | value = storage_manager.get_val_from_storage(table_id, row_id) 44 | set(key, value) 45 | return value, False 46 | 47 | # Multi keys request 48 | def request_to_lfu( group_row_ids, use_gpu = False): 49 | arr_record_hit = [] 50 | arr_emb_weights = [] 51 | agg_hit = 0 52 | if (use_gpu): 53 | # This code assume that we only run this on a single GPU node 54 | device = torch.device("cuda:0") 55 | 56 | for i, row_id in enumerate(group_row_ids): 57 | # Table_id is started at 1 58 | # Key for row3 of table1 is 1-3 59 | key = str(i+1) + "-" + str(row_id) 60 | val, is_hit = request(key, i+1, row_id) 61 | # convert list of embedding values to tensor 62 | ev_tensor = torch.FloatTensor([val]) # val is a python list 63 | ev_tensor.requires_grad = True 64 | if (use_gpu): 65 | # This code assume that we only run this on a single GPU node 66 | ev_tensor = ev_tensor.to(device) 67 | arr_emb_weights.append(ev_tensor) 68 | if is_hit: 69 | arr_record_hit.append(True) 70 | agg_hit += 1 71 | else: 72 | arr_record_hit.append(False) 73 | 74 | return arr_record_hit, arr_emb_weights 75 | -------------------------------------------------------------------------------- /cache_algo/old_versions/LFU_v1.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import torch 3 | import sys 4 | sys.path.append('emb_storage') 5 | import storage_manager 6 | cap = -1 7 | least_freq = 1 8 | # to store the frequency of the keys 9 | node_for_freq = dict() 10 | # to search the key within all the cached keys 11 | node_for_key = dict() 12 | 13 | def init(capacity): 14 | global cap 15 | cap = capacity 16 | node_for_freq[1] = collections.OrderedDict() # For the frequency == 1 17 | 18 | def _update( key, value, freq): 19 | # increment the frequency 20 | global node_for_key, node_for_freq, least_freq 21 | # remove the key from the old frequency 22 | node_for_freq[freq].pop(key) 23 | 24 | if len(node_for_freq[least_freq]) == 0: 25 | # update the least_freq if there is no more item in this frequency list 26 | least_freq += 1 
27 | node_for_freq.pop(least_freq) 28 | 29 | # update frequency 30 | node_for_key[key][1] = freq + 1 31 | if ((freq + 1) not in node_for_freq.keys()): 32 | node_for_freq[freq + 1] = collections.OrderedDict() 33 | node_for_freq[freq + 1][key] = "" 34 | 35 | def set( key, value): 36 | global node_for_key, node_for_freq, cap, least_freq 37 | 38 | # check if full 39 | if (len(node_for_key) >= cap): 40 | # evict 1 item 41 | key_to_remove = node_for_freq[least_freq].popitem(last=False) 42 | node_for_key.pop(key_to_remove) 43 | 44 | # Insert the new item 45 | node_for_key[key] = [value, 1] 46 | node_for_freq[1][key] = "" 47 | 48 | # update least freq 49 | least_freq = 1 50 | 51 | def request(key, table_id, row_id): 52 | global node_for_key, node_for_freq 53 | # check if the key is cached 54 | if key in node_for_key: 55 | # Yes, get the value 56 | value, freq = node_for_key[key] 57 | # Update item's frequency 58 | _update(key, value, freq) 59 | return value, True 60 | else: 61 | # MISS: get value from secondary storage 62 | value = storage_manager.get_val_from_storage(table_id, row_id) 63 | set(key, value) 64 | return value, False 65 | 66 | # Multi keys request 67 | def request_to_lfu( group_row_ids, use_gpu = False): 68 | arr_record_hit = [] 69 | arr_emb_weights = [] 70 | agg_hit = 0 71 | if (use_gpu): 72 | # This code assume that we only run this on a single GPU node 73 | device = torch.device("cuda:0") 74 | 75 | for i, row_id in enumerate(group_row_ids): 76 | # Table_id is started at 1 77 | # Key for row3 of table1 is 1-3 78 | key = str(i+1) + "-" + str(row_id) 79 | val, is_hit = request(key, i+1, row_id) 80 | # convert list of embedding values to tensor 81 | ev_tensor = torch.FloatTensor([val]) # val is a python list 82 | ev_tensor.requires_grad = True 83 | if (use_gpu): 84 | # This code assume that we only run this on a single GPU node 85 | ev_tensor = ev_tensor.to(device) 86 | arr_emb_weights.append(ev_tensor) 87 | if is_hit: 88 | arr_record_hit.append(True) 89 | agg_hit += 1 90 | else: 91 | arr_record_hit.append(False) 92 | 93 | return arr_record_hit, arr_emb_weights 94 | 95 | -------------------------------------------------------------------------------- /cache_algo/old_versions/LFU_v2.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import torch 3 | import sys 4 | sys.path.append('emb_storage') 5 | import storage_manager 6 | cap = -1 7 | least_freq = 1 8 | # to store the frequency of the keys 9 | node_for_freq = dict() 10 | # to search the key within all the cached keys 11 | node_for_key = dict() 12 | 13 | def init(capacity): 14 | global cap 15 | cap = capacity 16 | node_for_freq[1] = [] # For the frequency == 1 17 | 18 | def _update( key, value, freq): 19 | # increment the frequency 20 | global node_for_key, node_for_freq, least_freq 21 | # remove the key from the old frequency 22 | node_for_freq[freq].remove(key) 23 | 24 | if len(node_for_freq[least_freq]) == 0: 25 | # update the least_freq if there is no more item in this frequency list 26 | node_for_freq.pop(least_freq) # remove this empty freq list, to save memory 27 | least_freq += 1 28 | 29 | # update frequency 30 | node_for_key[key][1] = freq + 1 31 | if ((freq + 1) not in node_for_freq.keys()): 32 | node_for_freq[freq + 1] = [] 33 | node_for_freq[freq + 1].append(key) 34 | 35 | def set( key, value): 36 | global node_for_key, node_for_freq, cap, least_freq 37 | 38 | # check if full 39 | if (len(node_for_key) >= cap): 40 | # evict 1 item 41 | key_to_remove = 
node_for_freq[least_freq].pop(0) 42 | node_for_key.pop(key_to_remove) 43 | 44 | # Insert the new item 45 | node_for_key[key] = [value, 1] 46 | node_for_freq[1].append(key) 47 | 48 | # update least freq 49 | least_freq = 1 50 | 51 | def request(key, table_id, row_id): 52 | global node_for_key, node_for_freq 53 | # check if the key is cached 54 | if key in node_for_key: 55 | # Yes, get the value 56 | value, freq = node_for_key[key] 57 | # Update item's frequency 58 | _update(key, value, freq) 59 | return value, True 60 | else: 61 | # MISS: get value from secondary storage 62 | value = storage_manager.get_val_from_storage(table_id, row_id) 63 | set(key, value) 64 | return value, False 65 | 66 | # Multi keys request 67 | def request_to_lfu( group_row_ids, use_gpu = False): 68 | arr_record_hit = [] 69 | arr_emb_weights = [] 70 | agg_hit = 0 71 | if (use_gpu): 72 | # This code assume that we only run this on a single GPU node 73 | device = torch.device("cuda:0") 74 | 75 | for i, row_id in enumerate(group_row_ids): 76 | # Table_id is started at 1 77 | # Key for row3 of table1 is 1-3 78 | key = str(i+1) + "-" + str(row_id) 79 | val, is_hit = request(key, i+1, row_id) 80 | # convert list of embedding values to tensor 81 | ev_tensor = torch.FloatTensor([val]) # val is a python list 82 | ev_tensor.requires_grad = True 83 | if (use_gpu): 84 | # This code assume that we only run this on a single GPU node 85 | ev_tensor = ev_tensor.to(device) 86 | arr_emb_weights.append(ev_tensor) 87 | if is_hit: 88 | arr_record_hit.append(True) 89 | agg_hit += 1 90 | else: 91 | arr_record_hit.append(False) 92 | 93 | return arr_record_hit, arr_emb_weights 94 | 95 | -------------------------------------------------------------------------------- /cache_algo/old_versions/LRU_v0.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import torch 3 | import sys 4 | sys.path.append('emb_storage') 5 | import storage_manager 6 | from functools import lru_cache 7 | 8 | cap = -1 9 | vals = dict() # each item is a dictionary embedding value 10 | lru_list = [] # store the order of the keys in LRU manner 11 | 12 | def init(capacity): 13 | global cap 14 | cap = capacity 15 | 16 | # Inserting the NEW key 17 | def set(key, value): 18 | global vals, lru_list, cap 19 | 20 | if (len(lru_list) >= cap): 21 | # evicting the first key in LRU list 22 | vals.pop(lru_list.pop(0)) 23 | 24 | # inserting new key 25 | vals[key] = value 26 | lru_list.append(key) 27 | 28 | # single key request 29 | def request(key, table_id, row_id): 30 | global vals, lru_list 31 | is_hit = False 32 | value = vals.get(key) 33 | if (value == None): 34 | # MISS: get value from secondary storage 35 | value = storage_manager.get_val_from_storage(table_id, row_id) 36 | set(key, value) 37 | else: 38 | # update the position; put it in the back of the Q 39 | lru_list.remove(key) 40 | lru_list.append(key) 41 | is_hit = True 42 | return value, is_hit 43 | 44 | @lru_cache(maxsize=5000) 45 | def request_memoization(key, table_id, row_id): 46 | # MISS: get value from secondary storage 47 | value = storage_manager.get_val_from_storage(table_id, row_id) 48 | set(key, value) 49 | return value, False 50 | 51 | # Multi keys request 52 | def request_to_lru( group_row_ids, use_gpu = False): 53 | arr_record_hit = [] 54 | arr_emb_weights = [] 55 | agg_hit = 0 56 | if (use_gpu): 57 | # This code assume that we only run this on a single GPU node 58 | device = torch.device("cuda:0") 59 | 60 | for i, row_id in enumerate(group_row_ids): 
61 | # Table_id is started at 1 62 | # Key for row3 of table1 is 1-3 63 | key = str(i+1) + "-" + str(row_id) 64 | # val, is_hit = request_memoization(key, i+1, row_id) 65 | val, is_hit = request(key, i+1, row_id) 66 | # convert list of embedding values to tensor 67 | ev_tensor = torch.FloatTensor([val]) # val is a python list 68 | ev_tensor.requires_grad = True 69 | if (use_gpu): 70 | # This code assume that we only run this on a single GPU node 71 | ev_tensor = ev_tensor.to(device) 72 | arr_emb_weights.append(ev_tensor) 73 | if is_hit: 74 | arr_record_hit.append(True) 75 | agg_hit += 1 76 | else: 77 | arr_record_hit.append(False) 78 | 79 | return arr_record_hit, arr_emb_weights 80 | 81 | -------------------------------------------------------------------------------- /cython/cython_compile.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | # 6 | # Description: compile .so from python code 7 | 8 | from __future__ import absolute_import, division, print_function, unicode_literals 9 | 10 | from setuptools import setup 11 | from Cython.Build import cythonize 12 | from distutils.extension import Extension 13 | 14 | ext_modules = [ 15 | Extension( 16 | "data_utils_cython", 17 | ["data_utils_cython.pyx"], 18 | extra_compile_args=['-O3'], 19 | extra_link_args=['-O3'], 20 | ) 21 | ] 22 | 23 | setup( 24 | name='data_utils_cython', 25 | ext_modules=cythonize(ext_modules) 26 | ) 27 | -------------------------------------------------------------------------------- /cython/cython_criteo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | # 6 | # Description: run dataset pre-processing in standalone mode 7 | # WARNING: These steps are required to work with Cython 8 | # 1. Instal Cython 9 | # > sudo yum install Cython 10 | # 2. Please copy data_utils.py into data_utils_cython.pyx 11 | # 3. Compile the data_utils_cython.pyx to generate .so 12 | # (it's important to keep extension .pyx rather than .py 13 | # to ensure the C/C++ .so no .py is loaded at import time) 14 | # > python cython_compile.py build_ext --inplace 15 | # This should create data_utils_cython.so, which can be loaded below with "import" 16 | # 4. Run standalone datatset preprocessing to generate .npz files 17 | # a. Kaggle 18 | # > python cython_criteo.py --data-set=kaggle --raw-data-file=./input/train.txt 19 | # --processed-data-file=./input/kaggleAdDisplayChallenge_processed.npz 20 | # b. 
Terabyte 21 | # > python cython_criteo.py --max-ind-range=10000000 [--memory-map] --data-set=terabyte 22 | # --raw-data-file=./input/day --processed-data-file=./input/terabyte_processed.npz 23 | 24 | from __future__ import absolute_import, division, print_function, unicode_literals 25 | 26 | import data_utils_cython as duc 27 | 28 | if __name__ == "__main__": 29 | ### import packages ### 30 | import argparse 31 | 32 | ### parse arguments ### 33 | parser = argparse.ArgumentParser( 34 | description="Preprocess Criteo dataset" 35 | ) 36 | # model related parameters 37 | parser.add_argument("--max-ind-range", type=int, default=-1) 38 | parser.add_argument("--data-sub-sample-rate", type=float, default=0.0) # in [0, 1] 39 | parser.add_argument("--data-randomize", type=str, default="total") # or day or none 40 | parser.add_argument("--memory-map", action="store_true", default=False) 41 | parser.add_argument("--data-set", type=str, default="kaggle") # or terabyte 42 | parser.add_argument("--raw-data-file", type=str, default="") 43 | parser.add_argument("--processed-data-file", type=str, default="") 44 | args = parser.parse_args() 45 | 46 | duc.loadDataset( 47 | args.data_set, 48 | args.max_ind_range, 49 | args.data_sub_sample_rate, 50 | args.data_randomize, 51 | "train", 52 | args.raw_data_file, 53 | args.processed_data_file, 54 | args.memory_map 55 | ) 56 | -------------------------------------------------------------------------------- /emb_storage/file_read.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import struct 4 | 5 | BINARY_DIR_NAME = "binary/" 6 | arr_files = [] 7 | TOTAL_BYTE_PER_ROW = -1 8 | EV_DIMENSION = 36 9 | 10 | # Load value as bytes!! 11 | def open_files_as_binary(ev_path_c1, bit_precision = 32): 12 | global arr_files, TOTAL_BYTE_PER_ROW 13 | BYTE_PRECISION = int(bit_precision/8) 14 | TOTAL_BYTE_PER_ROW = EV_DIMENSION * BYTE_PRECISION 15 | 16 | print("**************** Opening all Binary EV-files") 17 | print("**************** from = " + ev_path_c1) 18 | arr_files.append("ID Zero is not being used!") 19 | for ev_idx in range(0, 26): 20 | binFilename = "ev-table-" + str(ev_idx + 1) + ".bin" 21 | bin_ev_path = os.path.join(ev_path_c1, BINARY_DIR_NAME, binFilename) 22 | print("************* Opening Binnary EV = " + bin_ev_path) 23 | arr_files.append(open(bin_ev_path, 'rb')) 24 | print("**************** All Files are opened!") 25 | print("**************** TOTAL_BYTE_PER_ROW = " + str(TOTAL_BYTE_PER_ROW)) 26 | 27 | def get(tableId, rowId): 28 | # tableId started at id = 1 29 | file = arr_files[tableId] 30 | # print(TOTAL_BYTE_PER_ROW * rowId ) 31 | file.seek(TOTAL_BYTE_PER_ROW * rowId) 32 | blob = file.read(TOTAL_BYTE_PER_ROW) 33 | return struct.unpack('f'*36, blob) 34 | 35 | def close(): 36 | arr_files.pop(0) # this item0 is not really a file 37 | for file in arr_files: 38 | file.close() 39 | print("**************** All Files are closed!") 40 | -------------------------------------------------------------------------------- /emb_storage/mmap_file_read.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import struct 4 | import mmap 5 | 6 | BINARY_DIR_NAME = "binary/" 7 | arr_files = [] 8 | arr_mmap_files = [] 9 | TOTAL_BYTE_PER_ROW = -1 10 | EV_DIMENSION = 36 11 | 12 | # Load value as bytes!! 
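# Unlike file_read.py, this module wraps each binary EV file in an mmap, so the
# seek()/read() calls in get() are served from the memory-mapped pages rather than
# issuing a separate read() system call for every embedding lookup.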
13 | def open_files_as_binary(ev_path_c1, bit_precision = 32): 14 | global arr_files, TOTAL_BYTE_PER_ROW 15 | BYTE_PRECISION = int(bit_precision/8) 16 | TOTAL_BYTE_PER_ROW = EV_DIMENSION * BYTE_PRECISION 17 | 18 | print("**************** Opening all Binary EV-files") 19 | print("**************** from = " + ev_path_c1) 20 | arr_files.append("ID Zero is not being used!") 21 | arr_mmap_files.append("ID Zero is not being used!") 22 | for ev_idx in range(0, 26): 23 | binFilename = "ev-table-" + str(ev_idx + 1) + ".bin" 24 | bin_ev_path = os.path.join(ev_path_c1, BINARY_DIR_NAME, binFilename) 25 | print("************* Opening Binnary EV = " + bin_ev_path) 26 | file = open(bin_ev_path, 'rb') 27 | arr_files.append(file) 28 | arr_mmap_files.append(mmap.mmap(file.fileno(), 0, prot=mmap.PROT_READ)) 29 | print("**************** All Files are opened!") 30 | print("**************** TOTAL_BYTE_PER_ROW = " + str(TOTAL_BYTE_PER_ROW)) 31 | 32 | def get(tableId, rowId): 33 | # tableId started at id = 1 34 | # file = arr_files[tableId] 35 | file = arr_mmap_files[tableId] 36 | # print(TOTAL_BYTE_PER_ROW * rowId ) 37 | file.seek(TOTAL_BYTE_PER_ROW * rowId) 38 | blob = file.read(TOTAL_BYTE_PER_ROW) 39 | # return struct.unpack('f'*36, blob[0:TOTAL_BYTE_PER_ROW]) 40 | return struct.unpack('f'*36, blob) 41 | 42 | def close(): 43 | arr_files.pop(0) # this item0 is not really a file 44 | for file in arr_files: 45 | file.close() 46 | print("**************** All Files are closed!") 47 | -------------------------------------------------------------------------------- /emb_storage/multi_storage_dummy/socket-server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import socket 4 | import pandas as pd 5 | import os 6 | import torch 7 | import struct 8 | 9 | parser = argparse.ArgumentParser(description="EvLFU server") 10 | parser.add_argument("--port", type=int, default=8000) 11 | parser.add_argument("--ev-path", type=str, default="") 12 | args = parser.parse_args() 13 | 14 | # Call EvLFU service through socket 15 | HOST = '127.0.0.1' # Standard loopback interface address (localhost) 16 | PORT = args.port # 65432 # Port to listen on (non-privileged ports are > 1023) 17 | MAX_BUFFER = 1024 18 | BINARY_DIR_NAME = "binary/" 19 | TOTAL_EV_TABLE = 26 20 | EV_DIMENSION = 36 21 | 22 | # This is ROCKSDB client or dummyMemStor client 23 | 24 | EvTable_C1 = [] 25 | 26 | # Load value as bytes!! 27 | def load(ev_path_c1, bit_precision = 32): 28 | # We are still storing it as array of floats. TODO: Store it as binary! 
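    # (Note: unlike what the comment above suggests, this binary loader keeps each row
    # as its raw byte blob — 144 bytes at the default 32-bit precision — while the
    # alternative load_as_list() below keeps rows as Python float lists.)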
29 | print("**************** Loading EV Table to DummyMemoryStorage") 30 | print("**************** Load new set of EV Table from = " + ev_path_c1) 31 | global EvTable_C1 32 | EvTable_C1.append("Buffer: table0 is not used") 33 | BYTE_PRECISION = int(bit_precision/8) 34 | TOTAL_BYTE_PER_ROW = EV_DIMENSION * BYTE_PRECISION 35 | 36 | for ev_idx in range(0, 26): 37 | binFilename = "ev-table-" + str(ev_idx + 1) + ".bin" 38 | bin_ev_path = os.path.join(ev_path_c1, BINARY_DIR_NAME, binFilename) 39 | print("************* Loading Binnary EV = " + bin_ev_path) 40 | 41 | curr_table = [] 42 | # put 43 | with open(bin_ev_path, 'rb') as f: 44 | data = f.read() 45 | num_of_indexes = len(data) // TOTAL_BYTE_PER_ROW 46 | for i in range(0, num_of_indexes): 47 | # put 48 | byte_offset = BYTE_PRECISION * i * EV_DIMENSION # 36 -> dimension 49 | blob = data[ byte_offset : byte_offset + TOTAL_BYTE_PER_ROW] 50 | curr_table.append(blob) 51 | 52 | # Try reading the blob 53 | # print(struct.unpack('f'*36, blob[0:144])) 54 | f.close() 55 | EvTable_C1.append(curr_table) 56 | print("**************** All EvTable loaded in the Memory!") 57 | 58 | def load_as_list(ev_path_c1): 59 | # We are still storing it as array of floats. TODO: Store it as binary! 60 | print("**************** Loading EV Table to DummyMemoryStorage") 61 | print("**************** Load new set of EV Table from = " + ev_path_c1) 62 | global EvTable_C1 63 | EvTable_C1.append("Buffer: table0 is not used") 64 | for ev_idx in range(0, 26): 65 | # Read new EV Table from file 66 | ev_path = os.path.join(ev_path_c1, 67 | "ev-table-" + str(ev_idx + 1) + ".csv") 68 | print("********************* Loading EV = " + ev_path) 69 | new_ev_df = pd.read_csv(ev_path, dtype=float, delimiter=',') 70 | # Convert to numpy first before to tensor 71 | new_ev_arr = new_ev_df.to_numpy() 72 | # Convert to tensor 73 | # Option 1: Store it as numpy array (Slower for reading) 74 | # EvTable_C1[ev_idx + 1] = new_ev_arr 75 | # Option 2: Store it as pure python list 76 | EvTable_C1.append(new_ev_arr.tolist()) 77 | break 78 | print("**************** All EvTable loaded in the Memory!") 79 | 80 | def get(tableId, rowId): 81 | # tableId started at id = 1 82 | global EvTable_C1 83 | return EvTable_C1[tableId][rowId] 84 | 85 | def get_many(arrTableId, arrRowId): 86 | # tableId started at id = 1 87 | global EvTable_C1 88 | arrVal = [] 89 | for i in range(len(arrTableId)): 90 | arrVal.append(EvTable_C1[arrTableId[i]][arrRowId[i]]) 91 | # return array of values 92 | return arrVal 93 | 94 | def listen(): 95 | print("This server is ready to look up the ev-value based on the key!") 96 | print("Start listening at port: " + str(args.port)) 97 | with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: 98 | s.bind((HOST, PORT)) 99 | s.listen() 100 | conn, addr = s.accept() 101 | with conn: 102 | print('Connected to client at: ', addr) 103 | while True: 104 | buf = conn.recv(MAX_BUFFER) 105 | if buf: 106 | 107 | keys = str(buf, 'utf8').split('\n') 108 | # print("keys: " + str(keys)) 109 | for key in keys: 110 | tableId, rowId = key.split('-', 2) 111 | val = get(int(tableId), int(rowId)) 112 | conn.sendall(val) 113 | # print("Done sending the values of " + str(keys)) 114 | 115 | # tableId, rowId = str(buf, 'utf8').split('-', 2) 116 | # val = get(int(tableId), int(rowId)) 117 | # print(val) 118 | # print(struct.unpack('f'*36, val[0:144])) 119 | # conn.sendall(val) 120 | 121 | if __name__=="__main__": 122 | load(args.ev_path) 123 | listen() 124 | 
-------------------------------------------------------------------------------- /emb_storage/storage_dummy.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | import torch 4 | import struct 5 | import sys 6 | sys.path.append('../../') 7 | 8 | import evstore_utils 9 | import storage_manager 10 | 11 | EvTable_C1 = [] 12 | MAX_BUFFER = 256 13 | BINARY_DIR_NAME = "binary/" 14 | TOTAL_EV_TABLE = 26 15 | EV_DIMENSION = 36 16 | 17 | 18 | def load(ev_path_c1): 19 | # return load_as_binary(ev_path_c1) 20 | return load_as_list(ev_path_c1) 21 | 22 | def get(tableId, rowId): 23 | # tableId started at id = 1 24 | # return get_as_binary(tableId, rowId) 25 | return get_as_list(tableId, rowId) 26 | 27 | def get_nrows_pertable(file_path): 28 | _, _, _, ln_emb, _ = evstore_utils.read_training_config(file_path) 29 | return ln_emb 30 | 31 | # Load value as bytes!! 32 | def load_as_binary(ev_path_c1, bit_precision = 32): 33 | print("**************** Loading EV Table to DummyMemoryStorage") 34 | print("**************** Load new set of EV Table from = " + ev_path_c1) 35 | global EvTable_C1 36 | EvTable_C1.append("Buffer: table0 is not used") 37 | BYTE_PRECISION = int(bit_precision/8) 38 | TOTAL_BYTE_PER_ROW = EV_DIMENSION * BYTE_PRECISION 39 | ln_emb = get_nrows_pertable(storage_manager.training_config_path) 40 | 41 | for ev_idx in range(0, 26): 42 | binFilename = "ev-table-" + str(ev_idx + 1) + ".bin" 43 | bin_ev_path = os.path.join(ev_path_c1, BINARY_DIR_NAME, binFilename) 44 | print("************* Loading Binnary EV = " + bin_ev_path) 45 | 46 | curr_table = [] 47 | # put 48 | with open(bin_ev_path, 'rb') as f: 49 | data = f.read() 50 | num_of_indexes = len(data) // TOTAL_BYTE_PER_ROW 51 | assert(ln_emb[ev_idx] == num_of_indexes) 52 | for i in range(0, num_of_indexes): 53 | # put 54 | byte_offset = BYTE_PRECISION * i * EV_DIMENSION # 36 -> dimension 55 | blob = data[ byte_offset : byte_offset + TOTAL_BYTE_PER_ROW] 56 | curr_table.append(blob) 57 | 58 | # Try reading the blob 59 | # print(struct.unpack('f'*36, blob[0:144])) 60 | f.close() 61 | EvTable_C1.append(curr_table) 62 | print("**************** All EvTable loaded in the Memory!") 63 | 64 | def get_as_binary(tableId, rowId): 65 | # tableId started at id = 1 66 | global EvTable_C1 67 | blob = EvTable_C1[tableId][rowId] 68 | return struct.unpack('f'*36, blob) 69 | 70 | def load_as_list(ev_path_c1): 71 | print("**************** Loading EV Table to DummyMemoryStorage") 72 | print("**************** Load new set of EV Table from = " + ev_path_c1) 73 | global EvTable_C1 74 | EvTable_C1.append("Buffer: table0 is not used") 75 | for ev_idx in range(0, 26): 76 | # Read new EV Table from file 77 | ev_path = os.path.join(ev_path_c1, 78 | "ev-table-" + str(ev_idx + 1) + ".csv") 79 | print("********************* Loading EV = " + ev_path) 80 | new_ev_df = pd.read_csv(ev_path, dtype=float, delimiter=',') 81 | # Convert to numpy first before to tensor 82 | new_ev_arr = new_ev_df.to_numpy() 83 | # Convert to tensor 84 | # Option 1: Store it as numpy array (Slower for reading) 85 | # EvTable_C1[ev_idx + 1] = new_ev_arr 86 | # Option 2: Store it as pure python list 87 | EvTable_C1.append(new_ev_arr.tolist()) 88 | print("**************** All EvTable loaded in the Memory!") 89 | 90 | def get_as_list(tableId, rowId): 91 | # tableId started at id = 1 92 | global EvTable_C1 93 | # print(EvTable_C1[tableId][rowId]) 94 | # exit() 95 | return EvTable_C1[tableId][rowId] 96 | 
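A small usage sketch for the in-memory dummy storage above; the EV-table directory is an assumption, and the script is meant to be run from the repository root so that emb_storage/ and evstore_utils.py are importable.

```python
import sys
sys.path.append('emb_storage')  # lets storage_dummy resolve its sibling imports

import storage_dummy

# load() currently dispatches to load_as_list(), which expects
# ev-table-1.csv .. ev-table-26.csv under the given directory (path assumed here).
storage_dummy.load("/mnt/extra/ev-tables")

row = storage_dummy.get(1, 3)   # embedding for row 3 of table 1, as a Python list
print(len(row), row[:4])        # 36 values for the default EV dimension
```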
-------------------------------------------------------------------------------- /emb_storage/storage_rocksdb.py: -------------------------------------------------------------------------------- 1 | import pyrocksdb 2 | import time 3 | import os 4 | import pandas as pd 5 | import argparse 6 | import struct 7 | import numpy as np 8 | from array import * 9 | from tqdm import tqdm 10 | import torch 11 | import shutil 12 | from pathlib import Path 13 | import sys 14 | sys.path.append('../../') 15 | 16 | import evstore_utils 17 | import storage_manager 18 | 19 | ROCKSDB_DB_DIR = "/mnt/extra/db-ev-storage/rocksdb/" 20 | BINARY_DIR_NAME = "binary/" 21 | TOTAL_EV_TABLE = 26 22 | EV_DIMENSION = 36 23 | 24 | class RocksDBClient: 25 | 26 | # will read the BINARY values from the rocksdb 27 | def get(self, tableId, rowId): 28 | # TableId start from index 1 29 | # assert(tableId >= 1) 30 | # assert(tableId <= TOTAL_EV_TABLE) 31 | # tableId started at 1, but the db connection started at id 0 32 | blob = self.db_conn.get(self.read_opts, str(tableId) + "-" + str(rowId)) 33 | # convert to float list 34 | # return struct.unpack('f'*EV_DIMENSION, blob.data[0:144]) 35 | return struct.unpack('f'*EV_DIMENSION, blob.data) 36 | 37 | # will read the BINARY values from the rocksdb 38 | def getByKey(self, key): 39 | # TableId start from index 1 40 | # tableId started at 1, but the db connection started at id 0 41 | blob = self.db_conn.get(self.read_opts, key) 42 | # convert to float list 43 | print(blob) 44 | val = struct.unpack('f'*EV_DIMENSION, blob.data) 45 | print(val) 46 | exit() 47 | return val 48 | 49 | def open_db_conn(self): 50 | print("Will prepare db connection") 51 | opts = pyrocksdb.Options() 52 | # for multi-thread 53 | opts.IncreaseParallelism() 54 | opts.OptimizeLevelStyleCompaction() 55 | self.db_conn = pyrocksdb.DB() 56 | status = self.db_conn.open(opts, os.path.join(ROCKSDB_DB_DIR, "ev-table-all.db")) 57 | assert(status.ok()) 58 | print("All db connections are ready!") 59 | 60 | def close_db_conn(self): 61 | print("Closing rocksdb connections") 62 | self.db_conn.close() 63 | 64 | def get_nrows_pertable(self, file_path): 65 | _, _, _, ln_emb, _ = evstore_utils.read_training_config(file_path) 66 | return ln_emb 67 | 68 | def load(self, ev_dir, bit_precision = 32): 69 | # delete the db dir if exists 70 | if os.path.exists(ROCKSDB_DB_DIR) and os.path.isdir(ROCKSDB_DB_DIR): 71 | shutil.rmtree(ROCKSDB_DB_DIR) 72 | # recreate the dir to hold new rocksdb data 73 | Path(os.path.join(ROCKSDB_DB_DIR)).mkdir(parents=True, exist_ok=True) 74 | 75 | print("**************** Loading EV Table to ROCKSDB") 76 | print("**************** Load new set of EV Table from = " + ev_dir) 77 | 78 | assert(bit_precision%4 == 0) 79 | ln_emb = self.get_nrows_pertable(storage_manager.training_config_path) 80 | 81 | BYTE_PRECISION = int(bit_precision/8) 82 | TOTAL_BYTE_PER_ROW = EV_DIMENSION * BYTE_PRECISION 83 | 84 | db = pyrocksdb.DB() 85 | opts = pyrocksdb.Options() 86 | # for multi-thread 87 | opts.IncreaseParallelism() 88 | opts.OptimizeLevelStyleCompaction() 89 | opts.create_if_missing = True 90 | db_filename = "ev-table-all.db" 91 | db_filename = os.path.join(ROCKSDB_DB_DIR, db_filename) 92 | #print(db_filename) 93 | s = db.open(opts, db_filename) 94 | assert(s.ok()) 95 | 96 | # Storing binary ev-tables to rocksDB 97 | for ev_idx in range(0, TOTAL_EV_TABLE): 98 | bin_filename = "ev-table-" + str(ev_idx + 1) + ".bin" 99 | 100 | # RocksDB loads the BINARY EV-Tables! 
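            # Keys take the form "<tableId>-<rowId>", so all 26 EV tables share this
            # single ev-table-all.db instance; storage_rocksdb_26_tabs.py instead opens
            # one RocksDB per table and keys each row by its rowId alone.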
101 | bin_ev_path = os.path.join(ev_dir, BINARY_DIR_NAME, bin_filename) 102 | print("************* Loading EV = " + bin_ev_path) 103 | 104 | # put 105 | with open(bin_ev_path, 'rb') as f: 106 | data = f.read() 107 | num_of_indexes = len(data) // TOTAL_BYTE_PER_ROW 108 | 109 | # Verify that the number of unique values per table is the same as what the DLRM model expect 110 | assert(ln_emb[ev_idx] == num_of_indexes) 111 | 112 | opts = pyrocksdb.WriteOptions() 113 | #for nrow in tqdm(range(0, num_of_indexes)): 114 | for i in range(0, num_of_indexes): 115 | # put 116 | byte_offset = BYTE_PRECISION * i * EV_DIMENSION # 36 -> dimension 117 | v = data[ byte_offset : byte_offset + TOTAL_BYTE_PER_ROW] 118 | k = str(ev_idx+1) + "-" + str(i) 119 | db.put(opts, k, v) 120 | print(" === db-path: " + db_filename) 121 | f.close() 122 | print("**************** All EvTable loaded in the RocksDB!") 123 | db.close() 124 | 125 | def __init__(self): 126 | self.db_conn = None 127 | self.read_opts = pyrocksdb.ReadOptions() 128 | 129 | -------------------------------------------------------------------------------- /emb_storage/storage_rocksdb_26_tabs.py: -------------------------------------------------------------------------------- 1 | import pyrocksdb 2 | import time 3 | import os 4 | import pandas as pd 5 | import argparse 6 | import struct 7 | import numpy as np 8 | from array import * 9 | from tqdm import tqdm 10 | import struct 11 | import torch 12 | import shutil 13 | from pathlib import Path 14 | import sys 15 | sys.path.append('../../') 16 | 17 | import evstore_utils 18 | import storage_manager 19 | 20 | ROCKSDB_DB_PATH = "/mnt/extra/db-ev-storage/rocksdb/" 21 | BINARY_DIR_NAME = "binary/" 22 | TOTAL_EV_TABLE = 26 23 | EV_DIMENSION = 36 24 | 25 | class RocksDBClient: 26 | 27 | # will read the BINARY values from the rocksdb 28 | def get(self, tableId, rowId): 29 | opts = pyrocksdb.ReadOptions() 30 | # assert(tableId >= 1) 31 | # assert(tableId <= TOTAL_EV_TABLE) 32 | # tableId started at 1, but the db connection started at id 0 33 | blob = self.arr_db_conn[tableId - 1].get(self.read_opts, str(rowId)) 34 | # convert to float list 35 | return struct.unpack('f'*36, blob.data[0:144]) 36 | 37 | def open_db_conn(self): 38 | print("Will prepare db connection") 39 | opts = pyrocksdb.Options() 40 | # for multi-thread 41 | opts.IncreaseParallelism() 42 | opts.OptimizeLevelStyleCompaction() 43 | for i in range(TOTAL_EV_TABLE): 44 | db_conn = pyrocksdb.DB() 45 | status = db_conn.open(opts, os.path.join(ROCKSDB_DB_PATH, "ev-table-" + str(i+1) + ".db")) 46 | assert(status.ok()) 47 | self.arr_db_conn.append(db_conn) 48 | print("All db connections are ready!") 49 | 50 | def close_db_conn(self): 51 | print("Closing rocksdb connections") 52 | for db_conn in self.arr_db_conn: 53 | db_conn.close() 54 | 55 | def get_nrows_pertable(self, file_path): 56 | _, _, _, ln_emb, _ = evstore_utils.read_training_config(file_path) 57 | return ln_emb 58 | 59 | def load(self, ev_dir, bit_precision = 32): 60 | # delete the db dir if exists 61 | if os.path.exists(ROCKSDB_DB_PATH) and os.path.isdir(ROCKSDB_DB_PATH): 62 | shutil.rmtree(ROCKSDB_DB_PATH) 63 | # recreate the dir to hold new rocksdb data 64 | Path(os.path.join(ROCKSDB_DB_PATH)).mkdir(parents=True, exist_ok=True) 65 | 66 | print("**************** Loading EV Table to ROCKSDB") 67 | print("**************** Load new set of EV Table from = " + ev_dir) 68 | 69 | assert(bit_precision%4 == 0) 70 | ln_emb = self.get_nrows_pertable(storage_manager.training_config_path) 71 | 72 | 
BYTE_PRECISION = int(bit_precision/8) 73 | TOTAL_BYTE_PER_ROW = EV_DIMENSION * BYTE_PRECISION 74 | # Storing binary ev-tables to rocksDB 75 | for ev_idx in range(0, TOTAL_EV_TABLE): 76 | binFilename = "ev-table-" + str(ev_idx + 1) + ".bin" 77 | 78 | # RocksDB loads the BINARY EV-Tables! 79 | bin_ev_path = os.path.join(ev_dir, BINARY_DIR_NAME, binFilename) 80 | print("************* Loading EV = " + bin_ev_path) 81 | 82 | db = pyrocksdb.DB() 83 | opts = pyrocksdb.Options() 84 | # for multi-thread 85 | opts.IncreaseParallelism() 86 | opts.OptimizeLevelStyleCompaction() 87 | opts.create_if_missing = True 88 | dbFilename = "ev-table-" + str(ev_idx + 1) + ".db" 89 | dbFilename = os.path.join(ROCKSDB_DB_PATH, dbFilename) 90 | #print(dbFilename) 91 | s = db.open(opts, dbFilename) 92 | assert(s.ok()) 93 | # put 94 | with open(bin_ev_path, 'rb') as f: 95 | data = f.read() 96 | num_of_indexes = len(data) // TOTAL_BYTE_PER_ROW 97 | 98 | # Verify that the number of unique values per table is the same as what the DLRM model expect 99 | assert(ln_emb[ev_idx] == num_of_indexes) 100 | 101 | opts = pyrocksdb.WriteOptions() 102 | #for nrow in tqdm(range(0, num_of_indexes)): 103 | for i in range(0, num_of_indexes): 104 | # put 105 | byte_offset = BYTE_PRECISION * i * EV_DIMENSION # 36 -> dimension 106 | v = data[ byte_offset : byte_offset + TOTAL_BYTE_PER_ROW] 107 | k = str(i) 108 | db.put(opts, k, v) 109 | db.close() 110 | print(" === db-path: " + dbFilename) 111 | f.close() 112 | print("**************** All EvTable loaded in the RocksDB!") 113 | 114 | def __init__(self): 115 | self.arr_db_conn = [] 116 | self.read_opts = pyrocksdb.ReadOptions() 117 | 118 | -------------------------------------------------------------------------------- /emb_storage/storage_sqlite.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import os 4 | import pandas as pd 5 | import argparse 6 | import struct 7 | import numpy as np 8 | from array import * 9 | from tqdm import tqdm 10 | import torch 11 | import shutil 12 | from pathlib import Path 13 | import sys 14 | sys.path.append('../../') 15 | 16 | import evstore_utils 17 | import storage_manager 18 | 19 | SQLITE_DB_DIR = "/mnt/extra/db-ev-storage/sqlite/" 20 | BINARY_DIR_NAME = "binary/" 21 | TOTAL_EV_TABLE = 26 22 | EV_DIMENSION = 36 23 | DB_NAME = "ev-table-all.db" 24 | 25 | class SQLiteClient: 26 | 27 | # will read the BINARY values from the SQLiteDB 28 | def get(self, tableId, rowId): 29 | # TableId start from index 1 30 | # assert(tableId >= 1) 31 | # assert(tableId <= TOTAL_EV_TABLE) 32 | # The row at SQLite is started from 1 instead of 0 33 | realRowId = rowId + 1 + self.db_add_up_tables[tableId-1] 34 | blob = self.db_cursor.execute("SELECT * FROM tab1 where rowid={};".format(realRowId)).fetchone() 35 | # print(tableId) 36 | # print(rowId) 37 | # print(blob) 38 | # assert(blob != None) 39 | return struct.unpack('f'*EV_DIMENSION, blob[0]) 40 | 41 | def get_nrows_pertable(self, file_path): 42 | _, _, _, ln_emb, _ = evstore_utils.read_training_config(file_path) 43 | return ln_emb 44 | 45 | def load(self, ev_dir, bit_precision = 32): 46 | # delete the db dir if exists 47 | if os.path.exists(SQLITE_DB_DIR) and os.path.isdir(SQLITE_DB_DIR): 48 | shutil.rmtree(SQLITE_DB_DIR) 49 | # recreate the dir to hold new sqlite data 50 | Path(os.path.join(SQLITE_DB_DIR)).mkdir(parents=True, exist_ok=True) 51 | db = sqlite3.connect(self.db_file_path) 52 | db_cursor = db.cursor() 53 | 54 | print("**************** 
Loading EV Table to SQLite") 55 | print("**************** Load new set of EV Table from = " + ev_dir) 56 | 57 | assert(bit_precision%4 == 0) 58 | ln_emb = self.get_nrows_pertable(storage_manager.training_config_path) 59 | 60 | BYTE_PRECISION = int(bit_precision/8) 61 | TOTAL_BYTE_PER_ROW = EV_DIMENSION * BYTE_PRECISION 62 | table_name = "tab1" 63 | # Storing binary ev-tables to SQLite 64 | for ev_idx in range(0, TOTAL_EV_TABLE): 65 | bin_filename = "ev-table-" + str(ev_idx + 1) + ".bin" 66 | # table_name = "ev_table_" + str(ev_idx + 1) 67 | 68 | db_cursor.execute("CREATE TABLE if not exists " + table_name + " (b BLOB);") 69 | 70 | # SQLite loads the BINARY EV-Tables! 71 | bin_ev_path = os.path.join(ev_dir, BINARY_DIR_NAME, bin_filename) 72 | print("************* Loading EV = " + bin_ev_path) 73 | # put 74 | with open(bin_ev_path, 'rb') as f: 75 | data = f.read() 76 | num_of_indexes = len(data) // TOTAL_BYTE_PER_ROW 77 | 78 | # Verify that the number of unique values per table is the same as what the DLRM model expect 79 | assert(ln_emb[ev_idx] == num_of_indexes) 80 | 81 | bin_ev_path = "/home/cc/ev-tables-sqlite/bin_workload" 82 | 83 | #for nrow in tqdm(range(0, num_of_indexes)): 84 | for i in range(0, num_of_indexes): 85 | # put 86 | byte_offset = BYTE_PRECISION * i * EV_DIMENSION # 36 -> dimension 87 | v = data[ byte_offset : byte_offset + TOTAL_BYTE_PER_ROW] 88 | k = str(ev_idx+1) + "-" + str(i) 89 | db_cursor.execute("insert into " + table_name + " values(?)", (v, )) 90 | print(" === db-path: " + table_name) 91 | f.close() 92 | print("**************** All EvTable loaded in the SQLite!") 93 | db.commit() 94 | db.close() 95 | 96 | def open_db_conn(self): 97 | print("Will prepare db connection") 98 | self.db_conn = sqlite3.connect(self.db_file_path) 99 | self.db_cursor = self.db_conn.cursor() 100 | print("All db connections are ready!") 101 | 102 | def close_db_conn(self): 103 | print("Closing sqlite connections") 104 | self.db_conn.close() 105 | 106 | def __init__(self): 107 | self.db_conn = None 108 | self.db_cursor = None 109 | self.db_file_path = os.path.join(SQLITE_DB_DIR, DB_NAME) 110 | self.db_ln_tables = self.get_nrows_pertable(storage_manager.training_config_path) 111 | self.db_add_up_tables = [0 for _ in range(len(self.db_ln_tables))] 112 | for i in range(len(self.db_ln_tables)-1): 113 | self.db_add_up_tables[i+1] = self.db_add_up_tables[i] + self.db_ln_tables[i] 114 | -------------------------------------------------------------------------------- /emb_storage/storage_sqlite_26_tabs.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import os 4 | import pandas as pd 5 | import argparse 6 | import struct 7 | import numpy as np 8 | from array import * 9 | from tqdm import tqdm 10 | import torch 11 | import shutil 12 | from pathlib import Path 13 | import sys 14 | sys.path.append('../../') 15 | 16 | import evstore_utils 17 | import storage_manager 18 | 19 | SQLITE_DB_DIR = "/mnt/extra/db-ev-storage/sqlite/" 20 | BINARY_DIR_NAME = "binary/" 21 | TOTAL_EV_TABLE = 26 22 | EV_DIMENSION = 36 23 | DB_NAME = "ev-table-all.db" 24 | 25 | class SQLiteClient: 26 | 27 | # will read the BINARY values from the sqlite 28 | def get(self, tableId, rowId): 29 | # TableId start from index 1 30 | # assert(tableId >= 1) 31 | # assert(tableId <= TOTAL_EV_TABLE) 32 | # The row at SQLite is started from 1 instead of 0 33 | blob = self.db_cursor.execute("SELECT * FROM ev_table_{} where rowid={};".format(tableId, rowId + 
1)).fetchone() 34 | # print(tableId) 35 | # print(rowId) 36 | # print(blob) 37 | # assert(blob != None) 38 | return struct.unpack('f'*EV_DIMENSION, blob[0]) 39 | 40 | def get_nrows_pertable(self, file_path): 41 | _, _, _, ln_emb, _ = evstore_utils.read_training_config(file_path) 42 | return ln_emb 43 | 44 | def load(self, ev_dir, bit_precision = 32): 45 | # delete the db dir if exists 46 | if os.path.exists(SQLITE_DB_DIR) and os.path.isdir(SQLITE_DB_DIR): 47 | shutil.rmtree(SQLITE_DB_DIR) 48 | # recreate the dir to hold new sqlite data 49 | Path(os.path.join(SQLITE_DB_DIR)).mkdir(parents=True, exist_ok=True) 50 | db = sqlite3.connect(self.db_file_path) 51 | db_cursor = db.cursor() 52 | 53 | print("**************** Loading EV Table to SQLite") 54 | print("**************** Load new set of EV Table from = " + ev_dir) 55 | 56 | assert(bit_precision%4 == 0) 57 | ln_emb = self.get_nrows_pertable(storage_manager.training_config_path) 58 | 59 | BYTE_PRECISION = int(bit_precision/8) 60 | TOTAL_BYTE_PER_ROW = EV_DIMENSION * BYTE_PRECISION 61 | 62 | # Storing binary ev-tables to SQLite 63 | for ev_idx in range(0, TOTAL_EV_TABLE): 64 | bin_filename = "ev-table-" + str(ev_idx + 1) + ".bin" 65 | table_name = "ev_table_" + str(ev_idx + 1) 66 | 67 | db_cursor.execute("CREATE TABLE if not exists " + table_name + " (b BLOB);") 68 | 69 | # SQLite loads the BINARY EV-Tables! 70 | bin_ev_path = os.path.join(ev_dir, BINARY_DIR_NAME, bin_filename) 71 | print("************* Loading EV = " + bin_ev_path) 72 | # put 73 | with open(bin_ev_path, 'rb') as f: 74 | data = f.read() 75 | num_of_indexes = len(data) // TOTAL_BYTE_PER_ROW 76 | 77 | # Verify that the number of unique values per table is the same as what the DLRM model expect 78 | assert(ln_emb[ev_idx] == num_of_indexes) 79 | 80 | bin_ev_path = "/home/cc/ev-tables-sqlite/bin_workload" 81 | 82 | #for nrow in tqdm(range(0, num_of_indexes)): 83 | for i in range(0, num_of_indexes): 84 | # put 85 | byte_offset = BYTE_PRECISION * i * EV_DIMENSION # 36 -> dimension 86 | v = data[ byte_offset : byte_offset + TOTAL_BYTE_PER_ROW] 87 | k = str(ev_idx+1) + "-" + str(i) 88 | db_cursor.execute("insert into " + table_name + " values(?)", (v, )) 89 | print(" === db-path: " + table_name) 90 | f.close() 91 | print("**************** All EvTable loaded in the SQLite!") 92 | db.commit() 93 | db.close() 94 | 95 | def open_db_conn(self): 96 | print("Will prepare db connection") 97 | self.db_conn = sqlite3.connect(self.db_file_path) 98 | self.db_cursor = self.db_conn.cursor() 99 | print("All db connections are ready!") 100 | 101 | def close_db_conn(self): 102 | print("Closing sqlite connections") 103 | self.db_conn.close() 104 | 105 | def __init__(self): 106 | self.db_conn = None 107 | self.db_cursor = None 108 | self.db_file_path = os.path.join(SQLITE_DB_DIR, DB_NAME) 109 | -------------------------------------------------------------------------------- /evstore_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | # import EvLFU 4 | import struct 5 | from pathlib import Path 6 | import ast 7 | import numpy as np 8 | import torch 9 | 10 | TRAINING_CONFIG_FILE = "training_config.txt" 11 | 12 | # Replacing the current embedding layer at the current model 13 | def load_new_ev_table (ld_model, ev_path): 14 | print("Load new set of EV Table from = " + ev_path) 15 | for ev_idx in range(0, 26): 16 | new_ev_path = os.path.join(ev_path, "ev-table-"+ str(ev_idx + 1) + ".csv") 17 | new_ev_df = 
pd.read_csv(new_ev_path, dtype=float, delimiter=',') 18 | # # Convert to numpy first before to tensor 19 | new_ev_arr = new_ev_df.to_numpy() 20 | # # Convert to tensor 21 | new_ev_tensor = torch.FloatTensor(new_ev_arr) 22 | 23 | print("Loading NEW EV per embedding layer = " + new_ev_path) 24 | 25 | # Create key since the entire model will be accessed 26 | key = str("emb_l."+str(ev_idx)+".weight") 27 | # Replace the current embedding tensor with the new one based on the key 28 | ld_model["state_dict"][key] = new_ev_tensor 29 | print("Done loading all EV-Table from " + ev_path) 30 | 31 | def store_training_config(file_path, table_feature_map, nbatches, nbatches_test, ln_emb, m_den): 32 | # store the config to a file to avoid redoing the computation during inference-only 33 | with open(file_path, 'w') as f: 34 | f.write('The order of the arguments: table_feature_map, nbatches, nbatches_test, ln_emb, m_den\n') 35 | f.write(str(table_feature_map)+"\n") 36 | f.write(str(nbatches)+"\n") 37 | f.write(str(nbatches_test)+"\n") 38 | f.write(str(ln_emb.tolist())+"\n") 39 | f.write(str(m_den)+"\n") 40 | print("Done writing training config to : " + file_path + "\n") 41 | 42 | def read_training_config(file_path): 43 | print("Read training config from : " + file_path) 44 | with open(file_path) as f: 45 | lines = [line.rstrip() for line in f] 46 | 47 | table_feature_map = ast.literal_eval(lines[1]) 48 | nbatches = int(lines[2]) 49 | nbatches_test = int(lines[3]) 50 | ln_emb = np.array(ast.literal_eval(lines[4])) 51 | m_den = int(lines[5]) 52 | return table_feature_map, nbatches, nbatches_test, ln_emb, m_den 53 | 54 | def prepare_inference_trace_folder(input_data_name, percent_data_for_inference): 55 | print("Create folder to store the model and ev-tables") 56 | outdir = os.path.join("logs", "inf-workload-traces", input_data_name, "inference=" + str(percent_data_for_inference)) 57 | Path(outdir).mkdir(parents=True, exist_ok=True) 58 | return outdir 59 | 60 | def write_inf_workload_to_file(workload_traces_outdir, arr_inference_workload): 61 | # Create + open 26 different files 62 | print("Total inference = " + str(len(arr_inference_workload))) 63 | arrfile = [] 64 | 65 | for idx in range(0,26): 66 | arrfile.append(open(workload_traces_outdir + "/workload-group-" + str(idx + 1) + ".csv",'w')) 67 | arrfile[idx].write("G" + str(idx + 1) + "_key\n") 68 | 69 | for grouped_keys in arr_inference_workload: 70 | id = 0 71 | for key in grouped_keys: 72 | arrfile[id].write(key + "\n") 73 | id += 1 -------------------------------------------------------------------------------- /input/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | #!compressed4git* 3 | !.gitignore 4 | !readme.txt 5 | #!*tar.gz.part* 6 | -------------------------------------------------------------------------------- /input/readme.txt: -------------------------------------------------------------------------------- 1 | ------ Display Advertising Challenge ------ 2 | 3 | Dataset: dac-v1 4 | 5 | This dataset contains feature values and click feedback for millions of display 6 | ads. Its purpose is to benchmark algorithms for clickthrough rate (CTR) prediction. 
7 | It has been used for the Display Advertising Challenge hosted by Kaggle: 8 | https://www.kaggle.com/c/criteo-display-ad-challenge/ 9 | 10 | =================================================== 11 | 12 | Full description: 13 | 14 | This dataset contains 2 files: 15 | train.txt 16 | test.txt 17 | corresponding to the training and test parts of the data. 18 | 19 | ==================================================== 20 | 21 | Dataset construction: 22 | 23 | The training dataset consists of a portion of Criteo's traffic over a period 24 | of 7 days. Each row corresponds to a display ad served by Criteo and the first 25 | column indicates whether this ad has been clicked or not. 26 | The positive (clicked) and negative (non-clicked) examples have both been 27 | subsampled (but at different rates) in order to reduce the dataset size. 28 | 29 | There are 13 features taking integer values (mostly count features) and 26 30 | categorical features. The values of the categorical features have been hashed 31 | onto 32 bits for anonymization purposes. 32 | The semantics of these features are undisclosed. Some features may have missing values. 33 | 34 | The rows are chronologically ordered. 35 | 36 | The test set is computed in the same way as the training set but it 37 | corresponds to events on the day following the training period. 38 | The first column (label) has been removed. 39 | 40 | ==================================================== 41 | 42 | Format: 43 | 44 | The columns are tab-separated with the following schema: 45 |
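As a rough illustration of the layout described above — one click label, 13 integer features, and 26 hashed categorical features per tab-separated row — the sketch below peeks at train.txt with pandas; the column names are placeholders, not the official schema.

```python
import pandas as pd

# Placeholder names: the readme specifies only the column layout, not names.
cols = ["label"] + [f"int_{i}" for i in range(1, 14)] + [f"cat_{i}" for i in range(1, 27)]

# Sample a few rows; missing values show up as NaN for both feature types.
df = pd.read_csv("input/train.txt", sep="\t", names=cols, nrows=1000)
print(df["label"].mean())        # observed click rate in the sample
print(df.isna().mean().head())   # fraction of missing values per column
```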