├── .gitignore ├── LICENSE ├── Note ├── DL │ ├── dl │ │ ├── segment_data.py │ │ └── test.py │ ├── kernel.py │ └── parallel │ │ ├── kernel.py │ │ └── kernel_pytorch.py ├── RL │ ├── kernel.py │ ├── parallel │ │ ├── kernel.py │ │ └── kernel_pytorch.py │ └── rl │ │ ├── __init__.py │ │ ├── animate_agent.py │ │ ├── noise.py │ │ ├── policy.py │ │ └── prioritized_replay.py ├── models │ ├── docs_example │ │ ├── DL │ │ │ ├── model1.py │ │ │ ├── model2.py │ │ │ ├── model3.py │ │ │ ├── model4.py │ │ │ └── model5.py │ │ └── RL │ │ │ ├── keras │ │ │ ├── DDPG.py │ │ │ ├── DDPG_HER.py │ │ │ ├── DQN.py │ │ │ ├── DQN_PR.py │ │ │ ├── MADDPG.py │ │ │ ├── PPO.py │ │ │ └── pool_network │ │ │ │ ├── DQN.py │ │ │ │ └── DQN_PR.py │ │ │ ├── note │ │ │ ├── DDPG.py │ │ │ ├── DDPG_HER.py │ │ │ ├── DQN.py │ │ │ ├── DQN_IRL.py │ │ │ ├── DQN_PR.py │ │ │ ├── MADDPG.py │ │ │ ├── PPO.py │ │ │ ├── Rainbow.py │ │ │ └── pool_network │ │ │ │ ├── DDPG_HER.py │ │ │ │ ├── DQN.py │ │ │ │ ├── DQN_PR.py │ │ │ │ ├── MADDPG.py │ │ │ │ └── Rainbow.py │ │ │ └── pytorch │ │ │ ├── DDPG.py │ │ │ ├── DDPG_HER.py │ │ │ ├── DQN.py │ │ │ ├── DoubleDQN.py │ │ │ ├── DuelingDQN.py │ │ │ ├── MADDPG.py │ │ │ └── pool_network │ │ │ ├── DDPG.py │ │ │ ├── DDPG_HER.py │ │ │ ├── DQN.py │ │ │ ├── DoubleDQN.py │ │ │ ├── DuelingDQN.py │ │ │ └── MADDPG.py │ ├── note │ │ ├── BertModel.py │ │ ├── BiT.py │ │ ├── CLIP.py │ │ ├── ConvNeXt.py │ │ ├── ConvNeXtV2.py │ │ ├── DenseNet │ │ │ ├── DenseNet121.py │ │ │ ├── DenseNet169.py │ │ │ └── DenseNet201.py │ │ ├── EfficientNet.py │ │ ├── EfficientNetV2.py │ │ ├── Gemma.py │ │ ├── Inception │ │ │ ├── InceptionResNetV2.py │ │ │ └── InceptionV3.py │ │ ├── Llama2.py │ │ ├── MobileNet.py │ │ ├── MobileNetV2.py │ │ ├── MobileNetV3.py │ │ ├── RegNet.py │ │ ├── ResNet │ │ │ ├── ResNet101.py │ │ │ ├── ResNet152.py │ │ │ └── ResNet50.py │ │ ├── ResNetRS.py │ │ ├── Segformer.py │ │ ├── SwiftFormer.py │ │ ├── VGG16.py │ │ ├── VGG19.py │ │ ├── ViT.py │ │ ├── Whisper.py │ │ └── non_parallel │ │ │ ├── 
BertModel.py │ │ │ ├── BiT.py │ │ │ ├── CLIP.py │ │ │ ├── ConvNeXt.py │ │ │ ├── ConvNeXtV2.py │ │ │ ├── DenseNet │ │ │ ├── DenseNet121.py │ │ │ ├── DenseNet169.py │ │ │ └── DenseNet201.py │ │ │ ├── EfficientNet.py │ │ │ ├── EfficientNetV2.py │ │ │ ├── GPT2.py │ │ │ ├── Gemma.py │ │ │ ├── Llama2.py │ │ │ ├── MobileNet.py │ │ │ ├── MobileNetV2.py │ │ │ ├── MobileNetV3.py │ │ │ ├── RegNet.py │ │ │ ├── ResNet │ │ │ ├── ResNet101.py │ │ │ ├── ResNet152.py │ │ │ └── ResNet50.py │ │ │ ├── ResNetRS.py │ │ │ ├── Segformer.py │ │ │ ├── SwiftFormer.py │ │ │ ├── VGG19.py │ │ │ ├── ViT.py │ │ │ └── Whisper.py │ └── tf │ │ ├── BEiT.py │ │ ├── BEiT2.py │ │ ├── BertModel.py │ │ ├── BiT.py │ │ ├── CCT.py │ │ ├── CLIP.py │ │ ├── CaiT.py │ │ ├── ConViT.py │ │ ├── ConvNeXt.py │ │ ├── ConvNeXtV2.py │ │ ├── CrossViT.py │ │ ├── CvT.py │ │ ├── DaViT.py │ │ ├── DeepViT.py │ │ ├── DeiT.py │ │ ├── DenseNet │ │ ├── DenseNet121.py │ │ ├── DenseNet169.py │ │ └── DenseNet201.py │ │ ├── DiT.py │ │ ├── EfficientNet.py │ │ ├── EfficientNetV2.py │ │ ├── EfficientVit_mit.py │ │ ├── EfficientVit_msra.py │ │ ├── GCViT.py │ │ ├── GCViT_detection.py │ │ ├── Gemma.py │ │ ├── Llama.py │ │ ├── Llama2.py │ │ ├── Llama3.py │ │ ├── MiT.py │ │ ├── Mixtral.py │ │ ├── MobileNet.py │ │ ├── MobileNetV2.py │ │ ├── MobileNetV3.py │ │ ├── PVT.py │ │ ├── Phi2.py │ │ ├── Phi3.py │ │ ├── PiT.py │ │ ├── RDNet.py │ │ ├── RegNet.py │ │ ├── ResNet │ │ ├── ResNet101.py │ │ ├── ResNet152.py │ │ └── ResNet50.py │ │ ├── ResNetRS.py │ │ ├── Segformer.py │ │ ├── SwiftFormer.py │ │ ├── SwinMLP.py │ │ ├── SwinTransformerV2.py │ │ ├── VGG19.py │ │ ├── ViT.py │ │ ├── ViViT.py │ │ ├── Whisper.py │ │ └── XCiT.py ├── nn │ ├── Model.py │ ├── RL.py │ ├── RL_pytorch.py │ ├── Sequential.py │ ├── __init__.py │ ├── accuracy.py │ ├── activation.py │ ├── assign_param.py │ ├── coalesce_sparse.py │ ├── conv2d_func.py │ ├── cosine_similarity.py │ ├── create_additive_causal_mask.py │ ├── gather_mm.py │ ├── helpers.py │ ├── init.py │ ├── 
initializer.py │ ├── interpolate.py │ ├── lambda_callback.py │ ├── layer │ │ ├── BiRNN.py │ │ ├── BigBird_attention.py │ │ ├── BigBird_masks.py │ │ ├── ConvRNN.py │ │ ├── FAVOR_attention.py │ │ ├── GCN.py │ │ ├── GRU.py │ │ ├── GRUCell.py │ │ ├── LSTM.py │ │ ├── LSTMCell.py │ │ ├── Linformer_self_attention.py │ │ ├── LoRALinear.py │ │ ├── MoE_layer.py │ │ ├── PReLU.py │ │ ├── RMSNorm.py │ │ ├── RNN.py │ │ ├── RNNCell.py │ │ ├── RoPE.py │ │ ├── SwitchGLU.py │ │ ├── TLU.py │ │ ├── Transformer.py │ │ ├── TransformerDecoder.py │ │ ├── TransformerDecoderLayer.py │ │ ├── TransformerEncoder.py │ │ ├── TransformerEncoderLayer.py │ │ ├── adaptive_avg_pooling1d.py │ │ ├── adaptive_avg_pooling2d.py │ │ ├── adaptive_avg_pooling3d.py │ │ ├── adaptive_avgmax_pool.py │ │ ├── adaptive_max_pooling1d.py │ │ ├── adaptive_max_pooling2d.py │ │ ├── adaptive_max_pooling3d.py │ │ ├── add.py │ │ ├── additive_attention.py │ │ ├── alpha_dropout.py │ │ ├── attention.py │ │ ├── attention2d.py │ │ ├── attention_pool.py │ │ ├── attention_pool2d.py │ │ ├── average.py │ │ ├── avg_pool1d.py │ │ ├── avg_pool2d.py │ │ ├── avg_pool3d.py │ │ ├── axial_positional_encoding.py │ │ ├── batch_norm.py │ │ ├── bilinear.py │ │ ├── blur_pool.py │ │ ├── bottleneck_attn.py │ │ ├── cached_attention.py │ │ ├── capsule.py │ │ ├── cbam.py │ │ ├── classifier.py │ │ ├── concat.py │ │ ├── conv1d.py │ │ ├── conv1d_transpose.py │ │ ├── conv2d.py │ │ ├── conv2d_transpose.py │ │ ├── conv3d.py │ │ ├── conv3d_transpose.py │ │ ├── conv_bn_act.py │ │ ├── cropping1d.py │ │ ├── cropping2d.py │ │ ├── cropping3d.py │ │ ├── dense.py │ │ ├── depthwise_conv1d.py │ │ ├── depthwise_conv2d.py │ │ ├── dropout.py │ │ ├── dynamic_tanh.py │ │ ├── eca.py │ │ ├── einsum_dense.py │ │ ├── embedding.py │ │ ├── feed_forward_experts.py │ │ ├── filter_response_norm.py │ │ ├── flatten.py │ │ ├── format.py │ │ ├── gather_excite.py │ │ ├── gaussian_dropout.py │ │ ├── gaussian_noise.py │ │ ├── global_avg_pool1d.py │ │ ├── global_avg_pool2d.py │ │ ├── 
global_avg_pool3d.py │ │ ├── global_context.py │ │ ├── global_max_pool1d.py │ │ ├── global_max_pool2d.py │ │ ├── global_max_pool3d.py │ │ ├── grn.py │ │ ├── group_norm.py │ │ ├── grouped_query_attention.py │ │ ├── halo_attn.py │ │ ├── identity.py │ │ ├── image_preprocessing │ │ │ ├── center_crop.py │ │ │ ├── random_brightness.py │ │ │ ├── random_crop.py │ │ │ ├── random_height.py │ │ │ ├── random_rotation.py │ │ │ ├── random_translation.py │ │ │ ├── random_width.py │ │ │ ├── random_zoom.py │ │ │ ├── rescaling.py │ │ │ ├── resizing.py │ │ │ └── transform.py │ │ ├── interpolate.py │ │ ├── kernel_attention.py │ │ ├── lambda_layer.py │ │ ├── layer_norm.py │ │ ├── layer_scale.py │ │ ├── llama.py │ │ ├── lp_pool1d.py │ │ ├── lp_pool2d.py │ │ ├── lp_pool3d.py │ │ ├── masked_lm.py │ │ ├── masked_softmax.py │ │ ├── masking.py │ │ ├── matmul_with_margin.py │ │ ├── max_pool1d.py │ │ ├── max_pool2d.py │ │ ├── max_pool3d.py │ │ ├── maximum.py │ │ ├── maxout.py │ │ ├── minimum.py │ │ ├── ml_decoder.py │ │ ├── mlp.py │ │ ├── multi_cls_heads.py │ │ ├── multichannel_attention.py │ │ ├── multihead_attention.py │ │ ├── multiheadrelative_attention.py │ │ ├── multiply.py │ │ ├── non_local_attn.py │ │ ├── norm.py │ │ ├── patch_dropout.py │ │ ├── perdimscale_attention.py │ │ ├── permute.py │ │ ├── pos_embed.py │ │ ├── pos_embed_sincos.py │ │ ├── position_embedding.py │ │ ├── repeat_vector.py │ │ ├── reshape.py │ │ ├── reuse_multihead_attention.py │ │ ├── reversible_residual.py │ │ ├── router.py │ │ ├── select_topk.py │ │ ├── selective_kernel.py │ │ ├── self_attention_mask.py │ │ ├── separable_conv1d.py │ │ ├── separable_conv2d.py │ │ ├── softmax.py │ │ ├── space_to_depth.py │ │ ├── spatial_dropout1d.py │ │ ├── spatial_dropout2d.py │ │ ├── spatial_dropout3d.py │ │ ├── spectral_norm.py │ │ ├── split_attn.py │ │ ├── squeeze_excite.py │ │ ├── stochastic_depth.py │ │ ├── subtract.py │ │ ├── talking_heads_attention.py │ │ ├── thresholded_relu.py │ │ ├── two_stream_relative_attention.py │ │ ├── 
unfold.py │ │ ├── unit_norm.py │ │ ├── up_sampling1d.py │ │ ├── up_sampling2d.py │ │ ├── up_sampling3d.py │ │ ├── vector_quantizer.py │ │ ├── vision_transformer.py │ │ ├── voting_attention.py │ │ ├── zeropadding1d.py │ │ ├── zeropadding2d.py │ │ └── zeropadding3d.py │ ├── lr_finder.py │ ├── nan_to_num.py │ ├── narrow.py │ ├── opt_finder.py │ ├── optimizer │ │ ├── a2grad.py │ │ ├── accsgd.py │ │ ├── adabelief.py │ │ ├── adabound.py │ │ ├── adaboundw.py │ │ ├── adafactor_bv.py │ │ ├── adagc.py │ │ ├── adahessian.py │ │ ├── adai.py │ │ ├── adaiv2.py │ │ ├── adalite.py │ │ ├── adam_mini.py │ │ ├── adamax.py │ │ ├── adamg.py │ │ ├── adamod.py │ │ ├── adamp.py │ │ ├── adan.py │ │ ├── adanorm.py │ │ ├── adapnm.py │ │ ├── adashift.py │ │ ├── adasmooth.py │ │ ├── ademamix.py │ │ ├── adopt.py │ │ ├── aggmo.py │ │ ├── aida.py │ │ ├── alig.py │ │ ├── amos.py │ │ ├── apollo.py │ │ ├── asgd.py │ │ ├── avagrad.py │ │ ├── base_optimizer.py │ │ ├── came.py │ │ ├── dadaptadagrad.py │ │ ├── dadaptadam.py │ │ ├── dadaptadan.py │ │ ├── dadaptlion.py │ │ ├── dadaptsgd.py │ │ ├── diffgrad.py │ │ ├── exadam.py │ │ ├── fadam.py │ │ ├── fira.py │ │ ├── focus.py │ │ ├── fromage.py │ │ ├── galore.py │ │ ├── galore_projector.py │ │ ├── grams.py │ │ ├── gravity.py │ │ ├── grokfast.py │ │ ├── kate.py │ │ ├── kron.py │ │ ├── lamb.py │ │ ├── laprop.py │ │ ├── lars.py │ │ ├── lomo.py │ │ ├── lookahead.py │ │ ├── madgrad.py │ │ ├── mars.py │ │ ├── msvag.py │ │ ├── muon.py │ │ ├── nadam.py │ │ ├── nadamw.py │ │ ├── nero.py │ │ ├── nvnovograd.py │ │ ├── optimizer.py │ │ ├── orthograd.py │ │ ├── padam.py │ │ ├── parallel │ │ │ ├── adabelief.py │ │ │ ├── adabound.py │ │ │ ├── adaboundw.py │ │ │ ├── adalite.py │ │ │ ├── adamod.py │ │ │ ├── adamp.py │ │ │ └── radam.py │ │ ├── pcgrad.py │ │ ├── pid.py │ │ ├── pnm.py │ │ ├── prodigy.py │ │ ├── qhadam.py │ │ ├── qhm.py │ │ ├── racs.py │ │ ├── radam.py │ │ ├── ranger.py │ │ ├── ranger2020.py │ │ ├── ranger21.py │ │ ├── ranger25.py │ │ ├── rangerqh.py │ │ ├── 
rangerva.py │ │ ├── sam.py │ │ ├── scion.py │ │ ├── sgdp.py │ │ ├── sgdw.py │ │ ├── shampoo.py │ │ ├── signsgd.py │ │ ├── sm3.py │ │ ├── soap.py │ │ ├── sophia.py │ │ ├── spam.py │ │ ├── srmm.py │ │ ├── swats.py │ │ ├── tam.py │ │ ├── tiger.py │ │ ├── trac.py │ │ └── yogi.py │ ├── pairwise_distance.py │ ├── parallel │ │ ├── assign_device.py │ │ ├── assign_device_pytorch.py │ │ └── optimizer.py │ ├── parallel_finder.py │ ├── parallel_finder_rl.py │ ├── parameter.py │ ├── pos_embed.py │ ├── positional_encoding.py │ ├── restore.py │ ├── scaled_dot_product_attention.py │ ├── softplus.py │ ├── solve_triangular.py │ └── sparse_mask.py ├── sr.py └── version.py └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
class animate_agent:
    """Roll out an agent in an environment and animate the rendered frames.

    Args:
        agent: Object exposing ``nn(state)`` (its arg-max is used as the
            action) or, when it carries a non-``None`` ``noise`` attribute,
            ``actor(state)``.
        env: Environment providing ``reset``, ``step`` and ``render``.
        platform: ``'tf'`` or ``'pytorch'``; selects how an actor output is
            converted to a NumPy array.
    """

    def __init__(self,agent,env,platform='tf'):
        self.agent=agent
        self.env=env
        self.platform=platform


    def run_agent(self, max_steps, seed=None):
        """Run one episode of at most ``max_steps`` steps.

        Args:
            max_steps: Upper bound on the number of environment steps.
            seed: Optional seed forwarded to ``env.reset(seed=...)``.

        Returns:
            Tuple ``(state_history, total_reward, steps)`` where
            ``state_history`` lists the state observed before each step.

        Raises:
            ValueError: If ``platform`` is neither ``'tf'`` nor ``'pytorch'``.
        """
        if self.platform not in ('tf', 'pytorch'):
            raise ValueError('unsupported platform: {0}'.format(self.platform))

        state_history = []

        steps = 0
        reward_ = 0
        # The environment is stored on this object as ``self.env``.
        if seed is None:
            state = self.env.reset()
        else:
            state = self.env.reset(seed=seed)

        # The noise marker belongs to the agent; this wrapper never defines
        # one itself, so testing ``self`` could never select the actor path.
        # NOTE(review): assumes agents without exploration noise either lack
        # the attribute or set it to None -- confirm against nn.RL.
        use_actor = getattr(self.agent, 'noise', None) is not None

        for _ in range(max_steps):
            if not use_actor:
                action = np.argmax(self.agent.nn(state))
            elif self.platform=='tf':
                action = self.agent.actor(state).numpy()
            else:
                action = self.agent.actor(state).detach().numpy()
            next_state, reward, done, _ = self.env.step(action)
            state_history.append(state)
            steps+=1
            reward_+=reward
            if done:
                break
            state = next_state

        return state_history,reward_,steps


    def __call__(self, max_steps, mode='rgb_array', save_path=None, fps=None, writer='imagemagick'):
        """Run an episode, show an animation and optionally save it.

        Args:
            max_steps: Upper bound on the number of environment steps.
            mode: Render mode forwarded to ``env.render``.
            save_path: If given, the animation is written to this path.
            fps: Frames per second forwarded to ``Animation.save``.
            writer: Writer name forwarded to ``Animation.save``.
        """
        state_history,reward,steps = self.run_agent(max_steps)

        fig = plt.figure()
        ax = fig.add_subplot()
        self.env.reset()
        img = ax.imshow(self.env.render(mode=mode))

        # NOTE(review): each frame re-renders the environment's current state;
        # the recorded states only determine the number of frames -- confirm
        # this is the intended playback behaviour.
        def update(frame):
            img.set_array(self.env.render(mode=mode))
            return [img]

        ani = animation.FuncAnimation(fig, update, frames=state_history, blit=True)
        plt.show()

        print('steps:{0}'.format(steps))
        print('reward:{0}'.format(reward))

        if save_path is not None:
            ani.save(save_path, writer=writer, fps=fps)
        return
class GaussianWhiteNoiseProcess(AnnealedGaussianProcess):
    """Independent Gaussian noise whose standard deviation is annealed."""

    def __init__(self, mu=0., sigma=1., sigma_min=None, n_steps_annealing=1000, size=1):
        super().__init__(
            mu=mu,
            sigma=sigma,
            sigma_min=sigma_min,
            n_steps_annealing=n_steps_annealing,
        )
        self.size = size

    def sample(self):
        """Draw ``size`` normal values at the current sigma and advance the step counter."""
        draw = np.random.normal(self.mu, self.current_sigma, self.size)
        self.n_steps += 1
        return draw


# Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab
class OrnsteinUhlenbeckProcess(AnnealedGaussianProcess):
    """Temporally correlated noise following an Ornstein-Uhlenbeck update."""

    def __init__(self, theta, mu=0., sigma=1., dt=1e-2, size=1, sigma_min=None, n_steps_annealing=1000):
        super().__init__(
            mu=mu,
            sigma=sigma,
            sigma_min=sigma_min,
            n_steps_annealing=n_steps_annealing,
        )
        self.theta = theta
        self.mu = mu
        self.dt = dt
        self.size = size
        self.reset_states()

    def sample(self):
        """Advance the process by one ``dt`` step and return the new value."""
        previous = self.x_prev
        # Pull towards ``mu`` scaled by ``theta`` over the time step.
        drift = self.theta * (self.mu - previous) * self.dt
        # Random kick scaled by the annealed sigma and sqrt(dt).
        diffusion = self.current_sigma * np.sqrt(self.dt) * np.random.normal(size=self.size)
        current = previous + drift + diffusion
        self.x_prev = current
        self.n_steps += 1
        return current

    def reset_states(self):
        """Re-draw the stored previous value from a normal at the current sigma."""
        self.x_prev = np.random.normal(self.mu, self.current_sigma, self.size)
class Model(nn.Model):
    """Convolutional classifier assembled as a single ``nn.Sequential`` stack."""

    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential()
        # Alias the bound method so the stack reads as a plain list of layers.
        push = self.layers.add
        # Two convolution + pooling stages.
        push(nn.conv2d(32, 3, activation='relu'))
        push(nn.max_pool2d())
        push(nn.conv2d(64, 3, activation='relu'))
        push(nn.max_pool2d())
        # Flatten, then the dense head ending in 10 outputs.
        push(nn.flatten())
        push(nn.dense(64, activation='relu'))
        push(nn.dense(10))

    def __call__(self, x):
        """Run ``x`` through the layer stack and return its output."""
        stack = self.layers
        return stack(x)
class Model(nn.Model):
    """Small CNN whose ``train_step`` hands the gradient tape to the optimizer.

    The example targets Note's Adahessian optimizer, whose
    ``apply_gradients`` is called with the tape as a second argument.
    """

    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential()
        self.layers.add(nn.conv2d(32, 3, activation='relu'))
        self.layers.add(nn.max_pool2d())
        self.layers.add(nn.conv2d(64, 3, activation='relu'))
        self.layers.add(nn.max_pool2d())
        self.layers.add(nn.flatten())
        self.layers.add(nn.dense(64, activation='relu'))
        self.layers.add(nn.dense(10))

    def __call__(self, x):
        """Return the output of the layer stack for input ``x``."""
        return self.layers(x)

    @tf.function(jit_compile=True)
    def train_step(self, train_data, labels, loss_object, train_loss, train_accuracy, optimizer):
        """Run one optimisation step.

        Args:
            train_data: Input batch passed to the model.
            labels: Targets passed to ``loss_object`` and ``train_accuracy``.
            loss_object: Callable ``(labels, output) -> loss``.
            train_loss: Callable fed the loss value (e.g. a running metric).
            train_accuracy: Optional callable ``(labels, output) -> accuracy``;
                pass ``None`` to skip it.
            optimizer: Optimizer whose ``apply_gradients`` accepts the
                gradient/parameter pairs and the tape.

        Returns:
            ``(loss, acc)``, with ``acc`` set to ``None`` when no accuracy
            callable was given.
        """
        with tf.GradientTape() as tape:
            output = self.__call__(train_data)
            loss = loss_object(labels, output)
        gradients = tape.gradient(loss, self.param)
        # The tape is forwarded as well; this optimizer expects it.
        optimizer.apply_gradients(zip(gradients, self.param), tape)
        train_loss(loss)
        # Identity test: "no metric" is signalled by the None singleton.
        if train_accuracy is not None:
            acc=train_accuracy(labels, output)
            return loss,acc
        return loss,None
class Model(nn.Model):
    """Small CNN whose ``train_step`` obtains its gradients through PCGrad."""

    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential()
        self.layers.add(nn.conv2d(32, 3, activation='relu'))
        self.layers.add(nn.max_pool2d())
        self.layers.add(nn.conv2d(64, 3, activation='relu'))
        self.layers.add(nn.max_pool2d())
        self.layers.add(nn.flatten())
        self.layers.add(nn.dense(64, activation='relu'))
        self.layers.add(nn.dense(10))
        self.pcgrad = PCGrad()
        # self.pcgrad = PPCGrad()

    def __call__(self, x):
        """Return the output of the layer stack for input ``x``."""
        return self.layers(x)

    @tf.function(jit_compile=True)
    def train_step(self, train_data, labels, loss_object, train_loss, train_accuracy, optimizer):
        """Run one optimisation step using ``pcgrad.pc_backward`` for gradients.

        Args:
            train_data: Input batch passed to the model.
            labels: Targets passed to ``loss_object`` and ``train_accuracy``.
            loss_object: Callable ``(labels, output) -> losses``.
            train_loss: Callable applied to the losses; its result is returned.
            train_accuracy: Optional callable ``(labels, output) -> accuracy``;
                pass ``None`` to skip it.
            optimizer: Optimizer whose ``apply_gradients`` accepts the
                gradient/parameter pairs and the tape.

        Returns:
            ``(loss, acc)``, with ``acc`` set to ``None`` when no accuracy
            callable was given.
        """
        # Persistent tape: it is handed to pc_backward and to the optimizer.
        with tf.GradientTape(persistent=True) as tape:
            output = self.__call__(train_data)
            losses = loss_object(labels, output)
        gradients = self.pcgrad.pc_backward(tape, losses, self.param)
        optimizer.apply_gradients(zip(gradients, self.param), tape)
        loss = train_loss(losses)
        # Identity test: "no metric" is signalled by the None singleton.
        if train_accuracy is not None:
            acc=train_accuracy(labels, output)
            return loss,acc
        return loss,None
class Model(nn.Model):
    """Small CNN whose ``train_step`` uses the optimizer's fused backward pass.

    The example targets Note's LOMO / AdaLOMO optimizers, which expose
    ``clip_grad_norm``, ``grad_norm``, ``fused_backward`` and ``lr``.
    """

    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential()
        self.layers.add(nn.conv2d(32, 3, activation='relu'))
        self.layers.add(nn.max_pool2d())
        self.layers.add(nn.conv2d(64, 3, activation='relu'))
        self.layers.add(nn.max_pool2d())
        self.layers.add(nn.flatten())
        self.layers.add(nn.dense(64, activation='relu'))
        self.layers.add(nn.dense(10))

    def __call__(self, x):
        """Return the output of the layer stack for input ``x``."""
        return self.layers(x)

    @tf.function(jit_compile=True)
    def train_step(self, train_data, labels, loss_object, train_loss, train_accuracy, optimizer):
        """Run one optimisation step through ``optimizer.fused_backward``.

        Args:
            train_data: Input batch passed to the model.
            labels: Targets passed to ``loss_object`` and ``train_accuracy``.
            loss_object: Callable ``(labels, output) -> loss``.
            train_loss: Callable applied to the loss; its result is returned.
            train_accuracy: Optional callable ``(labels, output) -> accuracy``;
                pass ``None`` to skip it.
            optimizer: Optimizer providing ``clip_grad_norm``, ``grad_norm``,
                ``fused_backward`` and ``lr``.

        Returns:
            ``(loss, acc)``, with ``acc`` set to ``None`` when no accuracy
            callable was given.
        """
        # Persistent tape: it may be used by both grad_norm and fused_backward.
        with tf.GradientTape(persistent=True) as tape:
            output = self.__call__(train_data)
            loss = loss_object(labels, output)
        # The norm pass only runs when clipping is enabled with a positive value.
        if optimizer.clip_grad_norm is not None and optimizer.clip_grad_norm > 0.0:
            optimizer.grad_norm(tape, loss, self.param)
        optimizer.fused_backward(tape, loss, self.param, lr=optimizer.lr)
        loss = train_loss(loss)
        # Identity test: "no metric" is signalled by the None singleton.
        if train_accuracy is not None:
            acc=train_accuracy(labels, output)
            return loss,acc
        return loss,None
class DDPG(nn.RL):
    """DDPG agent on ``Pendulum-v1`` with actor/critic networks and target copies."""

    def __init__(self,hidden_dim,sigma,gamma,tau):
        super().__init__()
        self.env=gym.make('Pendulum-v1')
        # Network sizes are read from the environment's spaces.
        obs_size=self.env.observation_space.shape[0]
        act_size=self.env.action_space.shape[0]
        act_limit=self.env.action_space.high[0]
        self.actor=actor(obs_size,hidden_dim,act_size,act_limit)
        self.critic=critic(obs_size,hidden_dim,act_size)
        self.target_actor=actor(obs_size,hidden_dim,act_size,act_limit)
        self.target_critic=critic(obs_size,hidden_dim,act_size)
        # Start the target networks from the online networks' weights.
        nn.assign_param(self.target_actor.weights,self.actor.weights)
        nn.assign_param(self.target_critic.weights,self.critic.weights)
        self.param=[self.actor.weights,self.critic.weights]
        self.sigma=sigma
        self.gamma=gamma
        self.tau=tau

    def action(self,s):
        """Return the online actor's output for state ``s``."""
        return self.actor(s)

    def __call__(self,s,a,next_s,r,d):
        """Return the sum of the actor loss and the critic loss for a batch."""
        taken=tf.expand_dims(a,axis=1)
        # Bootstrapped target from the target networks.
        next_action=self.target_actor(next_s)
        next_q_value=self.target_critic(next_s,next_action)
        reward=tf.cast(r,'float32')
        not_done=1-tf.cast(d,'float32')
        q_target=reward+self.gamma*next_q_value*not_done
        # Actor is trained to maximise the critic's value of its own action.
        policy_q=self.critic(s,self.actor(s))
        actor_loss=-tf.reduce_mean(policy_q)
        # Critic regresses onto the target.
        td_error=self.critic(s,taken)-q_target
        critic_loss=tf.reduce_mean(td_error**2)
        return actor_loss+critic_loss

    def update_param(self):
        """Move both target networks towards the online networks by ``tau``."""
        pairs=((self.target_actor,self.actor),(self.target_critic,self.critic))
        for target_net,online_net in pairs:
            for slow,fast in zip(target_net.weights,online_net.weights):
                slow.assign(slow*(1.0-self.tau)+fast*self.tau)
        return
class Qnet(Model):
    """Two-layer dense network mapping a state to one value per action."""

    def __init__(self,state_dim, hidden_dim, action_dim):
        super().__init__()
        self.model = Sequential()
        self.model.add(tf.keras.layers.Dense(hidden_dim, input_shape=(state_dim,), activation='relu'))
        self.model.add(tf.keras.layers.Dense(action_dim))

    def __call__(self,x):
        x = self.model(x)
        return x


class DQN(nn.RL):
    """DQN agent on ``CartPole-v0`` with an online and a target Q-network.

    Args:
        state_dim: Size of the state vector.
        hidden_dim: Width of the hidden layer.
        action_dim: Number of discrete actions.
        gamma: Discount factor used in the TD target (default ``0.98``, the
            value that was previously hard-coded).
    """

    def __init__(self,state_dim,hidden_dim,action_dim,gamma=0.98):
        super().__init__()
        self.q_net=Qnet(state_dim,hidden_dim,action_dim)
        self.target_q_net=Qnet(state_dim,hidden_dim,action_dim)
        self.param=self.q_net.weights
        self.env=gym.make('CartPole-v0')
        self.gamma=gamma

    def action(self,s):
        """Return the online network's action values for state ``s``."""
        return self.q_net(s)

    def __call__(self,s,a,next_s,r,d):
        """Return the mean squared TD error for a batch of transitions."""
        a=tf.expand_dims(a,axis=1)
        # The gather keeps a trailing axis while reduce_max(axis=1) drops it;
        # subtracting the two directly broadcasts to a (batch, batch) matrix.
        # Flatten every per-sample quantity so the error is one value per row.
        q_value=tf.reshape(tf.gather(self.q_net(s),a,axis=1,batch_dims=1),[-1])
        next_q_value=tf.reduce_max(self.target_q_net(next_s),axis=1)
        reward=tf.reshape(tf.cast(r,'float32'),[-1])
        done=tf.reshape(tf.cast(d,'float32'),[-1])
        target=reward+self.gamma*next_q_value*(1-done)
        TD=(q_value-target)
        return tf.reduce_mean(TD**2)

    def update_param(self):
        """Copy the online network's weights into the target network."""
        nn.assign_param(self.target_q_net.weights,self.param)
        return
class DQN(nn.RL):
    """DQN agent on ``CartPole-v0`` that reports TD errors to prioritized replay.

    Args:
        state_dim: Size of the state vector.
        hidden_dim: Width of the hidden layer.
        action_dim: Number of discrete actions.
        gamma: Discount factor used in the TD target (default ``0.98``, the
            value that was previously hard-coded).
    """

    def __init__(self,state_dim,hidden_dim,action_dim,gamma=0.98):
        super().__init__()
        self.q_net=Qnet(state_dim,hidden_dim,action_dim)
        self.target_q_net=Qnet(state_dim,hidden_dim,action_dim)
        self.param=self.q_net.weights
        self.env=gym.make('CartPole-v0')
        self.gamma=gamma

    def action(self,s):
        """Return the online network's action values for state ``s``."""
        return self.q_net(s)

    def __call__(self,s,a,next_s,r,d):
        """Return the mean squared TD error and push the errors to the replay."""
        a=tf.expand_dims(a,axis=1)
        # The gather keeps a trailing axis while reduce_max(axis=1) drops it;
        # subtracting the two directly broadcasts to a (batch, batch) matrix.
        # Flatten every per-sample quantity so there is one TD error per row.
        q_value=tf.reshape(tf.gather(self.q_net(s),a,axis=1,batch_dims=1),[-1])
        next_q_value=tf.reduce_max(self.target_q_net(next_s),axis=1)
        reward=tf.reshape(tf.cast(r,'float32'),[-1])
        done=tf.reshape(tf.cast(d,'float32'),[-1])
        target=reward+self.gamma*next_q_value*(1-done)
        TD=(q_value-target)
        # NOTE(review): update_TD now receives a rank-1 tensor with one entry
        # per sampled transition -- confirm this matches its expectation.
        self.prioritized_replay.update_TD(TD)
        return tf.reduce_mean(TD**2)

    def update_param(self):
        """Copy the online network's weights into the target network."""
        nn.assign_param(self.target_q_net.weights,self.param)
        return
class PPO(nn.RL):
    """Clipped-surrogate PPO agent on ``CartPole-v0``.

    Args:
        state_dim: Size of the state vector.
        hidden_dim: Width of the hidden layers.
        action_dim: Number of discrete actions.
        clip_eps: Half-width of the probability-ratio clipping interval.
        alpha: Weight of the log-probability term added to the clipped loss.
        gamma: Discount factor used in the value target (default ``0.98``,
            the value that was previously hard-coded).
    """

    def __init__(self,state_dim,hidden_dim,action_dim,clip_eps,alpha,gamma=0.98):
        super().__init__()
        self.actor=actor(state_dim,hidden_dim,action_dim)
        self.actor_old=actor(state_dim,hidden_dim,action_dim)
        nn.assign_param(self.actor_old.weights,self.actor.weights)
        self.critic=critic(state_dim,hidden_dim)
        self.clip_eps=clip_eps
        self.alpha=alpha
        self.param=[self.actor.weights,self.critic.weights]
        self.env=gym.make('CartPole-v0')
        self.gamma=gamma

    def action(self,s):
        """Return the old policy's action probabilities for state ``s``."""
        return self.actor_old(s)

    def __call__(self,s,a,next_s,r,d):
        """Return the clipped policy loss plus the squared TD error, averaged."""
        a=tf.expand_dims(a,axis=1)
        # Keep every per-sample quantity as a (batch, 1) column so it lines up
        # with the critic's single-unit output instead of broadcasting against
        # rank-1 reward/done tensors into a (batch, batch) matrix.
        action_prob=tf.reshape(tf.gather(self.actor(s),a,axis=1,batch_dims=1),[-1,1])
        action_prob_old=tf.reshape(tf.gather(self.actor_old(s),a,axis=1,batch_dims=1),[-1,1])
        ratio=action_prob/action_prob_old
        reward=tf.reshape(tf.cast(r,'float32'),[-1,1])
        done=tf.reshape(tf.cast(d,'float32'),[-1,1])
        value=self.critic(s)
        value_tar=reward+self.gamma*self.critic(next_s)*(1-done)
        TD=value_tar-value
        sur1=ratio*TD
        sur2=tf.clip_by_value(ratio,clip_value_min=1-self.clip_eps,clip_value_max=1+self.clip_eps)*TD
        clip_loss=-tf.math.minimum(sur1,sur2)
        # NOTE(review): this term is p*log(p) for the taken action only;
        # confirm the sign and form match the intended entropy bonus.
        entropy=action_prob*tf.math.log(action_prob+1e-8)
        clip_loss=clip_loss-self.alpha*entropy
        return tf.reduce_mean(clip_loss)+tf.reduce_mean((TD)**2)

    def update_param(self):
        """Copy the current actor's weights into the old actor."""
        nn.assign_param(self.actor_old.weights, self.actor.weights)
        return
class DQN(nn.RL):
    """DQN agent that owns one CartPole-v0 environment per worker process.

    Args:
        state_dim: size of an observation vector.
        hidden_dim: width of the Q-network's hidden layer.
        action_dim: number of discrete actions.
        processes: number of environments to create.
    """

    def __init__(self, state_dim, hidden_dim, action_dim, processes):
        super().__init__()
        self.q_net = Qnet(state_dim, hidden_dim, action_dim)
        self.target_q_net = Qnet(state_dim, hidden_dim, action_dim)
        self.param = self.q_net.weights
        self.env = [gym.make('CartPole-v0') for _ in range(processes)]

    def action(self, s):
        """Return the online network's Q-values for the state batch ``s``."""
        return self.q_net(s)

    def __call__(self, s, a, next_s, r, d):
        """Return the mean squared one-step TD error of a batch of transitions."""
        a = tf.expand_dims(a, axis=1)
        # The gathered Q-values come back as [B, 1]; flatten all terms to [B]
        # so the subtraction below is element-wise instead of a [B, B] broadcast.
        q_value = tf.reshape(
            tf.gather(self.q_net(s), a, axis=1, batch_dims=1), [-1])
        next_q_value = tf.reduce_max(self.target_q_net(next_s), axis=1)
        r = tf.reshape(tf.cast(r, 'float32'), [-1])
        d = tf.reshape(tf.cast(d, 'float32'), [-1])
        # Terminal transitions (d == 1) do not bootstrap from the next state.
        target = r + 0.98 * next_q_value * (1 - d)
        TD = q_value - target
        return tf.reduce_mean(TD ** 2)

    def update_param(self):
        """Copy the online network's weights into the target network."""
        nn.assign_param(self.target_q_net.weights, self.param)
        return
class DDPG(nn.RL):
    """DDPG agent for Pendulum-v1 with soft-updated target actor and critic.

    Args:
        hidden_dim: width of the hidden layer of the actor and the critic.
        sigma: stored on the instance; not used in this class
            (presumably exploration noise scale -- TODO confirm).
        gamma: discount factor of the TD target.
        tau: interpolation factor of the soft target update.
    """

    def __init__(self, hidden_dim, sigma, gamma, tau):
        super().__init__()
        self.env = gym.make('Pendulum-v1')
        state_dim = self.env.observation_space.shape[0]
        action_dim = self.env.action_space.shape[0]
        action_bound = self.env.action_space.high[0]
        self.actor = actor(state_dim, hidden_dim, action_dim, action_bound)
        self.critic = critic(state_dim, hidden_dim, action_dim)
        self.target_actor = actor(state_dim, hidden_dim, action_dim, action_bound)
        self.target_critic = critic(state_dim, hidden_dim, action_dim)
        # Target networks start as exact copies of the trained networks.
        nn.assign_param(self.target_actor.param, self.actor.param)
        nn.assign_param(self.target_critic.param, self.critic.param)
        self.param = [self.actor.param, self.critic.param]
        self.sigma = sigma
        self.gamma = gamma
        self.tau = tau

    def action(self, s):
        """Return the deterministic action of the current actor for ``s``."""
        return self.actor(s)

    def __call__(self, s, a, next_s, r, d):
        """Return the sum of the actor loss and the critic's squared TD error."""
        a = tf.expand_dims(a, axis=1)
        next_q_value = self.target_critic(next_s, self.target_actor(next_s))
        # The critic returns a 2-D tensor; make rewards/done flags column
        # vectors so a [B]-shaped r does not broadcast the target to [B, B].
        r = tf.reshape(tf.cast(r, 'float32'), [-1, 1])
        d = tf.reshape(tf.cast(d, 'float32'), [-1, 1])
        q_target = r + self.gamma * next_q_value * (1 - d)
        actor_loss = -tf.reduce_mean(self.critic(s, self.actor(s)))
        critic_loss = tf.reduce_mean((self.critic(s, a) - q_target) ** 2)
        return actor_loss + critic_loss

    def update_param(self):
        """Soft-update both target networks: target <- (1 - tau) * target + tau * online."""
        network_pairs = (
            (self.target_actor, self.actor),
            (self.target_critic, self.critic),
        )
        for target_net, online_net in network_pairs:
            for target_param, param in zip(target_net.param, online_net.param):
                target_param.assign(
                    target_param * (1.0 - self.tau) + param * self.tau)
        return
class DQN(nn.RL):
    """DQN agent for CartPole-v0 with an online and a target Q-network.

    Args:
        state_dim: size of an observation vector.
        hidden_dim: width of the Q-network's hidden layer.
        action_dim: number of discrete actions.
    """

    def __init__(self, state_dim, hidden_dim, action_dim):
        super().__init__()
        self.q_net = Qnet(state_dim, hidden_dim, action_dim)
        self.target_q_net = Qnet(state_dim, hidden_dim, action_dim)
        self.param = self.q_net.param
        self.env = gym.make('CartPole-v0')

    def action(self, s):
        """Return the online network's Q-values for the state batch ``s``."""
        return self.q_net(s)

    def __call__(self, s, a, next_s, r, d):
        """Return the mean squared one-step TD error of a batch of transitions."""
        a = tf.expand_dims(a, axis=1)
        # The gathered Q-values come back as [B, 1]; flatten all terms to [B]
        # so the subtraction below is element-wise instead of a [B, B] broadcast.
        q_value = tf.reshape(
            tf.gather(self.q_net(s), a, axis=1, batch_dims=1), [-1])
        next_q_value = tf.reduce_max(self.target_q_net(next_s), axis=1)
        r = tf.reshape(tf.cast(r, 'float32'), [-1])
        d = tf.reshape(tf.cast(d, 'float32'), [-1])
        # Terminal transitions (d == 1) do not bootstrap from the next state.
        target = r + 0.98 * next_q_value * (1 - d)
        TD = q_value - target
        return tf.reduce_mean(TD ** 2)

    def update_param(self):
        """Copy the online network's parameters into the target network."""
        nn.assign_param(self.target_q_net.param, self.param)
        return
class actor(nn.Model):
    """Policy network: one ReLU hidden layer followed by a softmax over actions."""

    def __init__(self, state_dim, hidden_dim, action_dim):
        super().__init__()
        self.dense1 = nn.dense(hidden_dim, state_dim, activation='relu')
        self.dense2 = nn.dense(action_dim, hidden_dim)

    def __call__(self, x):
        """Return the action probabilities for the state batch ``x``."""
        hidden = self.dense1(x)
        logits = self.dense2(hidden)
        return tf.nn.softmax(logits)
class NoisyLinear:
    """Linear layer whose weights and bias are perturbed by factorised noise on every call.

    Args:
        out_dim: number of output features.
        in_dim: number of input features.
        sigma_init: initial value of every noise-scale parameter.
    """

    def __init__(self, out_dim, in_dim, sigma_init=0.017):
        self.in_dim = in_dim
        self.out_dim = out_dim

        # Means are drawn uniformly from [-1/sqrt(in_dim), 1/sqrt(in_dim)].
        bound = 1 / np.sqrt(in_dim)
        self.mu_w = nn.initializer([in_dim, out_dim], ['uniform', -bound, bound])
        self.sigma_w = nn.Parameter(tf.ones([in_dim, out_dim]) * sigma_init)
        self.mu_b = nn.initializer([out_dim], ['uniform', -bound, bound])
        self.sigma_b = nn.Parameter(tf.ones([out_dim]) * sigma_init)

    def __call__(self, x):
        """Apply the layer to ``x`` using freshly sampled noise."""
        eps_in = self._scale_noise(self.in_dim)
        eps_out = self._scale_noise(self.out_dim)

        # The weight noise is the outer product of the input and output noise.
        noisy_weight = self.mu_w + self.sigma_w * tf.einsum('i,j->ij', eps_in, eps_out)
        noisy_bias = self.mu_b + self.sigma_b * eps_out

        return tf.matmul(x, noisy_weight) + noisy_bias

    def _scale_noise(self, size):
        """Return ``size`` samples of sign(z) * sqrt(|z|) with z ~ N(0, 1)."""
        z = tf.random.normal([size])
        return tf.multiply(tf.sign(z), tf.sqrt(tf.abs(z)))
class Rainbow(nn.RL):
    """Double-DQN style agent on a dueling noisy network with prioritized replay.

    Args:
        state_dim: size of an observation vector.
        hidden_dim: width of the hidden layer of ``VAnet``.
        action_dim: number of discrete actions.
    """

    def __init__(self, state_dim, hidden_dim, action_dim):
        super().__init__()
        self.va_net = VAnet(state_dim, hidden_dim, action_dim)
        self.target_q_net = VAnet(state_dim, hidden_dim, action_dim)
        self.param = self.va_net.param
        self.genv = gym.make('CartPole-v0')
        # The sibling examples expose the environment as ``env``; keep both
        # names so either lookup works.
        self.env = self.genv

    def action(self, s):
        """Return the online network's Q-values for the state batch ``s``."""
        return self.va_net(s)

    def loss(self, s, a, next_s, r, d):
        """Return the mean squared TD error and report per-sample TD errors.

        ``self.prioritized_replay`` is not created here; it is presumably
        provided by ``nn.RL`` -- TODO confirm.
        """
        a = tf.expand_dims(a, axis=1)
        # Flatten every term to [B] so the TD error is one value per sample.
        q_value = tf.reshape(
            tf.gather(self.va_net(s), a, axis=1, batch_dims=1), [-1])
        # Double DQN: the online network picks the best action in the NEXT
        # state and the target network evaluates it.
        max_action = tf.expand_dims(
            tf.argmax(self.va_net(next_s), axis=1), axis=1)
        next_q_value = tf.reshape(
            tf.gather(self.target_q_net(next_s), max_action, axis=1, batch_dims=1),
            [-1])
        r = tf.reshape(tf.cast(r, 'float32'), [-1])
        d = tf.reshape(tf.cast(d, 'float32'), [-1])
        # Terminal transitions (d == 1) do not bootstrap from the next state.
        target = r + 0.98 * next_q_value * (1 - d)
        TD = q_value - target
        self.prioritized_replay.update_TD(TD)
        return tf.reduce_mean(TD ** 2)

    def __call__(self, s, a, next_s, r, d):
        """Alias of :meth:`loss`, matching the sibling agents' loss entry point."""
        return self.loss(s, a, next_s, r, d)

    def update_param(self):
        """Copy the online network's parameters into the target network."""
        nn.assign_param(self.target_q_net.param, self.param)
        return
class DQN(nn.RL):
    """DQN agent with prioritized replay and one CartPole-v0 environment per process.

    Args:
        state_dim: size of an observation vector.
        hidden_dim: width of the Q-network's hidden layer.
        action_dim: number of discrete actions.
        processes: number of environments to create.
    """

    def __init__(self, state_dim, hidden_dim, action_dim, processes):
        super().__init__()
        self.q_net = Qnet(state_dim, hidden_dim, action_dim)
        self.target_q_net = Qnet(state_dim, hidden_dim, action_dim)
        self.param = self.q_net.param
        self.env = [gym.make('CartPole-v0') for _ in range(processes)]

    def action(self, s):
        """Return the online network's Q-values for the state batch ``s``."""
        return self.q_net(s)

    def __call__(self, s, a, next_s, r, d):
        """Return the mean squared TD error and report per-sample TD errors.

        ``self.prioritized_replay`` is not created here; it is presumably
        provided by ``nn.RL`` -- TODO confirm.
        """
        a = tf.expand_dims(a, axis=1)
        # The gathered Q-values come back as [B, 1]; flatten all terms to [B]
        # so the subtraction below is element-wise instead of a [B, B] broadcast.
        q_value = tf.reshape(
            tf.gather(self.q_net(s), a, axis=1, batch_dims=1), [-1])
        next_q_value = tf.reduce_max(self.target_q_net(next_s), axis=1)
        r = tf.reshape(tf.cast(r, 'float32'), [-1])
        d = tf.reshape(tf.cast(d, 'float32'), [-1])
        # Terminal transitions (d == 1) do not bootstrap from the next state.
        target = r + 0.98 * next_q_value * (1 - d)
        TD = q_value - target
        # Priorities are driven by the TD error, not by the bootstrap target.
        self.prioritized_replay.update_TD(TD)
        return tf.reduce_mean(TD ** 2)

    def update_param(self):
        """Copy the online network's parameters into the target network."""
        nn.assign_param(self.target_q_net.param, self.param)
        return
class Qnet(torch.nn.Module):
    """Two-layer MLP mapping a state vector to one Q-value per action."""

    def __init__(self, state_dim, hidden_dim, action_dim):
        super().__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        """Return the Q-values for the state batch ``x``."""
        hidden = F.relu(self.fc1(x))
        q_values = self.fc2(hidden)
        return q_values
class DoubleDQN(nn.RL_pytorch):
    """Double DQN agent for CartPole-v0 implemented with PyTorch.

    Args:
        state_dim: size of an observation vector.
        hidden_dim: width of the Q-network's hidden layer.
        action_dim: number of discrete actions.
    """

    def __init__(self, state_dim, hidden_dim, action_dim):
        super().__init__()
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.q_net = Qnet(state_dim, hidden_dim, action_dim).to(self.device)
        self.target_q_net = Qnet(state_dim, hidden_dim, action_dim).to(self.device)
        self.param = self.q_net.parameters()
        self.env = gym.make('CartPole-v0')

    def action(self, s):
        """Return the online network's Q-values for the state batch ``s``."""
        return self.q_net(s)

    def __call__(self, s, a, next_s, r, d):
        """Return the MSE between the taken actions' Q-values and the double-DQN target."""
        s = torch.tensor(s, dtype=torch.float).to(self.device)
        a = torch.tensor(a, dtype=torch.int64).view(-1, 1).to(self.device)
        next_s = torch.tensor(next_s, dtype=torch.float).to(self.device)
        r = torch.tensor(r, dtype=torch.float).view(-1, 1).to(self.device)
        d = torch.tensor(d, dtype=torch.float).view(-1, 1).to(self.device)
        q_value = self.q_net(s).gather(1, a)
        # The online network selects the action, the target network evaluates it.
        max_action = self.q_net(next_s).max(1)[1].view(-1, 1)
        next_q_value = self.target_q_net(next_s).gather(1, max_action)
        target = r + 0.98 * next_q_value * (1 - d)
        return F.mse_loss(q_value, target)

    def update_param(self):
        """Copy the online network's weights into the target network."""
        # The online network is stored as ``q_net``; no ``nn`` attribute is
        # set in this class.
        self.target_q_net.load_state_dict(self.q_net.state_dict())
        return
class DQN(nn.RL_pytorch):
    """PyTorch DQN agent that owns one CartPole-v0 environment per process.

    Args:
        state_dim: size of an observation vector.
        hidden_dim: width of the Q-network's hidden layer.
        action_dim: number of discrete actions.
        processes: number of environments to create.
    """

    def __init__(self, state_dim, hidden_dim, action_dim, processes):
        super().__init__()
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.q_net = Qnet(state_dim, hidden_dim, action_dim).to(self.device)
        self.target_q_net = Qnet(state_dim, hidden_dim, action_dim).to(self.device)
        self.param = self.q_net.parameters()
        # One environment per process.
        self.env = [gym.make('CartPole-v0') for _ in range(processes)]

    def action(self, s):
        """Return the online network's Q-values for the state batch ``s``."""
        return self.q_net(s)

    def __call__(self, s, a, next_s, r, d):
        """Loss function: MSE between the taken actions' Q-values and the TD target."""
        s = torch.tensor(s, dtype=torch.float).to(self.device)
        a = torch.tensor(a, dtype=torch.int64).view(-1, 1).to(self.device)
        next_s = torch.tensor(next_s, dtype=torch.float).to(self.device)
        # q_value below is [B, 1]; keep r, d and next_q_value as column
        # vectors too, otherwise mse_loss broadcasts [B, 1] against [B]
        # into a [B, B] matrix of mismatched sample pairs.
        r = torch.tensor(r, dtype=torch.float).view(-1, 1).to(self.device)
        d = torch.tensor(d, dtype=torch.float).view(-1, 1).to(self.device)
        q_value = self.q_net(s).gather(1, a)
        next_q_value = self.target_q_net(next_s).max(1)[0].view(-1, 1)
        # Terminal transitions (d == 1) do not bootstrap from the next state.
        target = r + 0.98 * next_q_value * (1 - d)
        return F.mse_loss(q_value, target)

    def update_param(self):
        """Update function: copy the online network's weights into the target network."""
        self.target_q_net.load_state_dict(self.q_net.state_dict())
        return
class VAnet(torch.nn.Module):
    """Dueling network: a shared hidden layer feeding a value head and an advantage head."""

    def __init__(self, state_dim, hidden_dim, action_dim):
        super().__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        self.fc_A = torch.nn.Linear(hidden_dim, action_dim)
        self.fc_V = torch.nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return Q = V + A - mean(A) for a 2-D state batch ``x``."""
        # Evaluate the shared layer once and feed both heads from it.
        hidden = F.relu(self.fc1(x))
        A = self.fc_A(hidden)
        V = self.fc_V(hidden)
        # Subtracting the per-sample mean advantage makes the mean of Q over
        # actions equal to V.
        Q = V + A - A.mean(1, keepdim=True)
        return Q
class VGG19(nn.Model):
    """VGG19 network: five blocks of 3x3 convolutions, with an optional classifier head.

    Args:
        include_top: when True, apply flatten, two 4096-unit dense layers and
            the classification head; otherwise return (optionally pooled)
            convolutional features.
        pooling: ``"avg"``, ``"max"`` or ``None``; only used when
            ``include_top`` is False.
        classes: number of outputs of the classification head.
    """

    def __init__(self, include_top=True, pooling=None, classes=1000):
        super().__init__()
        self.include_top = include_top
        self.pooling = pooling
        self.classes = classes

        self.layers = nn.Sequential()
        # (filters, number of 3x3 convolutions) for blocks 1-5; every block
        # ends with a 2x2 max-pool of stride 2.
        conv_blocks = ((64, 2), (128, 2), (256, 4), (512, 4), (512, 4))
        first_conv = True
        for filters, repeats in conv_blocks:
            for _ in range(repeats):
                if first_conv:
                    # Only the first convolution receives the extra positional
                    # argument 3 (presumably the input channel count -- TODO confirm).
                    self.layers.add(nn.conv2d(filters, (3, 3), 3, activation="relu", padding="SAME"))
                    first_conv = False
                else:
                    self.layers.add(nn.conv2d(filters, (3, 3), activation="relu", padding="SAME"))
            self.layers.add(nn.max_pool2d((2, 2), strides=(2, 2), padding='VALID'))

        self.flatten = nn.flatten
        # 25088 is the flattened feature size the first dense layer expects
        # (7*7*512, i.e. presumably 224x224 inputs -- TODO confirm).
        self.dense1 = nn.dense(4096, 25088, activation='relu')
        self.dense2 = nn.dense(4096, self.dense1.output_size, activation='relu')
        self.head = self.dense(self.classes, self.dense2.output_size)

    def __call__(self, data):
        """Run the network on ``data`` and return logits or convolutional features."""
        x = self.layers(data)
        if self.include_top:
            x = self.flatten(x)
            x = self.dense1(x)
            x = self.dense2(x)
            x = self.head(x)
        else:
            # Pool the convolutional features (not the raw input) over the
            # two spatial axes so the requested pooling reaches the output.
            if self.pooling == "avg":
                x = tf.math.reduce_mean(x, axis=[1, 2])
            elif self.pooling == "max":
                x = tf.math.reduce_max(x, axis=[1, 2])
        return x
| if use_data==True and hasattr(layer,'save_data_count'): 18 | layer.save_data_count=self.save_data_count 19 | if use_data==True: 20 | self.save_data_count=0 21 | self.layer.append(layer) 22 | if hasattr(layer,'param'): 23 | self.param.extend(layer.param) 24 | if hasattr(layer,'output_size'): 25 | self.output_size=layer.output_size 26 | self.save_data_flag.append(save_data) 27 | self.use_data_flag.append(use_data) 28 | else: 29 | for layer in layer: 30 | self.layer.append(layer) 31 | if hasattr(layer,'param'): 32 | self.param.extend(layer.param) 33 | if hasattr(layer,'output_size'): 34 | self.output_size=layer.output_size 35 | return 36 | 37 | 38 | def __call__(self,data,training=True): 39 | for i,layer in enumerate(self.layer): 40 | if not hasattr(layer,'train_flag'): 41 | if len(self.use_data_flag)==0 or self.use_data_flag[i]==False: 42 | data=layer(data) 43 | else: 44 | if hasattr(layer,'save_data_count'): 45 | data=layer(self.saved_data) 46 | else: 47 | data=layer(data,self.saved_data.pop(0)) 48 | else: 49 | if len(self.use_data_flag)==0 or self.use_data_flag[i]==False: 50 | data=layer(data,training) 51 | else: 52 | if hasattr(layer,'save_data_count'): 53 | data=layer(self.saved_data,training) 54 | else: 55 | data=layer(data,self.saved_data.pop(0),training) 56 | if len(self.save_data_flag)>0 and self.save_data_flag[i]==True: 57 | self.saved_data.append(data) 58 | return data 59 | -------------------------------------------------------------------------------- /Note/nn/assign_param.py: -------------------------------------------------------------------------------- 1 | from tensorflow.python.ops import state_ops 2 | from tensorflow.python.util import nest 3 | 4 | 5 | def assign_param(param1,param2): 6 | parameter_flat1=nest.flatten(param1) 7 | parameter_flat2=nest.flatten(param2) 8 | for i in range(len(parameter_flat1)): 9 | state_ops.assign(parameter_flat1[i],parameter_flat2[i]) 10 | return 11 | 
def conv2d_func(input, weight, bias=None, strides=1, padding=0, dilations=1, groups=1):
    """Functional 2-D convolution with optional explicit padding and groups.

    Args:
        input: Input tensor. Channels are split along the last axis for
            grouped convolution, so a channels-last layout is presumed
            (NOTE(review): confirm against callers).
        weight: Filter tensor; for ``groups > 1`` it is split into ``groups``
            equal parts along its last axis.
        bias: Optional tensor added to the convolution output.
        strides: Passed to ``tf.nn.conv2d``.
        padding: Either a padding string understood by ``tf.nn.conv2d``, or
            a non-string value handed to ``nn.zeropadding2d``; in that case
            the convolution itself runs with ``'VALID'`` padding.
        dilations: Passed to ``tf.nn.conv2d``.
        groups: Number of channel groups.

    Returns:
        The convolution result, with ``bias`` added when given.
    """
    x = input
    if not isinstance(padding, str):
        # Explicit padding is applied up front, so the convolution must not
        # pad a second time.
        x = nn.zeropadding2d(padding=padding)(x)
        padding = 'VALID'
    if groups == 1:
        x = tf.nn.conv2d(x, weight, strides, padding, dilations=dilations)
    else:
        # Split the (possibly padded) input. The original split the raw
        # ``input`` here, which silently dropped the explicit padding.
        input_groups = tf.split(x, num_or_size_splits=groups, axis=-1)
        weight_groups = tf.split(weight, num_or_size_splits=groups, axis=-1)
        output_groups = [
            tf.nn.conv2d(group_in, group_w, strides, padding, dilations=dilations)
            for group_in, group_w in zip(input_groups, weight_groups)
        ]
        x = tf.concat(output_groups, axis=-1)
    # ``if bias:`` would evaluate the truth value of a tensor, which is
    # ambiguous for multi-element tensors; test for presence instead.
    if bias is not None:
        x = x + bias
    return x
def cosine_similarity(x1, x2, axis=1, eps=1e-8):
    """Return the cosine similarity of ``x1`` and ``x2`` along ``axis``.

    The product of the two squared norms is floored at ``eps * eps`` before
    the square root is taken, so the denominator never drops below ``eps``.
    """
    dot = tf.reduce_sum(tf.multiply(x1, x2), axis=axis)
    sq_norm1 = tf.reduce_sum(tf.multiply(x1, x1), axis=axis)
    sq_norm2 = tf.reduce_sum(tf.multiply(x2, x2), axis=axis)
    denom = tf.sqrt(clip(sq_norm1 * sq_norm2, eps * eps))
    return dot / denom


def clip(x, min):
    """Clamp ``x`` from below at ``min``.

    The upper bound depends on the dtype of ``x``: a value slightly below the
    integer maximum for int32/int64, the float16 maximum for float16, and the
    float32 maximum for every other dtype.
    """
    dtype = x.dtype
    # Fallback bound, used when none of the special-cased dtypes match.
    upper = float(np.finfo(np.float32).max)
    special_bounds = (
        (tf.int32, np.iinfo(np.int32).max - 2**7),
        (tf.int64, np.iinfo(np.int64).max - 2**39),
        (tf.float16, float(np.finfo(np.float16).max)),
    )
    for candidate, bound in special_bounds:
        if dtype == candidate:
            upper = bound
            break
    return tf.clip_by_value(x, min, upper)
def _ntuple(n):
    """Build a converter that turns its argument into a tuple.

    The returned function converts any non-string iterable with ``tuple()``
    (its length is kept, whatever ``n`` is) and repeats every other value,
    strings included, ``n`` times.
    """
    def parse(x):
        # Strings are iterable but are treated as scalars here.
        is_scalar = isinstance(x, str) or not isinstance(x, collections.abc.Iterable)
        if is_scalar:
            return tuple(repeat(x, n))
        return tuple(x)
    return parse


# Ready-made converters for the common arities.
to_1tuple = _ntuple(1)
to_2tuple = _ntuple(2)
to_3tuple = _ntuple(3)
to_4tuple = _ntuple(4)
to_ntuple = _ntuple
def interpolate(input, size=None, scale_factor=None, recompute_scale_factor=False, mode='nearest', align_corners=False, antialias=False):
    """Resize ``input`` to a target size or by a scale factor.

    Height and width are read from axes 1 and 2 of ``input``.

    Args:
        input: Tensor to resize.
        size: Target ``(height, width)``. Takes precedence over
            ``scale_factor`` when both are given.
        scale_factor: Multiplier applied to the current height and width
            when ``size`` is None.
        recompute_scale_factor: When true, ``scale_factor`` must be given;
            the target size is then re-derived from the per-axis ratio of
            new size to input size.
        mode: ``'bilinear'``, ``'nearest'`` and ``'bicubic'`` use the
            ``tf.compat.v1.image`` resizers (which honour
            ``align_corners``); any other value is passed to
            ``tf.image.resize`` as ``method``.
        align_corners: Forwarded to the ``tf.compat.v1.image`` resizers.
        antialias: Forwarded to ``tf.image.resize``.

    Returns:
        The resized tensor.

    Raises:
        ValueError: If neither ``size`` nor ``scale_factor`` is given, or if
            ``recompute_scale_factor`` is set without ``scale_factor``.
    """
    input_shape = tf.shape(input)

    # Work out the target size as floats first.
    if size is None and scale_factor is not None:
        new_size = tf.cast(input_shape[1:3], tf.float32) * scale_factor
    elif size is not None:
        new_size = tf.cast(size, tf.float32)
    else:
        raise ValueError("Either size or scale_factor must be defined.")

    if recompute_scale_factor:
        if scale_factor is None:
            raise ValueError("scale_factor must be defined if recompute_scale_factor is True.")
        # Re-derive the size from the effective per-axis scale. Kept exactly
        # as in the original so float rounding is unchanged.
        height = tf.cast(input_shape[1], tf.float32)
        width = tf.cast(input_shape[2], tf.float32)
        scale_factor_height = new_size[0] / height
        scale_factor_width = new_size[1] / width
        new_size = tf.stack([tf.cast(height * scale_factor_height, tf.float32),
                             tf.cast(width * scale_factor_width, tf.float32)])

    new_size = tf.cast(new_size, tf.int32)

    # The original called these resizers without keeping their result, so
    # these three modes ended in an UnboundLocalError at the return.
    if mode == 'bilinear':
        resize_result = tf.compat.v1.image.resize_bilinear(input, size=new_size, align_corners=align_corners)
    elif mode == 'nearest':
        resize_result = tf.compat.v1.image.resize_nearest_neighbor(input, size=new_size, align_corners=align_corners)
    elif mode == 'bicubic':
        resize_result = tf.compat.v1.image.resize_bicubic(input, size=new_size, align_corners=align_corners)
    else:
        resize_result = tf.image.resize(input, size=new_size, method=mode, antialias=antialias)

    return resize_result
def scan_fn(state,data,cell): 19 | output,state=cell.output(data,state) 20 | return output,state 21 | # Use tf.scan function to scan the forward and backward input data and get the forward and backward output data and state list 22 | outputs_fw=[] 23 | states_fw=[] 24 | outputs_bw=[] 25 | states_bw=[] 26 | for i in range(len(self.fw_cells)): 27 | cell=self.fw_cells[i] 28 | if i==0: # The first layer uses the original input data 29 | output_fw,state_fw=tf.scan(scan_fn,(data,cell),initializer=(tf.zeros([batch_size,32]),tf.zeros([batch_size,32])),swap_memory=True) 30 | else: # The later layers use the output data of the previous layer 31 | output_fw,state_fw=tf.scan(scan_fn,(outputs_fw[-1],cell),initializer=(tf.zeros([batch_size,32]),tf.zeros([batch_size,32])),swap_memory=True) 32 | outputs_fw.append(output_fw) 33 | states_fw.append(state_fw) 34 | cell=self.bw_cells[i] 35 | if i==0: # The first layer uses the reversed input data 36 | output_bw,state_bw=tf.scan(scan_fn,(data_bw,cell),initializer=(tf.zeros([batch_size,32]),tf.zeros([batch_size,32])),swap_memory=True) 37 | else: # The later layers use the output data of the previous layer 38 | output_bw,state_bw=tf.scan(scan_fn,(outputs_bw[-1],cell),initializer=(tf.zeros([batch_size,32]),tf.zeros([batch_size,32])),swap_memory=True) 39 | outputs_bw.append(output_bw) 40 | states_bw.append(state_bw) 41 | # Concatenate the forward and backward outputs and states to get the bidirectional outputs and states tensor 42 | output=tf.concat([outputs_fw[-1],outputs_bw[-1]],axis=-1) # Shape is [batch_size, seq_length, hidden_size * 2] 43 | state_fw=states_fw[-1] # Take the forward state of the last layer 44 | state_bw=states_bw[-1] # Take the backward state of the last layer 45 | state=tf.concat([state_fw[-1],state_bw[-1]],axis=-1) # Concatenate the forward and backward states of the last time step 46 | return output,state -------------------------------------------------------------------------------- 
class BigBird_masks:
    """Creates bigbird attention masks."""

    def __init__(self, block_size):
        self._block_size = block_size

    def __call__(self, data, mask):
        """Derive the four BigBird masks from a ``[batch, seq_length]`` mask.

        ``mask`` is cast to the dtype of ``data``. Returns the list
        ``[band_mask, encoder_from_mask, encoder_to_mask,
        blocked_encoder_mask]``.
        """
        mask_shape = tf.shape(mask)
        batch_size = mask_shape[0]
        seq_length = mask_shape[1]
        mask = tf.cast(mask, data.dtype)

        # View the mask as blocks of ``block_size`` positions.
        num_blocks = seq_length // self._block_size
        blocked_encoder_mask = tf.reshape(
            mask, (batch_size, num_blocks, self._block_size))
        # Column and row views of the same mask.
        encoder_from_mask = tf.reshape(mask, (batch_size, 1, seq_length, 1))
        encoder_to_mask = tf.reshape(mask, (batch_size, 1, 1, seq_length))

        band_mask = create_band_mask_from_inputs(blocked_encoder_mask,
                                                 blocked_encoder_mask)
        return [band_mask, encoder_from_mask, encoder_to_mask, blocked_encoder_mask]


def create_band_mask_from_inputs(from_blocked_mask, to_blocked_mask):
    """Create the band attention mask from blocked masks.

    Args:
      from_blocked_mask: Tensor of shape [batch_size,
        from_seq_length//from_block_size, from_block_size].
      to_blocked_mask: Tensor of shape [batch_size,
        to_seq_length//to_block_size, to_block_size].

    Returns:
      Tensor of shape [batch_size, 1, from_seq_length//from_block_size-4,
      from_block_size, 3*to_block_size].
    """
    # For every inner block, gather its left neighbour, itself and its right
    # neighbour, and join them along the within-block axis.
    left = to_blocked_mask[:, 1:-3]
    centre = to_blocked_mask[:, 2:-2]
    right = to_blocked_mask[:, 3:-1]
    neighbourhood = tf.concat([left, centre, right], 2)
    inner_from = from_blocked_mask[:, 2:-2]
    band = tf.einsum("BLQ,BLK->BLQK", inner_from, neighbourhood)
    return tf.expand_dims(band, 1)
class GRUCell:
    """Single-step gated recurrent cell backed by one fused weight matrix.

    With ``weight_shape = (input_size, hidden_size)`` the fused ``weight`` has
    shape ``[input_size + hidden_size, 3 * hidden_size]`` and, when enabled,
    ``bias`` has shape ``[3 * hidden_size]``.

    Attributes:
        weight: Fused weight applied to ``concat([data, state])``.
        bias: Fused bias; only present when ``use_bias`` is true.
        use_bias: Whether the bias is added.
        dtype: Dtype that inputs are cast to before the matmul.
        output_size: ``weight_shape[-1]``.
        param: List of the cell's parameters.
    """

    def __init__(self, weight_shape, weight_initializer='Xavier', bias_initializer='zeros',
                 use_bias=True, trainable=True, dtype='float32'):
        input_size, hidden_size = weight_shape[0], weight_shape[1]
        self.weight = nn.initializer([input_size + hidden_size, 3 * hidden_size],
                                     weight_initializer, dtype, trainable)
        self.use_bias = use_bias
        # __call__ compares the input dtype against self.dtype. The original
        # never stored it, so every call raised AttributeError.
        self.dtype = dtype
        self.output_size = weight_shape[-1]
        if use_bias:
            self.bias = nn.initializer([3 * hidden_size], bias_initializer, dtype, trainable)
            self.param = [self.weight, self.bias]
        else:
            self.param = [self.weight]

    def __call__(self, data, state):
        """Advance the cell by one step.

        Args:
            data: Input for this step; cast to ``self.dtype`` if needed.
            state: Previous hidden state.

        Returns:
            ``(output, new_state)``; both are the new hidden state.
        """
        if data.dtype != self.dtype:
            data = tf.cast(data, self.dtype)
        x = tf.concat([data, state], axis=-1)
        z = tf.matmul(x, self.weight)
        if self.use_bias:
            z = z + self.bias
        # NOTE(review): the first third (the "reset gate" in the original
        # comments) is split off but never used in the state update below.
        # Confirm whether that is intended.
        _reset, update, candidate = tf.split(z, 3, axis=-1)
        update = tf.nn.sigmoid(update)
        candidate = tf.nn.tanh(candidate)
        # Blend the previous state with the candidate state.
        h_new = update * state + (1 - update) * candidate
        return h_new, h_new
| from Note import nn 3 | 4 | 5 | class LSTMCell: # define a class for long short-term memory (LSTM) cell 6 | def __init__(self,weight_shape,weight_initializer='Xavier',bias_initializer='zeros',use_bias=True,trainable=True,dtype='float32'): # define the constructor method 7 | self.weight=nn.initializer([weight_shape[0]+weight_shape[1],4*weight_shape[1]],weight_initializer,dtype,trainable) # initialize the weight matrix for all gates and candidate cell state 8 | if use_bias==True: # if use bias is True 9 | self.bias=nn.initializer([4*weight_shape[1]],bias_initializer,dtype,trainable) # initialize the bias vector for all gates and candidate cell state 10 | self.use_bias=use_bias # set the use bias flag 11 | self.output_size=weight_shape[-1] 12 | if use_bias==True: # if use bias is True 13 | self.param=[self.weight,self.bias] # store the parameters in a list 14 | else: # if use bias is False 15 | self.param=[self.weight] # store only the weight matrix in a list 16 | 17 | 18 | def __call__(self,data,state): # define the output method 19 | if data.dtype!=self.dtype: 20 | data=tf.cast(data,self.dtype) 21 | x=tf.concat([data,state],axis=-1) # concatenate the input data and state along the last dimension 22 | if self.use_bias==True: # if use bias is True 23 | z=tf.matmul(x,self.weight)+self.bias # calculate the linear transformation of concatenated data and weight matrix, plus bias vector 24 | else: # if use bias is False 25 | z=tf.matmul(x,self.weight) # calculate the linear transformation of concatenated data and weight matrix 26 | i,f,o,c=tf.split(z,4,axis=-1) # split the linear transformation into four parts: input gate, forget gate, output gate and candidate cell state 27 | i=tf.nn.sigmoid(i) # apply activation function to the input gate 28 | f=tf.nn.sigmoid(f) # apply activation function to the forget gate 29 | o=tf.nn.sigmoid(o) # apply activation function to the output gate 30 | c=tf.nn.tanh(c) # apply activation function to the candidate cell state 31 | 
class LoRALinear:
    """Dense layer plus a low-rank (LoRA) additive update.

    Computes ``linear(x) + scale * (x @ lora_a @ lora_b)`` with ``lora_a`` of
    shape ``(input_dims, lora_rank)`` and ``lora_b`` of shape
    ``(lora_rank, output_dims)``.

    NOTE(review): ``lora_a`` and ``lora_b`` are plain tensors, not variables,
    and are not added to any ``param`` list. Confirm how they are meant to be
    trained.
    """

    @staticmethod
    def from_linear(linear, rank: int = 8):
        """Wrap an existing dense layer in a fresh ``LoRALinear``."""
        # TODO remove when input_dims and output_dims are attributes
        # on linear and quantized linear
        # NOTE(review): this reads the weight as (output_dims, input_dims).
        # Verify that it matches the layout of ``dense.weight``.
        output_dims, input_dims = linear.weight.shape
        lora_lin = LoRALinear(input_dims, output_dims, rank)
        lora_lin.linear = linear
        return lora_lin

    def to_linear(self):
        """Return a plain dense layer with the low-rank update fused in."""
        linear = self.linear
        bias = linear.use_bias
        weight = linear.weight

        # Use the same type as the linear weight if not quantized
        dtype = weight.dtype

        output_dims, input_dims = weight.shape
        # ``use_bias`` is the keyword other callers of ``dense`` use, and the
        # attribute read just above.
        fused_linear = dense(output_dims, input_dims, use_bias=bias)

        lora_b = tf.cast((self.scale * tf.transpose(self.lora_b)), dtype)
        lora_a = tf.cast(tf.transpose(self.lora_a), dtype)
        fused_linear.weight = weight + tf.matmul(lora_b, lora_a)
        if bias:
            fused_linear.bias = linear.bias

        return fused_linear

    def __init__(
        self,
        input_dims: int,
        output_dims: int,
        lora_rank: int = 8,
        bias: bool = False,
        scale: float = 20.0,
    ):
        """Create the wrapped dense layer and the low-rank factors.

        Args:
            input_dims: Size of the input features.
            output_dims: Size of the output features.
            lora_rank: Rank of the low-rank update.
            bias: Whether the wrapped dense layer uses a bias.
            scale: Multiplier applied to the low-rank update.
        """
        # Regular linear layer weights
        self.linear = dense(output_dims, input_dims, use_bias=bias)

        # Scale for low-rank update
        self.scale = scale

        # Low rank lora weights. The bound is computed in plain Python:
        # tf.math.sqrt does not accept an integer input.
        bound = input_dims ** -0.5
        # tf.random.uniform takes minval/maxval; it has no low/high keywords.
        self.lora_a = tf.random.uniform(
            shape=(input_dims, lora_rank),
            minval=-bound,
            maxval=bound,
        )
        self.lora_b = tf.zeros(shape=(lora_rank, output_dims))

    def __call__(self, data):
        """Apply the dense layer and add the scaled low-rank update."""
        dtype = self.linear.weight.dtype
        y = self.linear(tf.cast(data, dtype))
        # Run the low-rank branch in the dtype of its own weights, then cast
        # it to the dense output's dtype so the final add cannot hit a dtype
        # mismatch. Both casts are no-ops when the dtypes already agree.
        lora_in = tf.cast(data, self.lora_a.dtype)
        z = tf.matmul(tf.matmul(lora_in, self.lora_a), self.lora_b)
        return y + tf.cast(self.scale * z, y.dtype)
class RMSNorm:
    """Root-mean-square normalisation over the last axis.

    The input is divided by the square root of (mean of squares + ``eps``)
    and multiplied by a per-feature ``gamma`` that is initialised to ones.
    """

    def __init__(self, dims: int, eps: float = 1e-6, dtype='float32'):
        gamma = initializer((dims,), 'ones', dtype)
        self.gamma = gamma
        self.eps = eps
        self.param = [gamma]

    def __call__(self, x):
        """Return ``gamma * x / sqrt(mean(x**2, axis=-1) + eps)``."""
        mean_square = tf.math.reduce_mean(tf.math.square(x), axis=-1, keepdims=True)
        inv_rms = tf.math.rsqrt(mean_square + self.eps)
        scaled = self.gamma * x
        return scaled * inv_rms
-------------------------------------------------------------------------------- /Note/nn/layer/RNNCell.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf # import the TensorFlow library 2 | from Note import nn 3 | from Note.nn.activation import activation_dict # import the activation function dictionary from Note.nn package 4 | 5 | 6 | class RNNCell: # define a class for recurrent neural network (RNN) cell 7 | def __init__(self,weight_shape,weight_initializer='Xavier',bias_initializer='zeros',activation=None,use_bias=True,trainable=True,dtype='float32'): # define the constructor method 8 | self.weight_i=nn.initializer(weight_shape,weight_initializer,dtype,trainable) # initialize the weight matrix for input data 9 | self.weight_s=nn.initializer([weight_shape[1],weight_shape[1]],weight_initializer,dtype,trainable) # initialize the weight matrix for previous state 10 | if use_bias==True: # if use bias is True 11 | self.bias=nn.initializer([weight_shape[1]],bias_initializer,dtype,trainable) # initialize the bias vector 12 | self.activation=activation_dict[activation] # get the activation function from the activation dictionary 13 | self.use_bias=use_bias # set the use bias flag 14 | self.output_size=weight_shape[-1] 15 | if use_bias==True: # if use bias is True 16 | self.param=[self.weight_i,self.weight_s,self.bias] # store the parameters in a list 17 | else: # if use bias is False 18 | self.param=[self.weight_i,self.weight_s] # store only the weight matrices in a list 19 | 20 | 21 | def __call__(self,data,state): # define the output method 22 | if data.dtype!=self.dtype: 23 | data=tf.cast(data,self.dtype) 24 | output=tf.matmul(data,self.weight_i)+tf.matmul(state,self.weight_s) # calculate the linear transformation of input data and previous state 25 | if self.use_bias==True: # if use bias is True 26 | output=output+self.bias # add the bias vector to the linear transformation 27 | if self.activation is not 
class RoPE:
    """Rotary positional embedding applied along the last axis of the input.

    Attributes:
        dims: Number of leading features that are rotated.
        traditional: If true, rotate interleaved (even, odd) feature pairs;
            otherwise rotate the first half of ``dims`` against the second.
        base: Frequency base. ``None`` selects the default of
            ``create_cos_sin_theta`` (10000).
    """

    def __init__(self, dims: int, traditional: bool = False, base=None):
        self.dims = dims
        self.traditional = traditional
        self.base = base

    def _compute_rope(self, costheta, sintheta, x):
        """Rotate the two halves of the first ``dims`` features of ``x``."""
        x1 = x[..., : self.dims // 2]
        x2 = x[..., self.dims // 2 : self.dims]
        rx1 = x1 * costheta - x2 * sintheta
        rx2 = x1 * sintheta + x2 * costheta

        if self.dims < x.shape[-1]:
            # Features beyond ``dims`` pass through unrotated.
            rx = tf.concat([rx1, rx2, x[..., self.dims :]], axis=-1)
        else:
            rx = tf.concat([rx1, rx2], axis=-1)

        return rx

    def _compute_traditional_rope(self, costheta, sintheta, x):
        """Rotate interleaved (even, odd) feature pairs of ``x``.

        Raises:
            NotImplementedError: If ``dims`` is smaller than the feature size.
        """
        x1 = x[..., ::2]
        x2 = x[..., 1::2]
        rx1 = x1 * costheta - x2 * sintheta
        rx2 = x1 * sintheta + x2 * costheta

        if self.dims < x.shape[-1]:
            raise NotImplementedError(
                "RoPE doesn't implement partial traditional application"
            )

        # Pairs are stacked on a new trailing axis; the caller's reshape back
        # to the input shape interleaves them again.
        rx = tf.concat([rx1[..., None], rx2[..., None]], axis=-1)

        return rx

    def __call__(self, x, offset: int = 0):
        """Apply the rotation to ``x``, treating axis -2 as the position axis.

        Args:
            x: Tensor with at least two axes; leading axes are flattened.
            offset: Position index of the first element on axis -2.

        Returns:
            Tensor with the same shape as ``x``.
        """
        shape = x.shape
        x = tf.reshape(x, (-1, shape[-2], shape[-1]))
        N = x.shape[1] + offset
        costheta, sintheta = RoPE.create_cos_sin_theta(
            N, self.dims, offset=offset, base=self.base, dtype=x.dtype
        )

        rope = (
            self._compute_traditional_rope if self.traditional else self._compute_rope
        )
        rx = rope(costheta, sintheta, x)

        return tf.reshape(rx, shape)

    @staticmethod
    def create_cos_sin_theta(
        N: int,
        D: int,
        offset: int = 0,
        base: float = 10000,
        dtype=tf.float32,
    ):
        """Return ``(cos, sin)`` tables of shape ``(N - offset, D // 2)``.

        Args:
            N: One past the last position index.
            D: Number of rotated features; ``D // 2`` frequencies are used.
            offset: First position index.
            base: Frequency base. ``None`` is treated as 10000.
            dtype: Dtype of the returned tables.
        """
        if base is None:
            # RoPE.__init__ defaults ``base`` to None and forwards it
            # unchanged, which used to reach tf.math.log(None).
            base = 10000
        D = D // 2
        positions = tf.range(offset, N, dtype=dtype)
        # Cast first: tf.math.log rejects integer inputs, and this also keeps
        # the result in ``dtype`` for the multiplication below.
        log_base = tf.math.log(tf.cast(base, dtype))
        freqs = tf.math.exp(
            -tf.range(0, D, dtype=dtype) * (log_base / D)
        )
        theta = tf.reshape(positions, (-1, 1)) * tf.reshape(freqs, (1, -1))
        costheta = tf.math.cos(theta)
        sintheta = tf.math.sin(theta)

        return costheta, sintheta
activation 54 | 55 | def __call__(self, x, indices): 56 | 57 | x_up = self.up_proj(x, indices) 58 | x_gate = self.gate_proj(x, indices) 59 | x = self.down_proj(self.activation(x_gate) * x_up, indices) 60 | 61 | return x -------------------------------------------------------------------------------- /Note/nn/layer/TLU.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from Note.nn.initializer import initializer 3 | 4 | 5 | class TLU: 6 | r"""Thresholded Linear Unit. 7 | 8 | An activation function which is similar to ReLU 9 | but with a learned threshold that benefits models using FRN(Filter Response 10 | Normalization). Original paper: https://arxiv.org/pdf/1911.09737. 11 | 12 | Input shape: 13 | Arbitrary. Use the keyword argument `input_shape` 14 | (tuple of integers, does not include the samples axis) 15 | when using this layer as the first layer in a model. 16 | 17 | Output shape: 18 | Same shape as the input. 19 | 20 | Args: 21 | affine: `bool`. 
class Transformer:
    """Encoder-decoder Transformer built from the Note encoder/decoder layers.

    Args:
        d_model: Feature size of the model.
        nhead: Number of attention heads.
        num_encoder_layers: Number of stacked encoder layers.
        num_decoder_layers: Number of stacked decoder layers.
        dim_feedforward: Hidden size of the feed-forward blocks.
        dropout: Dropout rate forwarded to every layer.
        activation: Activation forwarded to every layer.
        custom_encoder: Optional replacement for the default encoder stack.
        custom_decoder: Optional replacement for the default decoder stack.
        layer_norm_eps: Epsilon of the layer normalisations.
        norm_first: Forwarded to every layer (pre-norm vs. post-norm).
        bias: Whether the layers use bias terms.
        dtype: Dtype of the parameters; inputs and masks are cast to it.
    """

    def __init__(self, d_model: int = 512, nhead: int = 8, num_encoder_layers: int = 6,
                 num_decoder_layers: int = 6, dim_feedforward: int = 2048, dropout: float = 0.1,
                 activation = tf.nn.relu,
                 custom_encoder = None, custom_decoder = None,
                 layer_norm_eps: float = 1e-5, norm_first: bool = False,
                 bias: bool = True, dtype='float32'):
        if custom_encoder is not None:
            self.encoder = custom_encoder
        else:
            # dtype is forwarded so the layer parameters match the dtype the
            # inputs are cast to in __call__ (it used to be silently dropped).
            encoder_layers = [TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout,
                                                      activation, layer_norm_eps, norm_first,
                                                      bias, dtype=dtype)
                              for _ in range(num_encoder_layers)]
            encoder_norm = layer_norm(d_model, epsilon=layer_norm_eps, dtype=dtype)
            self.encoder = TransformerEncoder(encoder_layers, num_encoder_layers, encoder_norm)

        if custom_decoder is not None:
            self.decoder = custom_decoder
        else:
            # NOTE(review): assumes TransformerDecoderLayer accepts a `dtype`
            # keyword like TransformerEncoderLayer does - confirm.
            decoder_layers = [TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout,
                                                      activation, layer_norm_eps, norm_first,
                                                      bias, dtype=dtype)
                              for _ in range(num_decoder_layers)]
            decoder_norm = layer_norm(d_model, epsilon=layer_norm_eps, dtype=dtype)
            self.decoder = TransformerDecoder(decoder_layers, num_decoder_layers, decoder_norm)

        self.d_model = d_model
        self.nhead = nhead
        self.dtype = dtype

    def _cast(self, tensor):
        """Cast ``tensor`` to the model dtype; ``None`` is passed through."""
        if tensor is not None and tensor.dtype != self.dtype:
            return tf.cast(tensor, self.dtype)
        return tensor

    def __call__(self, src, tgt, src_mask = None, tgt_mask = None, memory_mask = None, train_flag=True):
        """Encode ``src`` and decode ``tgt`` against the encoder memory.

        Args:
            src: Encoder input tensor.
            tgt: Decoder input tensor.
            src_mask: Optional mask for the encoder self-attention.
            tgt_mask: Optional mask for the decoder self-attention.
            memory_mask: Optional mask for the decoder's attention over memory.
            train_flag: Forwarded to encoder and decoder (enables dropout).

        Returns:
            The decoder output.
        """
        src = self._cast(src)
        tgt = self._cast(tgt)
        src_mask = self._cast(src_mask)
        tgt_mask = self._cast(tgt_mask)
        memory_mask = self._cast(memory_mask)
        memory = self.encoder(src, mask=src_mask, train_flag=train_flag)
        output = self.decoder(tgt, memory, tgt_mask=tgt_mask, memory_mask=memory_mask, train_flag=train_flag)
        return output
class TransformerDecoder:
    """Stack of decoder layers followed by an optional final normalisation.

    Args:
        decoder_layers: Iterable of decoder layer objects, applied in order.
        num_layers: Number of layers (stored for reference only).
        norm: Optional normalisation applied to the final output.
    """

    def __init__(self, decoder_layers, num_layers, norm=None):
        self.layers = decoder_layers
        self.num_layers = num_layers
        self.norm = norm

    @staticmethod
    def _entry_point(module):
        """Return the forward callable of ``module``.

        Layers in this package implement ``__call__``; objects exposing a
        callable ``output`` method are still honoured for compatibility.
        """
        legacy = getattr(module, "output", None)
        return legacy if callable(legacy) else module

    def __call__(self, tgt, memory, tgt_mask = None,
                 memory_mask = None, train_flag=True):
        """Run ``tgt`` through every layer, attending over ``memory``."""
        output = tgt

        for mod in self.layers:
            output = self._entry_point(mod)(output, memory, tgt_mask=tgt_mask,
                                            memory_mask=memory_mask, train_flag=train_flag)

        if self.norm is not None:
            output = self._entry_point(self.norm)(output)

        return output


class TransformerEncoder:
    """Stack of encoder layers followed by an optional final normalisation.

    Args:
        encoder_layers: Iterable of encoder layer objects, applied in order.
        num_layers: Number of layers (stored for reference only).
        norm: Optional normalisation applied to the final output.
    """

    def __init__(self, encoder_layers, num_layers, norm=None):
        self.layers = encoder_layers
        self.num_layers = num_layers
        self.norm = norm

    @staticmethod
    def _entry_point(module):
        """Return the forward callable of ``module``.

        Layers in this package implement ``__call__``; objects exposing a
        callable ``output`` method are still honoured for compatibility.
        """
        legacy = getattr(module, "output", None)
        return legacy if callable(legacy) else module

    def __call__(
            self,
            src,
            mask = None,
            train_flag=True
        ):
        """Run ``src`` through every layer with ``mask`` as ``src_mask``."""
        output = src

        for mod in self.layers:
            output = self._entry_point(mod)(output, src_mask=mask, train_flag=train_flag)

        if self.norm is not None:
            output = self._entry_point(self.norm)(output)

        return output
tf.nn.relu, 12 | layer_norm_eps: float = 1e-5, norm_first: bool = False, 13 | bias: bool = True, dtype='float32'): 14 | self.self_attn = multihead_attention(nhead, input_size=d_model, use_bias=bias, dtype=dtype) 15 | # Implementation of Feedforward model 16 | self.linear1 = dense(dim_feedforward, d_model, use_bias=bias, dtype=dtype) 17 | self.dropout = dropout(dropout_rate) 18 | self.linear2 = dense(d_model, dim_feedforward, use_bias=bias, dtype=dtype) 19 | 20 | self.norm_first = norm_first 21 | self.norm1 = layer_norm(d_model, epsilon=layer_norm_eps, dtype=dtype) 22 | self.norm2 = layer_norm(d_model, epsilon=layer_norm_eps, dtype=dtype) 23 | self.dropout1 = dropout(dropout_rate) 24 | self.dropout2 = dropout(dropout_rate) 25 | 26 | if isinstance(activation, str): 27 | activation = activation_dict[activation] 28 | else: 29 | self.activation = activation 30 | 31 | 32 | def __call__( 33 | self, 34 | src, 35 | src_mask=None, 36 | train_flag=True 37 | ): 38 | 39 | x = src 40 | if self.norm_first: 41 | x = x + self._sa_block(self.norm1(x), src_mask, train_flag) 42 | x = x + self._ff_block(self.norm2(x), train_flag) 43 | else: 44 | x = self.norm1(x + self._sa_block(x, src_mask, train_flag)) 45 | x = self.norm2(x + self._ff_block(x, train_flag)) 46 | 47 | return x 48 | 49 | 50 | # self-attention block 51 | def _sa_block(self, x, 52 | attn_mask=None, train_flag=True): 53 | x = self.self_attn(x, 54 | mask=attn_mask, 55 | )[0] 56 | if train_flag: 57 | return self.dropout1(x) 58 | else: 59 | return x 60 | 61 | 62 | # feed forward block 63 | def _ff_block(self, x, train_flag): 64 | if train_flag: 65 | x = self.linear2(self.dropout(self.activation(self.linear1(x)))) 66 | return self.dropout2(x) 67 | else: 68 | return self.linear2(self.activation(self.linear1(x))) -------------------------------------------------------------------------------- /Note/nn/layer/add.py: -------------------------------------------------------------------------------- 1 | class add: 2 | def 
class additive_attention:
    """Additive attention scores (sum over features of tanh(query + key)).

    Args:
        input_size: Size of the last axis of query/key. When omitted it is
            taken from the first query passed to ``__call__``.
        use_scale: Whether to multiply by a learned per-feature scale.
        dtype: Dtype the inputs are cast to and the scale is created with.
    """

    def __init__(self,input_size=None, use_scale=True, dtype='float32'):
        # input_size must always exist: __call__ and build() read it.
        self.input_size = input_size
        self.use_scale = use_scale
        self.dtype = dtype
        # Always defined so the attribute exists even without a scale.
        self.param = []
        if input_size is not None:
            self.build()

    def build(self):
        """Create the scale parameter once ``input_size`` is known."""
        self.output_size = self.input_size
        if self.input_size is not None and self.use_scale:
            self.scale = initializer([self.input_size], 'Xavier', self.dtype)
            self.param = [self.scale]

    def __call__(self, query, key):
        """Calculates attention scores as a nonlinear sum of query and key.

        Args:
            query: Query tensor of shape `[batch_size, Tq, dim]`.
            key: Key tensor of shape `[batch_size, Tv, dim]`.
        Returns:
            Tensor of shape `[batch_size, Tq, Tv]`.
        """
        if query.dtype != self.dtype:
            query = tf.cast(query, self.dtype)
        if key.dtype != self.dtype:
            key = tf.cast(key, self.dtype)
        if self.input_size is None:
            # Lazy build: infer the feature size from the first query.
            self.input_size = query.shape[-1]
            self.build()
        # Reshape tensors to enable broadcasting.
        # Reshape into [batch_size, Tq, 1, dim].
        q_reshaped = tf.expand_dims(query, axis=-2)
        # Reshape into [batch_size, 1, Tv, dim].
        k_reshaped = tf.expand_dims(key, axis=-3)
        scale = self.scale if self.use_scale else 1.0
        return tf.reduce_sum(scale * tf.tanh(q_reshaped + k_reshaped), axis=-1)
class attention:
    """Dot-product or "concat" attention over query / value / key tensors.

    Args:
        use_scale: Whether to create a learned scalar scale for the scores.
        score_mode: ``"dot"`` (query . key) or ``"concat"``
            (weighted sum of tanh(query + key)).
        dtype: Dtype the inputs are cast to and parameters are created with.
    """

    def __init__(self, use_scale=False, score_mode="dot", dtype='float32'):
        self.use_scale = use_scale
        self.score_mode = score_mode
        self.dtype = dtype
        self.param = []
        # Always defined: __call__ tests `self.scale is not None`.
        self.scale = None
        if use_scale:
            self.scale = initializer((), 'ones', dtype)
            self.param.append(self.scale)
        if score_mode == "concat":
            self.concat_score_weight = initializer((), 'ones', dtype)
            self.param.append(self.concat_score_weight)

    def __call__(self, query, value, key=None):
        """Return ``softmax(scores) @ value``.

        Args:
            query: Query tensor.
            value: Value tensor.
            key: Optional key tensor; ``value`` is used as the key when omitted.

        Raises:
            ValueError: if ``score_mode`` is neither ``"dot"`` nor ``"concat"``.
        """
        if query.dtype != self.dtype:
            query = tf.cast(query, self.dtype)
        if value.dtype != self.dtype:
            value = tf.cast(value, self.dtype)
        # Identity test: `key == None` on a tensor is an element-wise comparison.
        if key is None:
            key = value
        elif key.dtype != self.dtype:
            key = tf.cast(key, self.dtype)

        if self.score_mode == "dot":
            scores = tf.matmul(query, key, transpose_b=True)
            if self.scale is not None:
                scores *= self.scale
        elif self.score_mode == "concat":
            # Reshape tensors to enable broadcasting.
            # Reshape into [batch_size, Tq, 1, dim].
            q_reshaped = tf.expand_dims(query, axis=-2)
            # Reshape into [batch_size, 1, Tv, dim].
            k_reshaped = tf.expand_dims(key, axis=-3)
            summed = q_reshaped + k_reshaped
            if self.scale is not None:
                summed = self.scale * summed
            scores = self.concat_score_weight * tf.reduce_sum(
                tf.tanh(summed), axis=-1
            )
        else:
            raise ValueError(
                "Unknown score_mode %r; expected 'dot' or 'concat'." % (self.score_mode,)
            )
        distribution = tf.nn.softmax(scores)
        return tf.matmul(distribution, value)
__init__(self, kernel_size, strides=None, padding=0, count_include_pad=True): 7 | """ 8 | Args: 9 | kernel_size: int, the size of the pooling window. 10 | strides: int, stride of the pooling operation. 11 | padding: int, str, or tuple, the padding applied to the input. 12 | count_include_pad: bool, whether to include zero padding in the average calculation. 13 | """ 14 | self.kernel_size = kernel_size 15 | self.strides = strides if strides is not None else kernel_size 16 | self.padding = padding 17 | self.count_include_pad = count_include_pad 18 | 19 | if not isinstance(padding, str): 20 | self.zeropadding1d = nn.zeropadding1d(padding=padding) 21 | 22 | def __call__(self, data): 23 | if not isinstance(self.padding, str): 24 | padded_data = self.zeropadding1d(data) 25 | padding = 'VALID' 26 | else: 27 | padded_data = data 28 | padding = self.padding 29 | 30 | # Apply avg_pool1d 31 | pooled = tf.nn.avg_pool1d( 32 | padded_data, ksize=self.kernel_size, strides=self.strides, padding=padding 33 | ) 34 | 35 | if not self.count_include_pad and not isinstance(self.padding, str): 36 | # Calculate the effective kernel size for each window 37 | k = self.kernel_size if isinstance(self.kernel_size, int) else self.kernel_size 38 | 39 | # Compute the mask of valid elements (non-zero-padded) 40 | valid_mask = tf.ones_like(data, dtype=data.dtype) 41 | valid_mask = self.zeropadding1d(valid_mask) 42 | 43 | # Apply the same pooling operation to the mask 44 | valid_counts = tf.nn.avg_pool1d( 45 | valid_mask, ksize=self.kernel_size, strides=self.strides, padding='VALID' 46 | ) * k 47 | 48 | # Avoid division by zero 49 | valid_counts = tf.maximum(valid_counts, 1.0) 50 | 51 | # Adjust the pooled output to exclude zero-padded elements 52 | pooled = pooled * k / valid_counts 53 | 54 | return pooled 55 | -------------------------------------------------------------------------------- /Note/nn/layer/avg_pool2d.py: 
-------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from Note import nn 3 | 4 | 5 | class avg_pool2d: 6 | def __init__(self, kernel_size, strides=None, padding=0, count_include_pad=True): 7 | """ 8 | Args: 9 | kernel_size: int or tuple, the size of the pooling window. 10 | strides: int or tuple, stride of the pooling operation. 11 | padding: int, str, or tuple, the padding applied to the input. 12 | count_include_pad: bool, whether to include zero padding in the average calculation. 13 | """ 14 | self.kernel_size = kernel_size 15 | self.strides = strides if strides is not None else kernel_size 16 | self.padding = padding 17 | self.count_include_pad = count_include_pad 18 | 19 | if not isinstance(padding, str): 20 | self.zeropadding2d = nn.zeropadding2d(padding=padding) 21 | 22 | def __call__(self, data): 23 | if not isinstance(self.padding, str): 24 | padded_data = self.zeropadding2d(data) 25 | padding = 'VALID' 26 | else: 27 | padded_data = data 28 | padding = self.padding 29 | 30 | # Apply avg_pool2d 31 | pooled = tf.nn.avg_pool2d( 32 | padded_data, ksize=self.kernel_size, strides=self.strides, padding=padding 33 | ) 34 | 35 | if not self.count_include_pad and not isinstance(self.padding, str): 36 | # Calculate the effective kernel sizes for each window 37 | k_h, k_w = self.kernel_size if isinstance(self.kernel_size, (tuple, list)) else (self.kernel_size, self.kernel_size) 38 | 39 | # Compute the mask of valid elements (non-zero-padded) 40 | valid_mask = tf.ones_like(data, dtype=data.dtype) 41 | valid_mask = self.zeropadding2d(valid_mask) 42 | 43 | # Apply the same pooling operation to the mask 44 | valid_counts = tf.nn.avg_pool2d( 45 | valid_mask, ksize=self.kernel_size, strides=self.strides, padding='VALID' 46 | ) * (k_h * k_w) 47 | 48 | # Avoid division by zero 49 | valid_counts = tf.maximum(valid_counts, 1.0) 50 | 51 | # Adjust the pooled output to exclude zero-padded elements 52 | pooled = pooled * 
class avg_pool3d:
    """3D average pooling with explicit zero padding."""

    def __init__(self, kernel_size, strides=None, padding=0, count_include_pad=True):
        """
        Args:
            kernel_size: int or tuple, the size of the pooling window.
            strides: int or tuple, stride of the pooling operation.
                Defaults to ``kernel_size``.
            padding: int, str, or tuple, the padding applied to the input.
                A string is handed to ``tf.nn.avg_pool3d`` unchanged; anything
                else is applied through ``nn.zeropadding3d`` first.
            count_include_pad: bool, whether to include zero padding in the average calculation.
        """
        self.kernel_size = kernel_size
        self.strides = strides if strides is not None else kernel_size
        self.padding = padding
        self.count_include_pad = count_include_pad

        if not isinstance(padding, str):
            self.zeropadding3d = nn.zeropadding3d(padding=padding)

    def __call__(self, data):
        """Pool ``data`` and return the pooled tensor."""
        explicit_padding = not isinstance(self.padding, str)
        if explicit_padding:
            padded_data = self.zeropadding3d(data)
            padding = 'VALID'
        else:
            padded_data = data
            padding = self.padding

        # Apply avg_pool3d
        pooled = tf.nn.avg_pool3d(
            padded_data, ksize=self.kernel_size, strides=self.strides, padding=padding
        )

        if not self.count_include_pad and explicit_padding:
            # Number of elements in one full pooling window.
            if isinstance(self.kernel_size, (tuple, list)):
                k_d, k_h, k_w = self.kernel_size
            else:
                k_d = k_h = k_w = self.kernel_size
            window = k_d * k_h * k_w

            # Mask of real (non-padded) elements. It keeps the dtype of `data`
            # so the division below does not mix dtypes.
            valid_mask = self.zeropadding3d(tf.ones_like(data))

            # Pooling the mask and undoing the mean gives the per-window count
            # of real elements.
            valid_counts = tf.nn.avg_pool3d(
                valid_mask, ksize=self.kernel_size, strides=self.strides, padding='VALID'
            ) * window

            # Avoid division by zero
            valid_counts = tf.maximum(valid_counts, 1.0)

            # Rescale so each window is averaged over real elements only.
            pooled = pooled * window / valid_counts

        return pooled
45 | """ 46 | if data.dtype!=self.dtype: 47 | data=tf.cast(data,self.dtype) 48 | 49 | # Reshape the input tensor to match the axial shape 50 | data = tf.reshape(data, (-1,) + self.axial_shape + (self.d_model,)) 51 | 52 | # Concatenate the positional embeddings along the last dimension 53 | pos_emb = tf.concat( 54 | [tf.expand_dims(weight, axis=0) for weight in self.weights], 55 | axis=-1 56 | ) 57 | 58 | # Broadcast the positional embeddings to the input shape 59 | pos_emb = tf.broadcast_to(pos_emb, data.shape) 60 | 61 | # Add the positional embeddings to the input tensor 62 | data = data + pos_emb 63 | 64 | # Reshape the output tensor to the original shape 65 | output = tf.reshape(data, (-1, self.axial_shape[0] * self.axial_shape[1], self.d_model)) 66 | 67 | return output -------------------------------------------------------------------------------- /Note/nn/layer/bilinear.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from Note.nn.initializer import initializer 3 | from typing import Tuple 4 | 5 | class bilinear: 6 | def __init__(self, embedding_dim: int, output_dim: int, dtype='float32'): 7 | """Initializer. 8 | 9 | Args: 10 | embedding_dim: An integer that indicates the embedding dimension of the 11 | interacting vectors. 12 | output_dim: An integer that indicates the output dimension of the layer. 13 | """ 14 | self._embedding_dim = embedding_dim 15 | self._output_dim = output_dim 16 | self.dtype = dtype 17 | self._bilinear_weight = initializer( 18 | shape=(self._embedding_dim, self._embedding_dim, self._output_dim), 19 | initializer=['normal', 0.0, 1. /self._embedding_dim], 20 | dtype=dtype) 21 | self._linear_weight_1 = initializer( 22 | shape=(self._embedding_dim, self._output_dim), 23 | initializer=['normal', 0.0, 1. 
class concat:
    """Concatenate a list of tensors along one axis.

    Args:
        axis: Axis along which the tensors are joined.
    """

    def __init__(self,axis=-1):
        self.axis=axis

    def __call__(self,data):
        """Concatenate and consume every tensor in ``data``.

        Args:
            data: Mutable list of tensors. It is emptied by this call.

        Returns:
            The concatenation of all tensors, in list order.

        Raises:
            IndexError: if ``data`` is empty.
        """
        # Pop every element. The previous loop computed its bound after the
        # first pop and therefore never reached the last tensor.
        tensors=[data.pop(0)]
        while data:
            tensors.append(data.pop(0))
        return tf.concat(tensors,axis=self.axis)
class cropping2d:
    """Crop rows and columns from a rank-4 tensor (axes 1 and 2).

    Args:
        cropping: An int (same amount on all four sides), a list of two ints
            ``[rows, cols]`` (symmetric per axis) or a list of four ints
            ``[top, bottom, left, right]``.

    Raises:
        ValueError: if ``cropping`` has any other form.
    """

    def __init__(self, cropping=1):
        if isinstance(cropping, int):
            self.cropping = tf.constant([[0, 0], [cropping, cropping], [cropping, cropping], [0, 0]])
        elif isinstance(cropping, list) and len(cropping) == 2:
            self.cropping = tf.constant([[0, 0], [cropping[0], cropping[0]], [cropping[1], cropping[1]], [0, 0]])
        elif isinstance(cropping, list) and len(cropping) == 4:
            self.cropping = tf.constant([[0, 0], [cropping[0], cropping[1]], [cropping[2], cropping[3]], [0, 0]])
        else:
            raise ValueError("Invalid cropping argument. It should be an int or a list of two or four ints.")

    def __call__(self, data):
        """Return ``data`` with the configured borders removed."""
        shape = tf.shape(data)
        # tf.slice needs the real remaining extent; "-1 - crop" is not a
        # valid size (only a bare -1 means "to the end").
        size_1 = shape[1] - self.cropping[1][0] - self.cropping[1][1]
        size_2 = shape[2] - self.cropping[2][0] - self.cropping[2][1]
        return tf.slice(
            data,
            begin=[0, self.cropping[1][0], self.cropping[2][0], 0],
            size=[-1, size_1, size_2, -1],
        )
class dropout:
    """Dropout layer that registers itself with ``nn.Model`` on creation.

    Args:
        rate: Drop probability handed to ``tf.nn.dropout``.
        noise_shape: Optional noise shape handed to ``tf.nn.dropout``.
        seed: Optional random seed handed to ``tf.nn.dropout``.
    """

    def __init__(self,rate,noise_shape=None,seed=None):
        self.rate=rate
        self.noise_shape=noise_shape
        self.seed=seed
        # Used by __call__ when no explicit `training` argument is given.
        self.train_flag=True
        nn.Model.layer_list.append(self)
        # Also record the layer under the currently named model, if any.
        if nn.Model.name is not None:
            nn.Model.layer_eval.setdefault(nn.Model.name,[]).append(self)

    def __call__(self,data,training=None):
        """Apply dropout when training, otherwise return ``data`` unchanged.

        Args:
            data: Input tensor.
            training: Overrides ``self.train_flag`` when not ``None``.
        """
        if training is None:
            training=self.train_flag
        # Kept as an equality test so only True (or values equal to it)
        # enables dropout, exactly as before.
        if training==True:
            return tf.nn.dropout(data,self.rate,noise_shape=self.noise_shape,seed=self.seed)
        return data
def nchw_to(x, fmt: Format):
    """Convert a rank-4 NCHW tensor to the layout named by ``fmt``.

    Args:
        x: Tensor in NCHW layout.
        fmt: Target layout. ``NHWC`` transposes, ``NCL`` flattens the two
            trailing axes into one, ``NLC`` flattens and then moves the
            channel axis last. Any other value returns ``x`` unchanged.

    Returns:
        The converted tensor.
    """
    if fmt == Format.NHWC:
        return tf.transpose(x, (0, 2, 3, 1))
    if fmt == Format.NLC or fmt == Format.NCL:
        N, C, H, W = x.shape
        if N is None or C is None:
            # Static dimensions are None when unknown at trace time (e.g. a
            # variable batch size); fall back to the runtime shape so the
            # reshape target never contains None.
            dynamic = tf.shape(x)
            N = dynamic[0] if N is None else N
            C = dynamic[1] if C is None else C
        x = tf.reshape(x, (N, C, -1))
        if fmt == Format.NLC:
            x = tf.transpose(x, (0, 2, 1))
    return x
class gaussian_dropout:
    """Apply multiplicative 1-centered Gaussian noise.

    As it is a regularization layer, it is only active at training time.

    On construction the layer appends itself to ``nn.Model.layer_list`` and,
    when ``nn.Model.name`` is set, to ``nn.Model.layer_eval[nn.Model.name]``.

    Args:
        rate: Float, drop probability (as with `Dropout`).
            The multiplicative noise will have
            standard deviation `sqrt(rate / (1 - rate))`.
            Noise is only applied when `0 < rate < 1`.
        seed: Integer seed for the layer's `tf.random.Generator`.

    Call arguments:
        data: Input tensor (of any rank).
        training: Python boolean or boolean tensor indicating whether the
            layer should behave in training mode (adding noise) or in
            inference mode (doing nothing). Defaults to `self.train_flag`.

    Output shape:
        Same shape as input.
    """

    def __init__(self, rate, seed=7):
        self.rate = rate
        self.seed = seed
        self.random_generator = tf.random.Generator.from_seed(self.seed)
        self.train_flag = True
        nn.Model.layer_list.append(self)
        if nn.Model.name is not None:
            if nn.Model.name not in nn.Model.layer_eval:
                nn.Model.layer_eval[nn.Model.name] = []
            nn.Model.layer_eval[nn.Model.name].append(self)

    def __call__(self, data, training=None):
        if training is None:
            training = self.train_flag
        if not 0 < self.rate < 1:
            return data

        def noised():
            # Keep stddev a plain number so the generator converts it to
            # data.dtype itself; a float32 tensor here would clash with
            # float16/float64 inputs.
            stddev = (self.rate / (1.0 - self.rate)) ** 0.5
            return data * self.random_generator.normal(
                shape=tf.shape(data),
                mean=1.0,
                stddev=stddev,
                dtype=data.dtype,
            )

        if isinstance(training, bool):
            # tf.cond rejects Python booleans when building a graph, so a
            # static flag is resolved with an ordinary branch.
            return noised() if training else data
        return tf.cond(training, noised, lambda: data)
def __call__(self, data, train_flag=True):
    """Add zero-centered Gaussian noise to ``data`` when training.

    Args:
        data: Input tensor; the noise is drawn with ``data``'s shape and
            dtype and standard deviation ``self.stddev``.
        train_flag: Python boolean or boolean tensor. When false the input
            is returned unchanged.

    Returns:
        ``data`` plus noise in training mode, otherwise ``data``.
    """
    def noised():
        return data + self.random_generator.normal(
            shape=tf.shape(data),
            mean=0.0,
            stddev=self.stddev,
            dtype=data.dtype,
        )

    if isinstance(train_flag, bool):
        # tf.cond rejects Python booleans when building a graph, so a
        # static flag (including the default) uses an ordinary branch.
        return noised() if train_flag else data
    return tf.cond(train_flag, noised, lambda: data)
class GlobalContext:
    """Global Context attention block (GCNet, https://arxiv.org/abs/1904.11492).

    A per-image context vector is pooled from the input, either with a
    learned 1x1-conv attention map or by a plain spatial mean, and is then
    fused back into the input by a gated scale and/or an additive term.

    Args:
        channels: Number of channels of the input.
        use_attn: Pool the context with a learned attention map; otherwise
            use the spatial mean.
        fuse_add: Add ``mlp_add(context)`` to the input.
        fuse_scale: Multiply the input by ``gate(mlp_scale(context))``.
        init_last_zero: Stored on the instance; not read by
            ``reset_parameters``.
        rd_ratio, rd_channels, rd_divisor: Hidden width of the MLPs. When
            ``rd_channels`` is None it is derived with ``nn.make_divisible``.
        act_layer: Activation passed to ``nn.ConvMlp``.
        gate_layer: Gate applied to the scale branch.
    """

    def __init__(self, channels, use_attn=True, fuse_add=False, fuse_scale=True, init_last_zero=False,
                 rd_ratio=1./8, rd_channels=None, rd_divisor=1, act_layer=tf.nn.relu, gate_layer=tf.nn.sigmoid):
        self.conv_attn = nn.conv2d(1, kernel_size=1, input_size=channels, use_bias=True) if use_attn else None

        if rd_channels is None:
            rd_channels = nn.make_divisible(channels * rd_ratio, rd_divisor, round_limit=0.)
        if fuse_add:
            self.mlp_add = nn.ConvMlp(channels, rd_channels, act_layer=act_layer, norm_layer=nn.layer_norm)
        else:
            self.mlp_add = None
        if fuse_scale:
            self.mlp_scale = nn.ConvMlp(channels, rd_channels, act_layer=act_layer, norm_layer=nn.layer_norm)
        else:
            self.mlp_scale = None

        self.gate = gate_layer
        self.init_last_zero = init_last_zero
        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialise the attention conv and zero the additive branch's last layer."""
        if self.conv_attn is not None:
            nn.kaiming_normal_(self.conv_attn.weight, mode='fan_in', nonlinearity='relu')
        if self.mlp_add is not None:
            self.mlp_add.fc2.weight.assign(tf.zeros(self.mlp_add.fc2.weight.shape))

    def __call__(self, x):
        """Apply the block to an NHWC tensor ``x``.

        The spatial and channel dimensions must be statically known; the
        batch dimension may be unknown.
        """
        _, H, W, C = x.shape

        if self.conv_attn is not None:
            # -1 lets TensorFlow infer the batch size, so the block also
            # works when the static batch dimension is None.
            attn = tf.reshape(self.conv_attn(x), (-1, 1, H * W))  # (B, 1, H * W)
            attn = tf.expand_dims(tf.nn.softmax(attn, axis=-1), axis=3)  # (B, 1, H * W, 1)
            flat = tf.reshape(tf.transpose(x, (0, 3, 1, 2)), (-1, C, H * W))  # (B, C, H * W)
            context = tf.matmul(tf.expand_dims(flat, axis=1), attn)  # (B, 1, C, 1)
            context = tf.reshape(context, (-1, 1, 1, C))
        else:
            context = tf.reduce_mean(x, axis=(1, 2), keepdims=True)

        if self.mlp_scale is not None:
            mlp_x = self.mlp_scale(context)
            x = x * self.gate(mlp_x)
        if self.mlp_add is not None:
            mlp_x = self.mlp_add(context)
            x = x + mlp_x

        return x
class GlobalResponseNorm:
    """Global Response Normalization layer (ConvNeXt-V2, arXiv:2301.00808).

    Args:
        dim: Number of channels; size of the ``weight`` and ``bias`` vectors.
        eps: Added to the channel-mean of the norms before dividing.
        channels_last: True for NHWC input, False for NCHW input.
    """
    def __init__(self, dim, eps=1e-6, channels_last=True):
        self.eps = eps
        if channels_last:
            self.spatial_dim = (1, 2)
            self.channel_dim = -1
            self.wb_shape = (1, 1, 1, -1)
        else:
            self.spatial_dim = (2, 3)
            self.channel_dim = 1
            self.wb_shape = (1, -1, 1, 1)

        self.weight = nn.Parameter(tf.zeros(dim))
        self.bias = nn.Parameter(tf.zeros(dim))

    def __call__(self, x):
        """Return ``x + bias + weight * x * x_n`` where ``x_n`` is the normalised response."""
        # Element-wise L2 norm over the spatial axes. tf.norm(..., ord=2)
        # with a 2-tuple axis would compute a matrix 2-norm (largest
        # singular value) instead, so the sum of squares is written out.
        x_g = tf.sqrt(tf.reduce_sum(tf.square(x), axis=self.spatial_dim, keepdims=True))
        x_n = x_g / (tf.reduce_mean(x_g, axis=self.channel_dim, keepdims=True) + self.eps)
        bias_reshaped = tf.reshape(self.bias, self.wb_shape)
        weight_reshaped = tf.reshape(self.weight, self.wb_shape)
        product = tf.multiply(x, x_n)
        weighted_product = tf.multiply(weight_reshaped, product)
        return x + tf.add(bias_reshaped, weighted_product)
class random_crop:
    """A preprocessing layer which randomly crops images during training.

    When ``train_flag`` is true and the input is at least as large as the
    target in both height and width, a crop window of the target size is
    taken at a random offset; the same offset is used for every image in
    the batch.

    Otherwise the input is bilinearly resized to the target size and cast
    back to the input dtype.

    Input shape:
        3D (unbatched) or 4D (batched) tensor with shape:
        `(..., height, width, channels)`, in `"channels_last"` format.

    Output shape:
        3D (unbatched) or 4D (batched) tensor with shape:
        `(..., target_height, target_width, channels)`.

    Args:
        height: Integer, the height of the output shape.
        width: Integer, the width of the output shape.
        seed: Integer. Seeds the layer's `tf.random.Generator`.
    """

    def __init__(self, height, width, seed=7):
        self.height = height
        self.width = width
        self.seed = seed
        self.random_generator = tf.random.Generator.from_seed(seed)

    def __call__(self, data, train_flag=True):
        shape = tf.shape(data)
        extra_h = shape[H_AXIS] - self.height
        extra_w = shape[W_AXIS] - self.width

        def _crop():
            # Draw two non-negative integers and fold them into the range of
            # valid top-left offsets.
            int_dtype = shape.dtype
            offsets = self.random_generator.uniform(
                [2], 0, int_dtype.max, int_dtype
            )
            top = offsets[0] % (extra_h + 1)
            left = offsets[1] % (extra_w + 1)
            return tf.image.crop_to_bounding_box(
                data, top, left, self.height, self.width
            )

        def _resize():
            resized = tf.image.resize(data, [self.height, self.width], method=tf.image.ResizeMethod.BILINEAR)
            # tf.image.resize outputs float32, so cast back to the input dtype.
            return tf.cast(resized, data.dtype)

        can_crop = tf.reduce_all((train_flag, extra_h >= 0, extra_w >= 0))
        return tf.cond(can_crop, _crop, _resize)
def __call__(self, data):
    """Return ``data * self.scale + self.offset``.

    Floating-point (and complex) inputs keep their dtype. Any other input
    dtype is converted to float32 first, so that a fractional scale such
    as ``1/255`` is not truncated to zero by an integer cast.

    Args:
        data: Tensor or array-like of values to rescale.

    Returns:
        The rescaled tensor.
    """
    data = tf.convert_to_tensor(data)
    if data.dtype.is_floating or data.dtype.is_complex:
        dtype = data.dtype
    else:
        dtype = tf.float32
    scale = tf.cast(self.scale, dtype)
    offset = tf.cast(self.offset, dtype)
    return tf.cast(data, dtype) * scale + offset
def __call__(self, points_to_interp):
    """Linearly interpolate the grid values at the requested coordinates.

    Args:
        points_to_interp: Sequence with one tensor of query coordinates per
            grid dimension; all must have the same leading size.

    Returns:
        Tensor with one interpolated value per query point.
    """
    assert self.points is not None
    assert self.values is not None

    assert len(points_to_interp) == len(self.points)
    K = points_to_interp[0].shape[0]
    for x in points_to_interp:
        assert x.shape[0] == K

    idxs = []
    dists = []
    overalls = []
    for p, x in zip(self.points, points_to_interp):
        last = tf.shape(p)[0] - 1
        # tf.Tensor is immutable, so the clamping and masking steps are
        # expressed with tf.minimum / tf.maximum / tf.where rather than
        # item assignment.
        idx_right = tf.minimum(tf.searchsorted(p, x, side="right"), last)
        idx_left = tf.clip_by_value(idx_right - 1, 0, last)
        # Negative distances (queries outside the grid) are clamped to zero.
        dist_left = tf.maximum(x - tf.gather(p, idx_left), 0.)
        dist_right = tf.maximum(tf.gather(p, idx_right) - x, 0.)
        # Where both distances vanish, use 1 for both to avoid a 0/0 below.
        both_zero = tf.logical_and(tf.equal(dist_left, 0), tf.equal(dist_right, 0))
        dist_left = tf.where(both_zero, tf.ones_like(dist_left), dist_left)
        dist_right = tf.where(both_zero, tf.ones_like(dist_right), dist_right)

        idxs.append((idx_left, idx_right))
        dists.append((dist_left, dist_right))
        overalls.append(dist_left + dist_right)

    numerator = 0.
    for indexer in product([0, 1], repeat=self.n):
        # One corner of the enclosing cell per iteration: pick the left or
        # right neighbour index in every dimension, weighted by the
        # distance to the opposite side.
        corner = tf.stack(
            [idx[onoff] for onoff, idx in zip(indexer, idxs)], axis=-1)
        bs_s = [dist[1 - onoff] for onoff, dist in zip(indexer, dists)]
        numerator += tf.gather_nd(self.values, corner) * \
            tf.reduce_prod(tf.stack(bs_s), axis=0)
    denominator = tf.reduce_prod(tf.stack(overalls), axis=0)
    return numerator / denominator
def _ntuple(n, name="parse"):
    """Build a converter that normalises its argument to a tuple.

    Args:
        n: How many times a non-iterable argument is repeated.
        name: Value assigned to the returned function's ``__name__``.

    Returns:
        A function ``parse(x)`` that returns ``tuple(x)`` when ``x`` is
        iterable and ``(x,) * n`` otherwise.
    """
    # Import the submodule explicitly: a bare "import collections" does not
    # guarantee that the "collections.abc" attribute is available.
    from collections.abc import Iterable

    def parse(x):
        if isinstance(x, Iterable):
            return tuple(x)
        return tuple(repeat(x, n))

    parse.__name__ = name
    return parse
def __call__(self, input):
    """Apply 2D power-average pooling to ``input``.

    The input is raised to ``self.norm_type``, passed through
    ``self.avg_pool2d``, multiplied by the product of the two kernel sizes,
    and raised to ``1 / self.norm_type``.
    """
    kw, kh = _pair(self.kernel_size)
    powered = tf.pow(input, self.norm_type)
    pooled = self.avg_pool2d(powered)
    scaled = (tf.sign(pooled) * tf.nn.relu(tf.abs(pooled))) * (kw * kh)
    return tf.pow(scaled, (1.0 / self.norm_type))
class masked_softmax:
    """Performs a softmax with optional masking on a tensor.

    Args:
        mask_expansion_axes: Axis argument passed to `tf.expand_dims` for
            each dimension the mask is missing relative to the scores.
        normalization_axes: Axes the softmax normalises over; defaults to
            the last axis.
    """

    def __init__(self,
                 mask_expansion_axes=None,
                 normalization_axes=None,
                 ):
        self._mask_expansion_axes = mask_expansion_axes
        self._normalization_axes = (-1,) if normalization_axes is None else normalization_axes


    def __call__(self, scores, mask=None):
        """Return the softmax of ``scores``, suppressing positions where ``mask`` is 0."""
        if mask is not None:
            missing_dims = len(scores.shape) - len(mask.shape)
            for _ in range(missing_dims):
                mask = tf.expand_dims(mask, axis=self._mask_expansion_axes)

            # mask is 1.0 for positions to attend and 0.0 for masked ones;
            # the penalty is 0.0 for the former and a large negative number
            # for the latter, which removes them from the softmax.
            keep = tf.cast(mask, scores.dtype)
            penalty = (1.0 - keep) * _large_compatible_negative(
                scores.dtype)
            scores += penalty

        axes = self._normalization_axes
        if len(axes) == 1:
            return tf.nn.softmax(scores, axis=axes[0])
        log_norm = tf.math.reduce_logsumexp(scores, axis=axes, keepdims=True)
        return tf.math.exp(scores - log_norm)
class matmul_with_margin:
    """This layer computes a dot product matrix given two encoded inputs.

    Args:
        logit_scale: The scaling factor of dot products when doing training.
        logit_margin: The margin value between the positive and negative examples
            when doing training.
    """

    def __init__(self,
                 logit_scale=1.0,
                 logit_margin=0.0,
                 ):
        self.logit_scale = logit_scale
        self.logit_margin = logit_margin

    def __call__(self, left_encoded: tf.Tensor,
                 right_encoded: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
        """Return ``(left_logits, right_logits)``.

        ``left_logits`` is ``logit_scale * (left @ right^T - logit_margin * I)``
        and ``right_logits`` is its transpose. Both are also stored on the
        instance.
        """
        # The identity matrix must be batch_size x batch_size, so read the
        # leading dimension of the input rather than its first row.
        batch_size = tf.shape(left_encoded)[0]

        # Left -> Right dot product.
        left_dot_products = tf.matmul(
            left_encoded, right_encoded, transpose_b=True)

        margin = self.logit_margin * tf.eye(
            batch_size, dtype=left_dot_products.dtype)
        self.left_logits = self.logit_scale * (left_dot_products - margin)

        # Right -> Left dot product.
        self.right_logits = tf.transpose(self.left_logits)

        return (self.left_logits, self.right_logits)
class maxout:
    """Applies Maxout to the input.

    "Maxout Networks" Ian J. Goodfellow, David Warde-Farley, Mehdi Mirza, Aaron
    Courville, Yoshua Bengio. https://arxiv.org/abs/1302.4389

    Usually the operation is performed in the filter/channel dimension. This
    can also be used after Dense layers to reduce number of features.

    Args:
      num_units: Specifies how many features will remain after maxout
        in the `axis` dimension (usually channel).
        This must be a factor of number of features.
      axis: The dimension where max pooling will be performed. Default is the
        last dimension.
      input_shape: Optional shape of the input tensor. When given, the
        feature count is validated at construction time; otherwise the
        shape of the first input is recorded and validated on the first call.

    Input shape:
      nD tensor with shape: `(batch_size, ..., axis_dim, ...)`.

    Output shape:
      nD tensor with shape: `(batch_size, ..., num_units, ...)`.

    Raises:
      ValueError: If the feature count along `axis` is not a multiple of
        `num_units`.
    """

    def __init__(self, num_units: int, axis: int = -1, input_shape=None):
        self.num_units = num_units
        self.axis = axis
        self.input_shape = input_shape
        if input_shape is not None:
            self.axis_, self.num_channels = self._resolve_axis(input_shape)

    def _resolve_axis(self, shape):
        """Validate `shape`; return (non-negative axis, feature count)."""
        num_channels = shape[self.axis]
        if not isinstance(num_channels, tf.Tensor) and num_channels % self.num_units:
            raise ValueError(
                "number of features({}) is not "
                "a multiple of num_units({})".format(num_channels, self.num_units)
            )

        axis = self.axis + len(shape) if self.axis < 0 else self.axis
        assert axis >= 0, "Find invalid axis: {}".format(self.axis)
        return axis, num_channels

    def __call__(self, data):
        if self.input_shape is None:
            self.input_shape = list(data.shape)
            # Cache the resolved axis and feature count together with the
            # shape. Later calls see a non-None `input_shape` and read these
            # attributes, so they must exist after the first call.
            self.axis_, self.num_channels = self._resolve_axis(self.input_shape)
        axis = self.axis_
        num_channels = self.num_channels

        # `list(...)` rather than `[:]` so a tuple `input_shape` can be edited.
        expand_shape = list(self.input_shape)
        if axis > 0:
            # Take the batch size from the actual input so a batch of a
            # different size than the recorded one still reshapes correctly.
            batch = data.shape[0]
            expand_shape[0] = tf.shape(data)[0] if batch is None else batch
        expand_shape[axis] = self.num_units
        # Split the pooled axis into (k, num_units) and reduce over k.
        expand_shape.insert(axis, num_channels // self.num_units)

        return tf.math.reduce_max(
            tf.reshape(data, expand_shape), axis, keepdims=False
        )
class multiply:
    """Multiplies a list of operands together with the `*` operator.

    `save_data_count` is None after construction. While it is None, every
    entry of the list passed to the call is multiplied and the list is left
    untouched. When it is set to an integer, the first entry is popped from
    the front of the list, followed by `save_data_count - 1` further entries,
    and only those popped entries are multiplied.
    """

    def __init__(self):
        self.save_data_count = None

    def __call__(self, data):
        count = self.save_data_count
        if count is None:
            product = data[0]
            for operand in data[1:]:
                product = product * operand
            return product

        # Consume operands from the front of the caller's list.
        product = data.pop(0)
        for _ in range(count - 1):
            product = product * data.pop(0)
        return product
class permute:
    """Reorders the non-batch dimensions of the input.

    Args:
      dims: Iterable of integers giving the new order of the dimensions.
        Axis 0 is not part of the pattern and indexing starts at 1, so
        `(2, 1)` swaps the first and second dimensions after axis 0.

    Output shape:
      The input shape with every dimension after axis 0 re-ordered
      according to `dims`.

    Raises:
      ValueError: If `dims` is not a permutation of `1..len(dims)`.
    """

    def __init__(self, dims):
        self.dims = tuple(dims)
        is_permutation = sorted(dims) == list(range(1, len(dims) + 1))
        if is_permutation:
            return
        raise ValueError(
            "Invalid permutation argument `dims` for Permute Layer. "
            "The set of indices in `dims` must be consecutive and start "
            f"from 1. Received dims={dims}"
        )

    def __call__(self, data):
        # Axis 0 always stays in place; the pattern applies to the rest.
        full_pattern = (0, *self.dims)
        return tf.transpose(data, perm=full_pattern)
def resample_abs_pos_embed_nhwc(
        posemb,
        new_size: List[int],
        interpolation: str = 'bicubic',
        antialias: bool = True,
        verbose: bool = False,
):
    """Resize a channels-last position-embedding grid to `new_size`.

    Args:
        posemb: Embedding whose last three axes are read as
            (height, width, channels).
        new_size: Target (height, width).
        interpolation: Mode name forwarded to `nn.interpolate`.
        antialias: Forwarded to `nn.interpolate`.
        verbose: Not used by this function.

    Returns:
        `posemb` itself when it already has the requested height and width;
        otherwise the interpolated grid cast back to the original dtype.
    """
    already_sized = new_size[0] == posemb.shape[-3] and new_size[1] == posemb.shape[-2]
    if already_sized:
        return posemb

    source_dtype = posemb.dtype
    # Interpolation is carried out in float32 and cast back afterwards.
    grid = tf.cast(posemb, 'float32')
    grid = tf.reshape(grid, (1, grid.shape[-3], grid.shape[-2], grid.shape[-1]))
    resized = nn.interpolate(grid, size=new_size, mode=interpolation, antialias=antialias)
    return tf.cast(resized, source_dtype)
13 | 14 | Reference: This layer creates a positional embedding as described in 15 | [BERT: Pre-training of Deep Bidirectional Transformers for Language 16 | Understanding](https://arxiv.org/abs/1810.04805). 17 | """ 18 | 19 | def __init__(self, 20 | max_length, 21 | input_size=None, 22 | initializer="Xavier", 23 | seq_axis=1, 24 | dtype='float32' 25 | ): 26 | 27 | if max_length is None: 28 | raise ValueError( 29 | "`max_length` must be an Integer, not `None`." 30 | ) 31 | self.max_length = max_length 32 | self.input_size = input_size 33 | self.initializer = initializer 34 | self._seq_axis = seq_axis 35 | self.dtype = dtype 36 | if input_size is not None: 37 | self._position_embeddings = nn.initializer([max_length, input_size], initializer, dtype) 38 | self.param=[self._position_embeddings] 39 | 40 | 41 | def build(self): 42 | self._position_embeddings = nn.initializer([self.max_length, self.input_size], self.initializer, self.dtype) 43 | self.param=[self._position_embeddings] 44 | return 45 | 46 | 47 | def __call__(self, data): 48 | input_shape = tf.shape(data) 49 | actual_seq_len = input_shape[self._seq_axis] 50 | position_embeddings = self._position_embeddings[:actual_seq_len, :] 51 | new_shape = [1 for _ in data.get_shape().as_list()] 52 | new_shape[self._seq_axis] = actual_seq_len 53 | new_shape[-1] = position_embeddings.get_shape().as_list()[-1] 54 | position_embeddings = tf.reshape(position_embeddings, new_shape) 55 | return tf.broadcast_to(position_embeddings, input_shape) -------------------------------------------------------------------------------- /Note/nn/layer/repeat_vector.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def repeat(x, n): 5 | """Repeats a 2D tensor. 6 | 7 | if `x` has shape (samples, dim) and `n` is `2`, 8 | the output will have shape `(samples, 2, dim)`. 9 | 10 | Args: 11 | x: Tensor or variable. 12 | n: Python integer, number of times to repeat. 
class repeat_vector:
    """Repeats the input `n` times along a new axis 1.

    Args:
      n: Integer, repetition factor.

    Input shape: 2D tensor of shape `(num_samples, features)`.
    Output shape: 3D tensor of shape `(num_samples, n, features)`.

    Raises:
      TypeError: If `n` is not an `int`.
    """

    def __init__(self, n):
        if isinstance(n, int):
            self.n = n
            return
        raise TypeError(
            f"Expected an integer value for `n`, got {type(n)}."
        )

    def __call__(self, data):
        # Insert the repetition axis, then tile only along that axis.
        expanded = tf.expand_dims(data, 1)
        multiples = tf.stack([1, self.n, 1])
        return tf.tile(expanded, multiples)
not randomness. 18 | pos = tf.argsort(data, direction="DESCENDING") 19 | selected = tf.slice(pos, [0, 0], [-1, self._top_k]) 20 | not_selected = tf.slice(pos, [0, self._top_k], [-1, -1]) 21 | elif self._top_k is None: 22 | # Pure randomness, no top-k. 23 | pos = tf.argsort(tf.random.uniform(shape=tf.shape(data)), 24 | direction="DESCENDING") 25 | selected = tf.slice(pos, [0, 0], [-1, self._random_k]) 26 | not_selected = tf.slice(pos, [0, self._random_k], [-1, -1]) 27 | else: 28 | # Top-k plus randomness. 29 | pos = tf.argsort(data, direction="DESCENDING") 30 | selected_top_k = tf.slice(pos, [0, 0], [-1, self._top_k]) 31 | pos_left = tf.slice(pos, [0, self._top_k], [-1, -1]) 32 | 33 | # Randomly shuffle pos_left 34 | sort_index = tf.argsort( 35 | tf.random.uniform(shape=tf.shape(pos_left)), 36 | direction="DESCENDING") 37 | pos_left = tf.gather(pos_left, sort_index, batch_dims=1, axis=1) 38 | 39 | selected_rand = tf.slice(pos_left, [0, 0], [-1, self._random_k]) 40 | not_selected = tf.slice(pos_left, [0, self._random_k], [-1, -1]) 41 | 42 | selected = tf.concat([selected_top_k, selected_rand], axis=1) 43 | 44 | # Return the indices of selected and not-selected tokens. 45 | return selected, not_selected -------------------------------------------------------------------------------- /Note/nn/layer/self_attention_mask.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from typing import Optional 3 | 4 | 5 | class self_attention_mask: 6 | """Create 3D attention mask from a 2D tensor mask. 7 | 8 | inputs[0]: from_tensor: 2D or 3D Tensor of shape 9 | [batch_size, from_seq_length, ...]. 10 | inputs[1]: to_mask: int32 Tensor of shape [batch_size, to_seq_length]. 11 | 12 | Returns: 13 | float Tensor of shape [batch_size, from_seq_length, to_seq_length]. 
def get_mask(inputs: tf.Tensor,
             to_mask: tf.Tensor,
             dtype: Optional[tf.DType] = None) -> tf.Tensor:
    """Builds a 3D self-attention mask from a 2D mask.

    Args:
      inputs: from_tensor: 2D or 3D Tensor of shape
        [batch_size, from_seq_length, ...]. Only its first two dimensions
        and its dtype are read.
      to_mask: Tensor of shape [batch_size, to_seq_length].
      dtype: Output dtype; the dtype of `inputs` is used when None.

    Returns:
      Tensor of shape [batch_size, from_seq_length, to_seq_length] in which
      the mask row of each batch entry is repeated for every `from` position.
    """
    source_dims = tf.shape(inputs)
    batch, query_len = source_dims[0], source_dims[1]
    if dtype is None:
        dtype = inputs.dtype

    key_len = tf.shape(to_mask)[1]

    # Add a singleton `from` axis so the mask can be broadcast over it.
    row_mask = tf.reshape(to_mask, [batch, 1, key_len])
    row_mask = tf.cast(row_mask, dtype=dtype)

    return tf.broadcast_to(row_mask, [batch, query_len, key_len])
class softmax:
    """Softmax activation layer with optional masking.

    Args:
      axis: Integer, or list/tuple of integers, along which the softmax
        normalization is applied.

    Call arguments:
      inputs: The logits to normalize.
      mask: Optional mask cast to the dtype of `inputs`; 1 keeps a position
        and 0 masks it. Defaults to `None`.

    Returns:
      Softmaxed output with the same shape as `inputs`.
    """

    def __init__(self, axis=-1):
        self.axis = axis

    def __call__(self, inputs, mask=None):
        if mask is not None:
            # (1 - mask) is 0 for kept positions and 1 for masked ones, so
            # masked logits receive a large negative offset and end up with
            # (near-)zero probability after normalization.
            keep = tf.cast(mask, inputs.dtype)
            penalty = _large_compatible_negative(inputs.dtype)
            inputs += (1.0 - keep) * penalty

        axis = self.axis
        if not isinstance(axis, (tuple, list)):
            return tf.nn.softmax(inputs, axis=axis)
        if len(axis) > 1:
            # Joint normalization over several axes via log-sum-exp.
            log_norm = tf.reduce_logsumexp(inputs, axis=axis, keepdims=True)
            return tf.exp(inputs - log_norm)
        return tf.nn.softmax(inputs, axis=axis[0])
class spatial_dropout2d:
    """Spatial 2D dropout layer.

    Zeroes whole feature maps: one keep/drop decision is drawn per
    (batch entry, channel) pair and shared across axes 1 and 2. Inputs that
    are kept are scaled by 1/(1 - rate).

    Arguments:
      rate: Float between 0 and 1. Fraction of the feature maps to drop.
      seed: A Python integer used as the first element of the stateless
        seed pair `[seed, 0]`.

    Call arguments:
      data: A 4D tensor; axis 0 and axis 3 define the mask shape.
      training: Whether to apply dropout. When None, the layer's
        `train_flag` attribute is used instead.

    References:
      - Efficient Object Localization Using Convolutional Networks
    """

    def __init__(self, rate, seed=7):
        self.rate = rate
        self.seed = seed
        self.train_flag = True
        nn.Model.layer_list.append(self)
        # Register the layer under the current model name, if there is one.
        model_name = nn.Model.name
        if model_name is not None:
            if model_name not in nn.Model.layer_eval:
                nn.Model.layer_eval[model_name] = []
            nn.Model.layer_eval[model_name].append(self)

    def __call__(self, data, training=None):
        if training is None:
            training = self.train_flag

        def dropped_inputs():
            # Mask of shape (batch_size, 1, 1, channels), broadcast over the
            # two middle axes.
            noise_shape = (tf.shape(data)[0], 1, 1, tf.shape(data)[3])
            # NOTE(review): the sampler is stateless and the seed pair is
            # constant, so calls with the same input shape get the same
            # mask — confirm that this is intended.
            mask = tf.random.stateless_binomial(
                noise_shape,
                seed=[self.seed, 0],
                counts=1,
                probs=(1 - self.rate),
                output_dtype=data.dtype,
            )
            # Scale up the input by 1/(1 - rate) and apply the mask.
            return data * mask * (1.0 / (1.0 - self.rate))

        return tf.cond(training, dropped_inputs, lambda: data)
class spatial_dropout3d:
    """Spatial 3D dropout layer.

    Zeroes whole feature maps: one keep/drop decision is drawn per
    (batch entry, channel) pair and shared across axes 1, 2 and 3. Inputs
    that are kept are scaled by 1/(1 - rate).

    Arguments:
      rate: Float between 0 and 1. Fraction of the feature maps to drop.
      seed: Optional Python integer used as the first element of the
        stateless seed pair `[seed, 0]`. When None (the default), the mask
        is drawn with the unseeded `tf.random.uniform` instead.

    Call arguments:
      data: A 5D tensor; axis 0 and axis 4 define the mask shape.
      training: Whether to apply dropout. When None, the layer's
        `train_flag` attribute is used instead.

    References:
      - Efficient Object Localization Using Convolutional Networks
    """

    def __init__(self, rate, seed=None):
        self.rate = rate
        self.seed = seed
        self.train_flag = True
        nn.Model.layer_list.append(self)
        # Register the layer under the current model name, if there is one.
        model_name = nn.Model.name
        if model_name is not None:
            if model_name not in nn.Model.layer_eval:
                nn.Model.layer_eval[model_name] = []
            nn.Model.layer_eval[model_name].append(self)

    def __call__(self, data, training=None):
        if training is None:
            training = self.train_flag

        def dropped_inputs():
            # Mask of shape (batch_size, 1, 1, 1, channels), broadcast over
            # the three middle axes.
            noise_shape = (tf.shape(data)[0], 1, 1, 1, tf.shape(data)[4])
            keep_prob = 1 - self.rate
            if self.seed is None:
                # The default seed is None, which cannot be packed into the
                # integer pair the stateless sampler needs. Draw an unseeded
                # Bernoulli(keep_prob) mask instead.
                keep = tf.random.uniform(noise_shape) < keep_prob
                mask = tf.cast(keep, data.dtype)
            else:
                mask = tf.random.stateless_binomial(
                    noise_shape,
                    seed=[self.seed, 0],
                    counts=1,
                    probs=keep_prob,
                    output_dtype=data.dtype,
                )
            # Scale up the input by 1/(1 - rate) and apply the mask.
            return data * mask * (1.0 / (1.0 - self.rate))

        return tf.cond(training, dropped_inputs, lambda: data)
class unfold:
    """Extracts sliding square patches and flattens the patch grid.

    Args:
        kernel: Side length of the square patch.
        stride: Step between neighbouring patches, used for both spatial axes.
        padding: Forwarded to `zeropadding2d`, which pads the input before
            patches are extracted.
        dilation: Sampling rate inside a patch, used for both spatial axes.

    Call arguments:
        x: 4D tensor accepted by `tf.image.extract_patches`
            (presumably NHWC — confirm against the callers).

    Returns:
        Tensor of shape (batch, number_of_patches, patch_features), where the
        last axis is whatever `tf.image.extract_patches` produced.
    """

    def __init__(self, kernel, stride=1, padding=0, dilation=1):
        self.kernel = kernel
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.zeropadding2d = zeropadding2d(padding=padding)

    def __call__(self, x):
        x = self.zeropadding2d(x)
        patches = tf.image.extract_patches(
            x,
            sizes=[1, self.kernel, self.kernel, 1],
            strides=[1, self.stride, self.stride, 1],
            rates=[1, self.dilation, self.dilation, 1],
            padding='VALID',
        )
        # The static batch size is None when the batch dimension is unknown
        # at trace time, and None is not a valid reshape entry. Fall back to
        # the dynamic batch size in that case.
        batch = patches.shape[0]
        if batch is None:
            batch = tf.shape(patches)[0]
        return tf.reshape(patches, (batch, -1, patches.shape[-1]))
class up_sampling2d:
    """Upsamples by repeating entries along axes 1 and 2.

    Args:
        size: Repetition factor. An `int` is used for both axes; any other
            value is stored as given and read as `(size[0], size[1])`.
    """

    def __init__(self, size):
        # A single integer means the same factor for both axes.
        self.size = (size, size) if isinstance(size, int) else size

    def __call__(self, inputs):
        # Repeat along axis 1 first, then along axis 2.
        stretched = tf.repeat(inputs, self.size[0], axis=1)
        return tf.repeat(stretched, self.size[1], axis=2)
class up_sampling3d:
    """Upsample a tensor by repeating its elements along axes 1, 2 and 3.

    Args:
        size: Either an int (the same repeat factor is used for all three
            axes) or a sequence of three repeat factors, one per axis.
    """

    def __init__(self, size):
        # Convert size to a tuple if it is an integer
        if isinstance(size, int):
            size = (size, size, size)
        self.size = size

    def __call__(self, inputs):
        """Return `inputs` with axes 1, 2 and 3 repeated by `self.size`."""
        # Repeat each element size[k] times along axes 1, 2 and 3 in turn.
        outputs = tf.repeat(inputs, self.size[0], axis=1)
        outputs = tf.repeat(outputs, self.size[1], axis=2)
        outputs = tf.repeat(outputs, self.size[2], axis=3)
        return outputs
# --------------------------------------------------------------------------------
# /Note/nn/layer/vector_quantizer.py:
# --------------------------------------------------------------------------------
import tensorflow as tf
from Note.nn.initializer import initializer


class vector_quantizer:
    """Vector-quantisation layer with a codebook of `num_embeddings` vectors.

    Args:
        embedding_dim: Size of each codebook vector; the last axis of the
            input is reshaped to this size.
        num_embeddings: Number of codebook vectors.
        commitment_cost: Weight applied to the encoder (commitment) loss
            term in the returned loss.
        dtype: Dtype passed to the codebook initializer.
    """

    def __init__(
        self,
        embedding_dim: int,
        num_embeddings: int,
        commitment_cost: float,
        dtype='float32',
    ):
        self.embedding_dim = embedding_dim
        self.num_embeddings = num_embeddings
        self.commitment_cost = commitment_cost

        # The codebook is stored column-wise: [embedding_dim, num_embeddings].
        self._embedding_shape = [embedding_dim, num_embeddings]
        self._embedding_dtype = dtype
        self.embeddings = initializer(self._embedding_shape,
                                      ['VarianceScaling', 1.0, 'fan_in', 'uniform'],
                                      dtype)

    def __call__(self, data, is_training):
        """Quantise `data` against the codebook.

        Args:
            data: Tensor whose last axis has size `embedding_dim`.
            is_training: Accepted for interface compatibility; it is not
                used by this implementation.

        Returns:
            A dict with the keys "quantize" (straight-through quantised
            tensor), "loss", "perplexity", "encodings" (one-hot codes of the
            flattened inputs), "encoding_indices" and "distances".
        """
        flat_inputs = tf.reshape(data, [-1, self.embedding_dim])

        # Squared Euclidean distance between every input row and every
        # codebook column, expanded as |x|^2 - 2 x.e + |e|^2.
        distances = (
            tf.math.reduce_sum(tf.math.square(flat_inputs), 1, keepdims=True) -
            2 * tf.matmul(flat_inputs, self.embeddings) +
            tf.math.reduce_sum(tf.math.square(self.embeddings), 0, keepdims=True))

        encoding_indices = tf.math.argmax(-distances, 1)
        encodings = tf.one_hot(encoding_indices,
                               self.num_embeddings,
                               dtype=distances.dtype)

        # Use the dynamic shape so inputs with unknown (None) leading
        # dimensions can be reshaped as well.
        encoding_indices = tf.reshape(encoding_indices, tf.shape(data)[:-1])
        quantized = self.quantize(encoding_indices)

        e_latent_loss = tf.math.reduce_mean(
            tf.math.square(tf.stop_gradient(quantized) - data))
        q_latent_loss = tf.math.reduce_mean(
            tf.math.square(quantized - tf.stop_gradient(data)))
        loss = q_latent_loss + self.commitment_cost * e_latent_loss

        # Straight-through estimator: forward value is `quantized`, the
        # gradient flows to `data` unchanged.
        quantized = data + tf.stop_gradient(quantized - data)
        avg_probs = tf.math.reduce_mean(encodings, 0)
        perplexity = tf.math.exp(-tf.math.reduce_sum(avg_probs * tf.math.log(avg_probs + 1e-10)))

        return {
            "quantize": quantized,
            "loss": loss,
            "perplexity": perplexity,
            "encodings": encodings,
            "encoding_indices": encoding_indices,
            "distances": distances,
        }

    def quantize(self, encoding_indices):
        """Return the codebook vectors selected by `encoding_indices`."""
        w = tf.transpose(self.embeddings, [1, 0])
        # TensorFlow's __getitem__ only accepts scalar index tensors, so a
        # NumPy-style `w[(encoding_indices,)]` lookup is rejected; tf.gather
        # performs the intended row lookup for indices of any rank.
        return tf.gather(w, encoding_indices)
# --------------------------------------------------------------------------------
# /Note/nn/layer/zeropadding1d.py:
# --------------------------------------------------------------------------------
import tensorflow as tf


def normalize_tuple(value, n, allow_zero=False):
    """Turn `value` into a tuple of `n` entries and validate them.

    Args:
        value: An int (repeated `n` times) or an iterable of `n` values.
        n: Required length of the resulting tuple.
        allow_zero: If True, entries must be `>= 0`; otherwise `> 0`.

    Returns:
        A tuple of length `n`.

    Raises:
        ValueError: If `value` is not iterable, has the wrong length,
            contains an entry that `int()` cannot convert, or contains an
            entry that violates the sign requirement.
    """
    error_msg = (
        f"Expected an integer or an iterable of {n} integers. "
        f"Received: {value}"
    )

    if isinstance(value, int):
        value_tuple = (value,) * n
    else:
        try:
            value_tuple = tuple(value)
        except TypeError:
            raise ValueError(error_msg)
        if len(value_tuple) != n:
            raise ValueError(error_msg)
        for single_value in value_tuple:
            try:
                int(single_value)
            except (ValueError, TypeError):
                error_msg += (
                    f" including element {single_value} of "
                    f"type {type(single_value)}"
                )
                raise ValueError(error_msg)

    if allow_zero:
        unqualified_values = {v for v in value_tuple if v < 0}
        req_msg = ">= 0"
    else:
        unqualified_values = {v for v in value_tuple if v <= 0}
        req_msg = "> 0"

    if unqualified_values:
        error_msg += (
            f" including {unqualified_values}"
            f" that does not satisfy the requirement `{req_msg}`."
        )
        raise ValueError(error_msg)

    return value_tuple


class zeropadding1d:
    """Zero-pad axis 1 of a rank-3 tensor.

    Args:
        input_size: Optional value stored as `input_size`; when given it is
            also stored unchanged as `output_size`.
        padding: Optional int or pair `(left, right)`. When given, the
            padding pattern is fixed at construction time; otherwise the
            `padding` argument of `__call__` is used.
    """

    def __init__(self, input_size=None, padding=None):
        self.pattern = None
        if padding is not None:
            padding = normalize_tuple(
                padding, 2, allow_zero=True
            )
            self.pattern = [[0, 0], [padding[0], padding[1]], [0, 0]]
        self.input_size = input_size
        if input_size is not None:
            self.output_size = input_size

    def __call__(self, data, padding=1):
        """Pad `data` with zeros; `padding` is ignored if set in `__init__`."""
        if self.pattern is None:
            padding = normalize_tuple(
                padding, 2, allow_zero=True
            )
            pattern = [[0, 0], [padding[0], padding[1]], [0, 0]]
        else:
            pattern = self.pattern
        return tf.pad(data, pattern)
# --------------------------------------------------------------------------------
# /Note/nn/nan_to_num.py:
# --------------------------------------------------------------------------------
import tensorflow as tf


def nan_to_num(tensor, nan=0.0, out=None):
    """Replace NaN entries of `tensor` with the value `nan`.

    Only NaN is replaced; infinite values are left untouched.

    Args:
        tensor: Input tensor.
        nan: Replacement value, converted to the dtype of `tensor`.
        out: Optional object with an `assign` method; when given, the result
            is assigned to it and `out` is returned.

    Returns:
        `out` if it was provided, otherwise the new tensor.
    """
    result = tf.where(tf.math.is_nan(tensor), tf.constant(nan, dtype=tensor.dtype), tensor)
    if out is not None:
        out.assign(result)
        return out
    return result
# --------------------------------------------------------------------------------
# /Note/nn/narrow.py:
# --------------------------------------------------------------------------------
import tensorflow as tf


def narrow(tensor, dim, start, size):
    """Return a slice of `tensor` restricted along one axis.

    Args:
        tensor: Input tensor.
        dim: Axis to narrow; negative values count from the last axis.
        start: First index kept along `dim`.
        size: Number of elements kept along `dim`.

    Returns:
        `tensor[..., start:start + size, ...]` taken along axis `dim`; all
        other axes are kept in full.
    """
    rank = tf.rank(tensor)
    shape = tf.shape(tensor)
    dim = tf.where(dim < 0, dim + rank, dim)
    # `begin` is zero everywhere except at position `dim`, where it is `start`.
    before = tf.zeros([dim], dtype=tf.int32)
    after = tf.zeros([rank - dim - 1], dtype=tf.int32)
    begin = tf.concat([before, tf.expand_dims(start, 0), after], axis=0)
    # The slice size equals the full extent on every axis except `dim`.
    one_hot = tf.one_hot(dim, rank, dtype=tf.int32)
    size_for_tf_slice = (shape * (1 - one_hot)) + size * one_hot
    return tf.slice(tensor, begin, size_for_tf_slice)

# --------------------------------------------------------------------------------
# /Note/nn/optimizer/optimizer.py:
# --------------------------------------------------------------------------------
from keras.src import backend
from keras.src.api_export import keras_export
from Note.nn.optimizers import base_optimizer

# Pick the backend-specific optimizer mixin that matches the active Keras
# backend; fall back to the plain base optimizer for any other backend.
if backend.backend() == "tensorflow":
    from keras.src.backend.tensorflow.optimizer import (
        TFOptimizer as BackendOptimizer,
    )
elif backend.backend() == "torch":
    from keras.src.backend.torch.optimizers import (
        TorchOptimizer as BackendOptimizer,
    )
elif backend.backend() == "jax":
    from keras.src.backend.jax.optimizer import JaxOptimizer as BackendOptimizer
else:

    class BackendOptimizer(base_optimizer.BaseOptimizer):
        pass


@keras_export(["keras.Optimizer", "keras.optimizers.Optimizer"])
class Optimizer(BackendOptimizer, base_optimizer.BaseOptimizer):
    pass


# Re-use the base optimizer's documentation and keyword-argument text.
Optimizer.__doc__ = base_optimizer.BaseOptimizer.__doc__
base_optimizer_keyword_args = base_optimizer.base_optimizer_keyword_args
# --------------------------------------------------------------------------------
# /Note/nn/optimizer/orthograd.py:
# --------------------------------------------------------------------------------
""" OrthoGrad
https://arxiv.org/abs/2501.04697

Copyright 2025 NoteDance
"""
import tensorflow as tf
from keras.src.optimizers import optimizer


class OrthoGrad(optimizer.Optimizer):
    """Wrapper that orthogonalises gradients before a base optimizer step.

    Each gradient is projected to be orthogonal to its weight tensor,
    rescaled to its original norm, and then handed to `base_optimizer`.

    Args:
        base_optimizer: Optimizer object whose `apply_gradients` performs
            the actual parameter update.
        name: Name of this optimizer.
    """

    def __init__(
        self,
        base_optimizer=None,
        name="orthograd",
    ):
        super().__init__(
            learning_rate=1.,
            name=name,
        )
        self.base_optimizer = base_optimizer
        # Defined up front so update_step also works when `apply` is called
        # directly instead of going through `apply_gradients`.
        self.tape = None

    def reset(self):
        pass

    def build(self, var_list):
        if self.built:
            return
        super().build(var_list)

    def _orthogonalize_gradients(self, params, grads):
        """
        Projects the gradient g to be orthogonal to the current weights w.

        g_orth = g - ( (w·g)/(w·w + eps) ) * w

        And then re-scales g_orth to have the same norm as g.

        Args:
            params: Variables, in the same order as `grads`.
            grads: Gradients; a list is updated in place, any other
                sequence is copied into a list first.

        Returns:
            The list of orthogonalised gradients (entries that were `None`
            are left as `None`).
        """
        # `grads` is a tuple when it comes from `zip(*grads_and_vars)`, and
        # tuples do not support item assignment.
        if not isinstance(grads, list):
            grads = list(grads)

        for index, (p, g) in enumerate(zip(params, grads)):
            if g is None:
                continue
            w = tf.reshape(p, [-1])
            g_flat = tf.reshape(g, [-1])

            w_norm_sq = tf.tensordot(w, w, axes=1) + 1e-30
            proj = tf.tensordot(w, g_flat, axes=1) / w_norm_sq
            g_orth = g_flat - proj * w

            g_norm = tf.norm(g_flat, ord=2)
            g_orth_norm = tf.norm(g_orth, ord=2) + 1e-30
            g_orth_scaled = g_orth * (g_norm / g_orth_norm)

            # Restore the gradient's own shape (not the flattened one) and
            # write it back at the position that pairs with `p`.
            grads[index] = tf.reshape(g_orth_scaled, tf.shape(g))
        return grads

    def _backend_update_step(self, grads, trainable_variables, learning_rate):
        """Collective update_step that can be overridden by the backend.

        It is overridden by torch for performance reasons, and
        by TF to support tf.distribute.
        """
        self.update_step(grads, trainable_variables, learning_rate)

    def apply_gradients(self, grads_and_vars, tape=None):
        """Apply `(gradient, variable)` pairs; `tape` is forwarded to the base optimizer."""
        self.tape = tape
        grads, trainable_variables = zip(*grads_and_vars)
        self.apply(grads, trainable_variables)
        # Return iterations for compat with tf.keras.
        return self._iterations

    def update_step(self, grads, trainable_variables, learning_rate):
        """Orthogonalise `grads` and delegate the update to the base optimizer."""
        grads = self._orthogonalize_gradients(trainable_variables, grads)
        if self.tape is None:
            self.base_optimizer.apply_gradients(zip(grads, trainable_variables))
        else:
            self.base_optimizer.apply_gradients(zip(grads, trainable_variables), self.tape)

    def get_config(self):
        config = super().get_config()
        # NOTE(review): the base optimizer is stored as a live object, not
        # as a serialised config - confirm this is what callers expect.
        config.update(
            {
                "base_optimizer": self.base_optimizer,
            }
        )
        return config

    def _apply_weight_decay(self, variables):
        pass
# --------------------------------------------------------------------------------
# /Note/nn/pairwise_distance.py:
# --------------------------------------------------------------------------------
import tensorflow as tf


def pairwise_distance(x, y, p=2, eps=1e-6, keepdim=False):
    """Return the p-norm of `x - y + eps` over the last axis.

    Args:
        x: First tensor.
        y: Second tensor, subtracted from `x`.
        p: Order of the norm.
        eps: Value added to the difference before the norm is taken.
        keepdim: If true, keep the reduced last axis with size 1.

    Returns:
        Tensor of distances with the last axis reduced (or kept as size 1).
    """
    diff = tf.math.subtract(x, y) + eps
    norm = tf.math.reduce_sum(tf.math.abs(diff ** p), axis=-1) ** (1/p)
    if keepdim:
        norm = tf.expand_dims(norm, -1)
    return norm
# --------------------------------------------------------------------------------
# /Note/nn/parallel/assign_device.py:
# --------------------------------------------------------------------------------
import tensorflow as tf


def assign_device(p, device):
    """Choose a TensorFlow device for process index `p`.

    Args:
        p: Process index; devices are assigned round-robin (`p` modulo the
            number of physical devices of the requested type).
        device: Device type string passed to
            `tf.config.list_physical_devices`, e.g. 'GPU'.

    Returns:
        A device string such as '/GPU:1', or '/CPU:0' when no physical
        device of the requested type exists.

    Raises:
        RuntimeError: Propagated from `tf.config.set_visible_devices`.
    """
    devices = tf.config.list_physical_devices(device)
    if not devices:
        # No device of the requested type: use the CPU as default.
        return '/CPU:0'
    index = p % len(devices)
    tf.config.set_visible_devices(devices[index], device)
    # NOTE(review): after restricting visibility to a single physical device
    # the logical device may be numbered 0 rather than `index` - confirm
    # against how callers use the returned string.
    return '/' + device + ':' + str(index)

# --------------------------------------------------------------------------------
# /Note/nn/parallel/assign_device_pytorch.py:
# --------------------------------------------------------------------------------
import torch


def assign_device(p, device):
    """Choose a `torch.device` for process index `p`.

    Args:
        p: Process index; CUDA devices are assigned round-robin.
        device: Either 'GPU' or 'CPU'.

    Returns:
        A `torch.device`; the CPU device is returned for 'CPU' and also for
        'GPU' when CUDA is not available.

    Raises:
        ValueError: If `device` is neither 'GPU' nor 'CPU'.
    """
    if device == 'GPU':
        if not torch.cuda.is_available():
            # No CUDA device available: use the CPU as default.
            return torch.device('cpu')
        index = p % torch.cuda.device_count()
        torch.cuda.set_device(index)
        return torch.device('cuda', index)
    if device == 'CPU':
        return torch.device('cpu')
    raise ValueError('Invalid device type')

# --------------------------------------------------------------------------------
# /Note/nn/parameter.py:
# --------------------------------------------------------------------------------
import tensorflow as tf
from Note.nn.Model import Model


def Parameter(data, trainable=True, name=None):
    """Create a `tf.Variable` and register it on the `Model` class.

    Args:
        data: Initial value of the variable.
        trainable: Whether the variable is trainable.
        name: Optional variable name.

    Returns:
        The created `tf.Variable`. It is appended to `Model.param` and,
        depending on `Model.name` / `Model.name_`, to
        `Model.layer_param[Model.name]`.
    """
    # Build the variable once so that `trainable` is honoured even when a
    # name is supplied (re-wrapping the variable dropped that flag).
    param = tf.Variable(data, trainable=trainable, name=name)
    Model.param.append(param)
    if Model.name is not None and Model.name not in Model.layer_param:
        Model.layer_param[Model.name] = []
        Model.layer_param[Model.name].append(param)
    elif Model.name_ is not None:
        # NOTE(review): the condition tests `Model.name_` but the key used
        # is `Model.name` - confirm this is intentional.
        Model.layer_param[Model.name].append(param)
    return param
# --------------------------------------------------------------------------------
# /Note/nn/pos_embed.py:
# --------------------------------------------------------------------------------
""" Position Embedding Utilities

Hacked together by / Copyright 2024 NoteDance
"""
import logging
import math
from typing import List, Optional

import tensorflow as tf
from Note import nn

_logger = logging.getLogger(__name__)


def resample_abs_pos_embed(
        posemb,
        new_size: List[int],
        old_size: Optional[List[int]] = None,
        num_prefix_tokens: int = 1,
        interpolation: str = 'bicubic',
        antialias: bool = True,
        verbose: bool = False,
):
    """Resize a flattened absolute position embedding to a new grid size.

    Args:
        posemb: Position embedding indexed as (1, tokens, dim); the first
            `num_prefix_tokens` tokens are prefix tokens, the remainder is a
            flattened 2-D grid.
        new_size: Target grid size as (height, width).
        old_size: Current grid size; when omitted a square grid is assumed.
        num_prefix_tokens: Number of leading tokens that are not resized.
        interpolation: Interpolation mode passed to `nn.interpolate`.
        antialias: Antialias flag passed to `nn.interpolate`.
        verbose: If True, log the old and new grid sizes.

    Returns:
        The resized embedding, with the prefix tokens re-attached and the
        original dtype restored; `posemb` itself if no resize is needed.
    """
    # sort out sizes, assume square if old size not provided
    num_pos_tokens = posemb.shape[1]
    num_new_tokens = new_size[0] * new_size[1] + num_prefix_tokens
    if num_new_tokens == num_pos_tokens and new_size[0] == new_size[1]:
        return posemb

    if old_size is None:
        hw = int(math.sqrt(num_pos_tokens - num_prefix_tokens))
        old_size = hw, hw

    if num_prefix_tokens:
        posemb_prefix, posemb = posemb[:, :num_prefix_tokens], posemb[:, num_prefix_tokens:]
    else:
        posemb_prefix, posemb = None, posemb

    # do the interpolation
    embed_dim = posemb.shape[-1]
    orig_dtype = posemb.dtype
    posemb = tf.cast(posemb, 'float32')  # interpolate needs float32
    posemb = tf.reshape(posemb, (1, old_size[0], old_size[1], -1))
    posemb = nn.interpolate(posemb, size=new_size, mode=interpolation, antialias=antialias)
    posemb = tf.reshape(posemb, (1, -1, embed_dim))
    posemb = tf.cast(posemb, orig_dtype)

    # add back extra (class, etc) prefix tokens
    if posemb_prefix is not None:
        posemb = tf.concat([posemb_prefix, posemb], axis=1)

    if verbose:
        _logger.info(f'Resized position embedding: {old_size} to {new_size}.')

    return posemb


def resample_abs_pos_embed_nhwc(
        posemb,
        new_size: List[int],
        interpolation: str = 'bicubic',
        antialias: bool = True,
        verbose: bool = False,
):
    """Resize a position embedding whose grid axes are at positions -3 and -2.

    Args:
        posemb: Position embedding with the grid on axes -3 and -2.
        new_size: Target grid size as (height, width).
        interpolation: Interpolation mode passed to `nn.interpolate`.
        antialias: Antialias flag passed to `nn.interpolate`.
        verbose: If True, log the old and new grid sizes.

    Returns:
        The resized embedding in its original dtype, or `posemb` itself if
        the grid already has the requested size.
    """
    if new_size[0] == posemb.shape[-3] and new_size[1] == posemb.shape[-2]:
        return posemb

    # Remember the size before resizing so the log reports the real old size.
    old_size = posemb.shape[-3:-1]
    orig_dtype = posemb.dtype
    posemb = tf.cast(posemb, 'float32')
    posemb = nn.interpolate(posemb, size=new_size, mode=interpolation, antialias=antialias)
    posemb = tf.cast(posemb, orig_dtype)

    if verbose:
        _logger.info(f'Resized position embedding: {old_size} to {new_size}.')

    return posemb
# --------------------------------------------------------------------------------
# /Note/nn/positional_encoding.py:
# --------------------------------------------------------------------------------
import tensorflow as tf


def positional_encoding(max_len, d_model):
    """Build a sinusoidal positional-encoding table.

    Even feature indices `i` hold `sin(pos / 10000**(i / d_model))` and odd
    indices hold `cos(pos / 10000**((i - 1) / d_model))`.

    Args:
        max_len: Number of positions (rows).
        d_model: Number of features (columns).

    Returns:
        A float32 tensor of shape (max_len, d_model).
    """
    # Work in float32 throughout: dividing the int32 ranges directly mixes
    # int32 with float64 operands, which TensorFlow rejects.
    pos = tf.cast(tf.range(max_len), tf.float32)[:, tf.newaxis]
    i = tf.range(d_model)[tf.newaxis, :]
    even_mask = i % 2 == 0
    i = tf.cast(i, tf.float32)
    # Odd columns share the frequency of the even column before them.
    exponent = tf.where(even_mask, i, i - 1) / tf.cast(d_model, tf.float32)
    angles = pos / tf.pow(10000.0, exponent)
    pos_enc = tf.where(even_mask, tf.math.sin(angles), tf.math.cos(angles))
    return pos_enc
# --------------------------------------------------------------------------------
# /Note/nn/restore.py:
# --------------------------------------------------------------------------------
import tensorflow as tf
import pickle


def restore(path):
    """Load a pickled model and optimizer from `path`.

    The file is expected to hold two consecutive pickles: the model, then
    a serialised optimizer that `tf.keras.optimizers.deserialize` accepts.

    Args:
        path: Path of the file to read.

    Returns:
        A `(model, optimizer)` tuple.
    """
    # NOTE: pickle can execute arbitrary code while loading; only use this
    # on files from a trusted source.
    with open(path, 'rb') as input_file:
        model = pickle.load(input_file)
        optimizer = tf.keras.optimizers.deserialize(pickle.load(input_file))
    return model, optimizer


def restore_param(path):
    """Load and return the first pickled object stored in `path`."""
    # NOTE: pickle can execute arbitrary code while loading; only use this
    # on files from a trusted source.
    with open(path, 'rb') as input_file:
        param = pickle.load(input_file)
    return param

# --------------------------------------------------------------------------------
# /Note/nn/scaled_dot_product_attention.py:
# --------------------------------------------------------------------------------
import tensorflow as tf
import math


def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None):
    """Compute softmax(Q Kᵀ * scale + bias) V.

    Args:
        query: Tensor of shape (..., L, E).
        key: Tensor of shape (..., S, E).
        value: Tensor of shape (..., S, Ev).
        attn_mask: Optional mask broadcastable to (L, S). A boolean mask
            keeps positions that are True; any other dtype is added to the
            attention scores.
        dropout_p: Dropout rate applied to the attention weights. Dropout is
            applied whenever this is non-zero; there is no training flag.
        is_causal: If True, apply a lower-triangular causal mask;
            `attn_mask` must then be None.
        scale: Optional scaling factor; defaults to 1 / sqrt(E).

    Returns:
        Tensor of shape (..., L, Ev).
    """
    L, S = query.shape[-2], key.shape[-2]
    scale_factor = 1 / math.sqrt(query.shape[-1]) if scale is None else scale
    attn_bias = tf.zeros((L, S), dtype=query.dtype)
    if is_causal:
        assert attn_mask is None
        # Keep the lower triangle (including the diagonal), mask the rest.
        temp_mask = tf.linalg.band_part(tf.ones((L, S), dtype=tf.bool), -1, 0)
        attn_bias = tf.where(temp_mask, attn_bias, float("-inf"))
        attn_bias = tf.cast(attn_bias, query.dtype)

    if attn_mask is not None:
        if attn_mask.dtype == tf.bool:
            attn_bias = tf.where(attn_mask, attn_bias, float("-inf"))
        else:
            attn_bias += attn_mask
    # transpose_b swaps only the last two axes, so inputs of any rank >= 2
    # are supported instead of rank-4 tensors only.
    attn_weight = tf.matmul(query, key, transpose_b=True) * scale_factor
    attn_weight += attn_bias
    attn_weight = tf.nn.softmax(attn_weight, axis=-1)
    attn_weight = tf.nn.dropout(attn_weight, dropout_p)
    return tf.matmul(attn_weight, value)

# --------------------------------------------------------------------------------
# /Note/nn/softplus.py:
# --------------------------------------------------------------------------------
import tensorflow as tf


def softplus(x, beta=1.0, threshold=20.0):
    """Compute `log(1 + exp(beta * x)) / beta` with a linear tail.

    Where `beta * x` exceeds `threshold` the function returns `x` itself.

    Args:
        x: Input tensor.
        beta: Sharpness factor.
        threshold: Value of `beta * x` above which the identity is used.

    Returns:
        Tensor with the same shape as `x`.
    """
    if beta != 1.0:
        x = x * beta
    # Clamp the argument of exp: tf.where evaluates both branches, and an
    # overflowing exp in the unselected branch produces NaN gradients.
    clamped = tf.minimum(x, threshold)
    x = tf.where(
        x > threshold,
        x,
        tf.math.log1p(tf.exp(clamped))
    )
    if beta != 1.0:
        x = x / beta
    return x
# --------------------------------------------------------------------------------
# /Note/nn/solve_triangular.py:
# --------------------------------------------------------------------------------
import tensorflow as tf


def solve_triangular(A, B, *, upper, left=True, unitriangular=False):
    """Solve a triangular linear system.

    Args:
        A: Triangular matrix (or batch of matrices) of shape (..., n, n).
        B: Right-hand side.
        upper: True if `A` is upper triangular, False if lower triangular.
        left: If True solve `A X = B`; otherwise solve `X A = B`.
        unitriangular: If True, the diagonal of `A` is treated as ones.

    Returns:
        The solution `X`.
    """
    if unitriangular:
        diag_shape = tf.shape(tf.linalg.diag_part(A))
        ones = tf.ones(diag_shape, dtype=A.dtype)
        A = tf.linalg.set_diag(A, ones)

    if left:
        X = tf.linalg.triangular_solve(A, B, lower=not upper)
    else:
        # X A = B  <=>  Aᵀ Xᵀ = Bᵀ, and Aᵀ is lower triangular exactly when
        # A is upper triangular. matrix_transpose swaps only the last two
        # axes, so batched inputs keep their batch axes in place.
        X_T = tf.linalg.triangular_solve(
            tf.linalg.matrix_transpose(A),
            tf.linalg.matrix_transpose(B),
            lower=upper,
        )
        X = tf.linalg.matrix_transpose(X_T)
    return X
# --------------------------------------------------------------------------------
# /Note/nn/sparse_mask.py:
# --------------------------------------------------------------------------------
import tensorflow as tf


def sparse_mask(dense_tensor, mask_sparse):
    """Sample `dense_tensor` at the indices of a sparse tensor.

    Args:
        dense_tensor: Dense tensor to read values from.
        mask_sparse: Sparse tensor whose `indices` and `dense_shape` define
            the result.

    Returns:
        A `tf.sparse.SparseTensor` with the indices and dense shape of
        `mask_sparse` and values gathered from `dense_tensor`.
    """
    indices = mask_sparse.indices  # [N, ndims]
    values = tf.gather_nd(dense_tensor, indices)
    return tf.sparse.SparseTensor(indices=indices, values=values, dense_shape=mask_sparse.dense_shape)
# --------------------------------------------------------------------------------
# /Note/sr.py:
# --------------------------------------------------------------------------------
import pickle


def save(data, path):
    """Pickle `data` to the file at `path`, overwriting any existing file."""
    # The context manager closes the file even if pickling fails.
    with open(path, 'wb') as output_file:
        pickle.dump(data, output_file)
    return


def restore(path):
    """Load and return the first pickled object stored in `path`."""
    # NOTE: pickle can execute arbitrary code while loading; only use this
    # on files from a trusted source.
    with open(path, 'rb') as input_file:
        data = pickle.load(input_file)
    return data

# --------------------------------------------------------------------------------
# /Note/version.py:
# --------------------------------------------------------------------------------
version = '7.0'
date = '2023.7.7'

# --------------------------------------------------------------------------------