├── .gitignore ├── .pre-commit-config.yaml ├── .style.yapf ├── Fluid ├── developer's_guide_for_Fluid │ ├── Developer's_Guide_to_Paddle_Fluid.md │ ├── Developer's_Guide_to_Paddle_Fluid.pdf │ └── images │ │ ├── 1.png │ │ ├── 2.png │ │ ├── 3.png │ │ ├── 4.png │ │ ├── LoDTensor.png │ │ ├── compile_run_time.png │ │ ├── executor.png │ │ ├── fluid-compiler.png │ │ ├── fluid_examples.png │ │ ├── fluid_module_1.png │ │ ├── fluid_module_2.png │ │ ├── graph_construction_example_all.png │ │ ├── layer.png │ │ ├── operator1.png │ │ ├── operator2.png │ │ ├── place.png │ │ ├── print_fluid_program.png │ │ ├── program_desc1.png │ │ ├── program_desc2.png │ │ ├── raw_input.png │ │ ├── scope_variable_tensor.png │ │ ├── sorted_input.png │ │ ├── transpiler.png │ │ └── user_interface.png └── nmt_on_fluid │ ├── NMT on fluid.md │ └── images │ ├── 1.png │ ├── 2.png │ ├── 3.png │ ├── 4.png │ ├── attention.png │ ├── raw_input.png │ ├── sorted_input.png │ └── user_interface.png ├── README.md ├── TeXNotes ├── .gitignore ├── 00_templates │ ├── assignment │ │ ├── contents.tex │ │ ├── figures │ │ │ └── figure1.png │ │ ├── main.tex │ │ ├── references.bib │ │ └── structure.tex │ └── slides │ │ ├── slides.tex │ │ └── structure.tex ├── Efficient_attention_and_RNNs │ ├── contents │ │ ├── linear_rnn.tex │ │ ├── loop_bounds.tex │ │ ├── miscellany.tex │ │ ├── parallel_rnn.tex │ │ ├── scan.tex │ │ ├── ssm.tex │ │ └── stacked_rnns.tex │ ├── define_language.tex │ ├── figures │ │ ├── SSM-overview.pdf │ │ ├── attention-train.pdf │ │ ├── attention.pdf │ │ ├── cond_branchs.pdf │ │ ├── figures.pptx │ │ ├── mamba-mixer.pdf │ │ ├── mamba-model.pdf │ │ ├── mamba-ssm.pdf │ │ ├── rnn_layer1.pdf │ │ ├── rnn_layer2.pdf │ │ ├── scan_step.pdf │ │ ├── signal_flow_structure_of_stacked_rnn.pdf │ │ ├── stacked_rnns1.pdf │ │ └── stacked_rnns2.pdf │ ├── main.pdf │ ├── main.tex │ ├── references.bib │ └── structure.tex ├── Flash_Attention │ ├── contents │ │ ├── CTA_offset.tex │ │ ├── IO_complexity.tex │ │ ├── flash_attention.tex │ 
│ ├── fuse_reduce.tex │ │ └── online_softmax.tex │ ├── figures │ │ ├── attention_offset-a.pdf │ │ ├── attention_offset-b.pdf │ │ ├── fuse_reduce.pdf │ │ └── logsoftmax_expression_tree.pdf │ ├── main.pdf │ ├── main.tex │ ├── references.bib │ └── structure.tex ├── Formalize_Flash_Attention │ ├── FlashAttention_formalization.pdf │ ├── contents │ │ ├── Background.tex │ │ ├── backward.tex │ │ ├── beyond_flash_attention.tex │ │ ├── fused_chained_map_and_then_aggregate.tex │ │ ├── map_and_then_aggreage.tex │ │ └── welford_algorithm.tex │ ├── figures │ │ ├── Transformer-block.pdf │ │ ├── Transformer-block.png │ │ ├── attention.pptx │ │ ├── attention_expression_tree1.pdf │ │ ├── attention_expression_tree2.pdf │ │ ├── attention_expression_tree3.pdf │ │ ├── fused_mha.pdf │ │ ├── fused_transformer_block.pdf │ │ ├── map_and_aggregate.pdf │ │ ├── multi-scale-attn1.pdf │ │ ├── multi-scale-attn2.pdf │ │ └── transformer.png │ ├── main.pdf │ ├── main.tex │ ├── references.bib │ └── structure.tex ├── LLM_inference │ ├── contents │ │ ├── background.tex │ │ ├── bytetransformer.tex │ │ ├── deepspeed-inference.tex │ │ └── flexgen.tex │ ├── images │ │ ├── BERT-performance-breakdown.pdf │ │ ├── ILP.png │ │ ├── LLM-inference-graph.png │ │ ├── LLM.xlsx │ │ ├── MHA-variable-seq-length.pdf │ │ ├── SBI-GEMM.pdf │ │ ├── block-schedule-algorithm.png │ │ ├── block-schedule.pdf │ │ ├── byte-transformer-overview2.pdf │ │ ├── bytetransformer-padding-free-input-batch.pdf │ │ ├── bytetransformer_overview.pdf │ │ ├── different-batch-size-in-pipeline-parallelism.png │ │ ├── fused_transformer_block-deepspeed-inference.pdf │ │ ├── grouped-gemm.pdf │ │ ├── grouped-mha.pdf │ │ ├── llm-inference.pdf │ │ ├── llm-inference.pptx │ │ ├── logo.jpeg │ │ ├── memory-system.pdf │ │ ├── pipeline-schedule-deep-speed-inference.png │ │ ├── two-stages-in-llm-inference.pdf │ │ ├── variable-length-mha.pdf │ │ └── zero-padding-algorithm.pdf │ ├── llm_inference.pdf │ ├── llm_inference.tex │ └── references.bib └── 
Parallel_Execution_of_DO_Loops │ ├── basics.tex │ ├── example.tex │ ├── hyperplane.tex │ ├── hypertheorem.tex │ ├── main.pdf │ ├── main.tex │ ├── optimalpi.tex │ ├── references.bib │ └── structure.tex ├── code_reading ├── README.md └── pet │ ├── README.md │ ├── basics.md │ ├── clang_pulgin.md │ ├── extract_scop.md │ └── tree2scop.md ├── engineering_a_compiler └── scanner.md ├── julia_learning_notes ├── Basics │ ├── 01_JPL_FQ.md │ ├── 02_JPL_gpu_computing.md │ ├── 03_JPL_metaprogramming.md │ ├── Generated_function.ipynb │ ├── Good_references.md │ ├── Julia_ASTs.ipynb │ ├── Macro.ipynb │ ├── Tips.ipynb │ ├── Type_inference │ │ ├── README.md │ │ └── Type_inference.pdf │ ├── Types.ipynb │ ├── WhyJulia │ │ ├── README.md │ │ └── whyJulia.pdf │ ├── broadcast.ipynb │ └── images │ │ └── gpu_julia_packages.png ├── CodeSnippets │ ├── UnionAll_types.ipynb │ ├── assignment_and_mutation.ipynb │ ├── infer_type_parameter.ipynb │ ├── modules.ipynb │ ├── parameteric_types.ipynb │ └── scope.ipynb ├── Flux │ ├── Flux_implementation.md │ ├── Test_user_interface.ipynb │ └── char_rnn_example.ipynb ├── IRTools │ ├── Meta.ipynb │ └── _methods_by_ftype.ipynb ├── README.md ├── Zygote │ ├── CodesStudy1.ipynb │ ├── CodesStudy2.ipynb │ └── code_snippets │ │ ├── hello_world_test.jl │ │ └── program_representation_in_Julia.ipynb └── experiments │ ├── README.md │ └── lstm_test │ ├── README.md │ ├── naive_cpu_test │ ├── README.md │ ├── Recurrent │ │ ├── common.jl │ │ ├── lstm.jl │ │ └── recurrent.jl │ └── cpu_test.jl │ └── naive_gpu_test │ ├── Recurrent │ ├── lstm.jl │ └── recurrent.jl │ └── gpu_test.jl ├── notes_for_tf_pt ├── compile-pt.md └── compile-tf.md ├── paper_notes ├── Diffusion │ └── README.md ├── README.md ├── Template.md ├── array-programming-model │ └── README.md ├── auto-diff │ ├── BP_and_implicit_function_theorem │ │ └── README.md │ ├── Dynamic_Automatic_Differentiation_of_GPU_Broadcast_Kernels.md │ ├── SCT_AD │ │ ├── Differentiating_SSA_form_program.md │ │ └── README.md │ ├── 
brief_introduction_to_AD.md │ ├── images │ │ ├── expression_graph.png │ │ ├── mix_forward_and_reverse_mode_AD.png │ │ └── multidimensional_dual_numbers.png │ └── tape_based_ad.md ├── compiler-stuffs │ ├── CFG-optimizations │ │ ├── Control-Flow-Optimization.md │ │ ├── README.md │ │ └── images │ │ │ └── discussion-on-cfg.png │ ├── Code-optimizations │ │ ├── dmxpy-optimizations.ipynb │ │ ├── dmxpy.py │ │ ├── images │ │ │ ├── dmxpy.png │ │ │ ├── dmxpy.pptx │ │ │ ├── excerpt-from-dmxpy-in-linpack.png │ │ │ ├── manual-optimizations-for-dmxpy.png │ │ │ ├── simple-version-of-dmxpy.png │ │ │ └── strength-reduction.png │ │ ├── introduction-to-optimizations.md │ │ └── local-optimizatioins.md │ ├── Compiler_and_Interpreter.pdf │ ├── Dependence_analysis │ │ ├── README.md │ │ └── dependence_abstraction │ │ │ ├── contents.tex │ │ │ ├── main.tex │ │ │ ├── references.bib │ │ │ └── structure.tex │ ├── Intermediate-Representations │ │ ├── IR.md │ │ └── images │ │ │ ├── AST-example.png │ │ │ ├── DAG-example.png │ │ │ ├── IR-level-of-abstraction.png │ │ │ ├── an-simple-example-of-dependency-graph.png │ │ │ ├── dependence-graph.png │ │ │ ├── different-levels-of-abstraction-for-an-array-subscript-reference.png │ │ │ ├── naming-leads-to-different-translations.png │ │ │ ├── one-address-code.png │ │ │ └── three-address-code.png │ ├── Loop_analysis │ │ ├── Denpendence_analysis │ │ │ ├── Basics │ │ │ │ ├── README.md │ │ │ │ ├── Terminology.md │ │ │ │ └── images │ │ │ │ │ ├── a_single_rnn.png │ │ │ │ │ ├── bi_directional_rnn.png │ │ │ │ │ ├── bi_directional_rnn.pptx │ │ │ │ │ ├── different_dependences.png │ │ │ │ │ ├── dilated_rnn.png │ │ │ │ │ ├── example-01.png │ │ │ │ │ ├── grid_rnn.png │ │ │ │ │ ├── polyhedral_representation.png │ │ │ │ │ ├── stack_rnn.png │ │ │ │ │ ├── wh.png │ │ │ │ │ └── workflow.png │ │ │ └── dependence_analysis.pdf │ │ ├── Polyhedral_representation_in_Pet │ │ │ ├── references.bib │ │ │ ├── section1.tex │ │ │ ├── slides.tex │ │ │ └── structure.tex │ │ ├── 
The_parallel_execution_of_do_loops.pdf │ │ └── auto-vectorization │ │ │ ├── README.md │ │ │ └── auto-vectorizing-TensorFlow-graphs.md │ ├── Polyhedral_compilatioin │ │ ├── A_Performance_Vocabulary_for_Affine_Loop_Transformations │ │ │ ├── contents.tex │ │ │ ├── main.tex │ │ │ ├── references.bib │ │ │ └── structure.tex │ │ ├── A_decopled_approach_to_high-level_loop_optimization.md │ │ ├── A_decoupled_approach_to_high-level_loop_optimization │ │ │ └── mathematical_foundations_of_polyhedra.md │ │ ├── Array_regions_analyses_and_applications │ │ │ └── Array_regions_analyses_and_applications.md │ │ ├── Data_dependence_and_PIP │ │ │ ├── contents.tex │ │ │ ├── main.tex │ │ │ ├── references.bib │ │ │ └── structure.tex │ │ ├── Mathmatical_foundations │ │ │ └── concepts.md │ │ ├── More_Legal_Transformations_for_Locality │ │ │ ├── contents.tex │ │ │ ├── images │ │ │ │ ├── SCoP_decomposition.png │ │ │ │ ├── a_skewing_transformation.png │ │ │ │ └── farkas_lemma.png │ │ │ ├── main.pdf │ │ │ ├── main.tex │ │ │ ├── references.bib │ │ │ └── structure.tex │ │ ├── Polyhedral_representation │ │ │ ├── 2d_puls_1.tex │ │ │ ├── main.tex │ │ │ ├── references.bib │ │ │ ├── schedule_tree.tex │ │ │ └── structure.tex │ │ ├── README.md │ │ ├── Tiramisu │ │ │ ├── README.md │ │ │ └── tiramisu_overview.png │ │ ├── images │ │ │ ├── introduction-01.png │ │ │ ├── introduction-02.png │ │ │ ├── introduction-03.png │ │ │ └── introduction-04.png │ │ ├── isl │ │ │ ├── contents.tex │ │ │ ├── main.pdf │ │ │ ├── main.tex │ │ │ ├── presburger_sets_and_relations.tex │ │ │ ├── pw_quasi_affine.tex │ │ │ ├── references.bib │ │ │ ├── sets_and_maps.tex │ │ │ └── structure.tex │ │ ├── mathematical_foundations_of_polyhedra.md │ │ ├── polyhedral_background_01.pdf │ │ ├── polyhedral_background_02.pdf │ │ └── polyhedral_background_03.pdf │ ├── README.md │ └── intermediate-code-generation.md ├── data_processing_systems │ ├── CIEL.md │ ├── README.md │ └── figures │ │ ├── ciel_cluster_architecture.png │ │ └── 
dynamic-task-graph.png ├── dataflow-architectures │ ├── Advances_in_dataflow_programming_languages.md │ ├── Dataflow_computers_their_history_and_future.md │ ├── README.md │ ├── Scheduled_dataflow.md │ ├── images │ │ ├── Manchester-dynamic-dataflow-machine.png │ │ ├── dataflow-accumulator.png │ │ ├── dataflow-graph-1.png │ │ ├── ill-formed_multi-rate-dataflow.png │ │ ├── img1.png │ │ ├── img2.png │ │ ├── paper-screenshot-1.png │ │ ├── periodic_admissible_schedule.png │ │ ├── solve_G.q.png │ │ ├── static-dataflow-architecture.png │ │ ├── topology_matrix.png │ │ └── two-input_add_actor_and_two-output_duplicate_actor.png │ └── synchronous_dataflow.md ├── dl-compiler │ ├── Glow │ │ ├── Glow.md │ │ ├── Glow.pdf │ │ └── images │ │ │ └── low-level-glow-ir.png │ ├── MLIR │ │ ├── MLIR.md │ │ ├── README.md │ │ ├── swift_for_tensorflow.md │ │ └── swift_for_tensorflow.pdf │ ├── README.md │ ├── TVM │ │ ├── Relay.md │ │ ├── TVM.md │ │ └── TVM.pdf │ ├── XLA │ │ └── XLA.md │ ├── figures │ │ └── sm-and-sub-core-of-volta.png │ └── fusion │ │ └── README.md ├── dl-models │ ├── attention_simplification │ │ └── README.md │ ├── beyond_transformer │ │ └── README.md │ ├── miscellanea │ │ ├── Geometric_deep_learning.md │ │ └── README.md │ ├── nlp │ │ ├── RNN-modeling │ │ │ ├── CW-RNN │ │ │ │ ├── A_Clockwork_RNN.md │ │ │ │ └── A_Clockwork_RNN.pdf │ │ │ ├── GridLSTM │ │ │ │ ├── GridLSTM.md │ │ │ │ └── GridLSTM.pdf │ │ │ ├── HM-LSTM │ │ │ │ ├── Hierarchical_multiscale_RNN.md │ │ │ │ └── Hierarchical_multiscale_RNN.pdf │ │ │ ├── How_Much_Attention_Do_You_Need.md │ │ │ ├── MD-LSTM │ │ │ │ ├── MD-LSTM.md │ │ │ │ └── MD-LSTM.pdf │ │ │ ├── Mogrifier-LSTM │ │ │ │ ├── contents.tex │ │ │ │ ├── images │ │ │ │ │ └── MogrifierLSTM.png │ │ │ │ ├── main.pdf │ │ │ │ ├── main.tex │ │ │ │ ├── references.bib │ │ │ │ └── structure.tex │ │ │ ├── Neural_Speed_Reading_via_Skim_RNN │ │ │ │ └── Neural_Speed_Reading_via_Skim_RNN.md │ │ │ ├── ON-LSTM │ │ │ │ ├── ON-LSTM.md │ │ │ │ └── ON-LSTM.pdf │ │ │ ├── 
Quasi-Recurrent_neural_network │ │ │ │ ├── Quasi-Recurrent_neural_network.md │ │ │ │ └── Quasi-Recurrent_neural_network.pdf │ │ │ ├── README.md │ │ │ ├── RNN_Variants_Slides_190820 │ │ │ │ ├── RNN_Variants.md │ │ │ │ ├── RNN_Variants.pdf │ │ │ │ └── images │ │ │ │ │ ├── CudnnLSTM.png │ │ │ │ │ └── Recurrent_neural_network_unfold.svg.png │ │ │ ├── Sliced_Recurrent_Neural_Networks │ │ │ │ ├── Sliced_Recurrent_Neural_Networks.md │ │ │ │ └── images │ │ │ │ │ └── SRNN.png │ │ │ ├── The_Unreasonable_Effectiveness_of_the_Forget_Gate │ │ │ │ ├── The_Unreasonable_Effectiveness_of_the_Forget_Gate.md │ │ │ │ └── The_Unreasonable_Effectiveness_of_the_Forget_Gate.pdf │ │ │ ├── Training_RNNs_as_Fast_as_CNNs │ │ │ │ ├── Training_RNNs_as_Fast_as_CNNs.md │ │ │ │ └── Training_RNNs_as_Fast_as_CNNs.pdf │ │ │ ├── Transformer │ │ │ │ ├── README.md │ │ │ │ ├── README.pdf │ │ │ │ └── images │ │ │ │ │ └── QK.png │ │ │ ├── WaveRNN.md │ │ │ └── images │ │ │ │ ├── 2d_lstm_1.png │ │ │ │ ├── 3D-GridLSTM.png │ │ │ │ ├── CWRNN.png │ │ │ │ ├── CWRNN.pptx │ │ │ │ ├── DilatedRNN.png │ │ │ │ ├── DilatedRNN1.png │ │ │ │ ├── GridLSTM-NMT.png │ │ │ │ ├── HM-LSTM-pre-activation.png │ │ │ │ ├── LSTM_equation.png │ │ │ │ ├── active_modules.png │ │ │ │ ├── active_modules.pptx │ │ │ │ ├── boundary_state.png │ │ │ │ ├── boundary_state.pptx │ │ │ │ ├── hardsigmoid.png │ │ │ │ ├── hm-lstm-cell-update.png │ │ │ │ ├── hm-lstm-output-hidden.png │ │ │ │ ├── multi-dimensioanl-rnn.png │ │ │ │ ├── multi-dimensional-multi-directional-context.png │ │ │ │ ├── wh.png │ │ │ │ └── wh.pptx │ │ ├── pre-training │ │ │ ├── ALBERT.md │ │ │ ├── BERT.md │ │ │ ├── ELMo.md │ │ │ ├── GPT.md │ │ │ ├── README.md │ │ │ ├── ULM-FiT.md │ │ │ ├── XLNet.md │ │ │ ├── images │ │ │ │ ├── ELMo.png │ │ │ │ ├── GPT-auxiliary-training-object.png │ │ │ │ ├── STLR-figure.png │ │ │ │ ├── ULM-FiT-STLR.png │ │ │ │ ├── biLM-ELMo.png │ │ │ │ ├── dataset-ULM-FiT.png │ │ │ │ ├── elmo-vectors.png │ │ │ │ ├── example-language-inference.jpg │ │ │ │ ├── 
example-of-auxiliary-prediction-taks.png │ │ │ │ ├── highlight-bert-LM1.png │ │ │ │ ├── highlight-bert-LM2.png │ │ │ │ ├── highlight-bert-LM3.png │ │ │ │ ├── highlight-bert-input.png │ │ │ │ ├── highway.png │ │ │ │ ├── highway2.png │ │ │ │ ├── how-bert-comes-out.png │ │ │ │ ├── input-of-bert.png │ │ │ │ ├── intro.png │ │ │ │ ├── task-specific-input-transformation.png │ │ │ │ └── transformer-block.png │ │ │ ├── learning-language-representation-slides.md │ │ │ └── learning-language-representation-slides.pdf │ │ └── x-former │ │ │ └── README.md │ ├── structured_state_space_models │ │ ├── README.md │ │ └── maba │ │ │ └── README.md │ └── vision │ │ ├── README.md │ │ ├── ResNeXt │ │ └── README.md │ │ ├── SSD │ │ ├── SSD.md │ │ └── images │ │ │ ├── SSD.png │ │ │ └── SSD2.png │ │ ├── SqueezeNet │ │ └── README.md │ │ └── Xception │ │ └── README.md ├── dl-systems │ ├── A_computational_model_for_TensorFlow.md │ ├── AutoGraph.md │ ├── Beyond_Data_and_Model_Parallelism_for_Deep_Neural_Networks.md │ ├── Cavs_An_Efficient_Runtime_System_for_Dynamic_Neural_Networks.md │ ├── JANUS.md │ ├── JAX.md │ ├── Machine_Learning_Systems_are_Stuck_in_a_Rut.md │ ├── Pydron.md │ ├── TensorFlow_Eager.md │ ├── images │ │ ├── Pydron.png │ │ ├── SSA_translation.png │ │ ├── functioin_call_translation.png │ │ ├── tf_eager_01.png │ │ ├── tf_eager_02.png │ │ └── tf_eager_03.png │ └── tf-cfg-design │ │ ├── Deep_learning_with_dynamic_computation_graphs.md │ │ ├── Dynamic_Control_Flow_in_Large-Scale_Machine_Learning.md │ │ └── tf-while-op-impl.md ├── dl-workload-optimizations │ ├── DeepCPU.md │ ├── Optimizing_RNN_performance │ │ ├── Optimizing_RNN_performance.md │ │ ├── Optimizing_RNN_performance.pdf │ │ └── images │ │ │ ├── multiple_layer_optimization.png │ │ │ ├── pic1.png │ │ │ ├── pic2.png │ │ │ ├── single_cell_optimization.png │ │ │ ├── single_layer_optimization.png │ │ │ └── starting_point.png │ ├── README.md │ └── ShuffleNet_v2.md ├── generalization-of-neural-network │ ├── README.md │ ├── rendered 
│ │ ├── A_Bayesian_Perspective_on_Generalization_and_Stochastic_Gradient_Descent.pdf │ │ ├── Bayesian_Model_Comparison.pdf │ │ ├── Highly_Scalable_Deep_Learning_Training_System_with_Mixed-Precision.pdf │ │ ├── Large_Batch_Training_of_Convolutional_Networks.pdf │ │ ├── On_Large-Batch_Training_for_Deep_Learning.pdf │ │ └── Train_Longer_Generalize_Better.pdf │ └── sources │ │ ├── A_Bayesian_Perspective_on_Generalization_and_Stochastic_Gradient_Descent.md │ │ ├── Accurate_Large_Minibatch_SGD.md │ │ ├── Bayesian_Model_Comparison.md │ │ ├── Highly_Scalable_Deep_Learning_Training_System_with_Mixed-Precision.md │ │ ├── Large_Batch_Training_of_Convolutional_Networks.md │ │ ├── On_Large-Batch_Training_for_Deep_Learning.md │ │ ├── Train_Longer_Generalize_Better.md │ │ └── images │ │ ├── f1.png │ │ ├── f2.png │ │ ├── fig1.png │ │ ├── figure1.png │ │ ├── figure2.png │ │ ├── insert_bn_after_pool5.png │ │ ├── mixed_precision_with_LARS.png │ │ ├── network_configuration.png │ │ ├── scalability.png │ │ ├── sharpness_metric.png │ │ ├── sharpness_of_minimizers_1.png │ │ ├── sharpness_of_minimizers_2.png │ │ └── warmup_experiments.png ├── large-language-models │ ├── GPT │ │ ├── GPT-models.md │ │ ├── README.md │ │ └── figures │ │ │ ├── Full_GPT_architecture.png │ │ │ └── GPT-3-model-size.png │ ├── README.md │ ├── fast-attention │ │ ├── Flash-Attention.pdf │ │ └── README.md │ ├── llm_inference.pdf │ ├── transformer-optimizations │ │ ├── FlexGen.md │ │ ├── README.md │ │ ├── RMS_layernorm.md │ │ ├── figures │ │ │ ├── block-schedule-with-overlap.png │ │ │ ├── computeation-graph-of-llm-inference.png │ │ │ ├── pre-post-layer-normalization-in-transformer.png │ │ │ └── two-different-schedules.png │ │ └── welford_algorithm_and_layer_norm.md │ └── whisper │ │ ├── README.md │ │ └── figures │ │ ├── whisper-model-size.png │ │ └── whisper_overview.png ├── leading-edge-ai │ ├── Capsules │ │ ├── Dynamic_Routing_between_Capsule.md │ │ └── README.md │ ├── README.md │ └── RIM.md ├── miscellanea │ ├── 
Neural_Ordinary_Differential_Equations.md │ └── README.md ├── ml-with-discrete-variables │ ├── README.md │ └── Straight-throughEstimator.md ├── normalization-in-NN │ ├── L2_Regularization_versus_Batch_and_Weight_Normalization │ │ └── L2_Regularization_versus_Batch_and_Weight_Normalization.md │ ├── Layer_Normalization │ │ ├── layer_normalization.md │ │ └── layer_normalization.pdf │ ├── README.md │ ├── Weight_Normalization │ │ ├── weight_normalization.md │ │ └── weight_normalization.pdf │ └── optimization │ │ ├── Hessian_and_DeepLearning_Optimizaiton.md │ │ └── Hessian_and_DeepLearning_Optimizaiton.pdf ├── parallel-computing │ ├── IRs │ │ ├── lift.md │ │ └── nova.md │ ├── Nesl │ │ ├── VCODE.md │ │ ├── images │ │ │ └── nesl-1.png │ │ └── nesl.md │ ├── README.md │ ├── data_parallel_language │ │ └── README.md │ ├── execution_model │ │ ├── ActorModel.md │ │ ├── CSP.md │ │ └── MessagePassing.md │ └── programming_model │ │ ├── DMLL.md │ │ ├── README.md │ │ ├── collection_orientated_languages.md │ │ └── images │ │ ├── DMLL-comparison.png │ │ ├── co-ori-lang-1.png │ │ ├── mimd.gif │ │ ├── simd.gif │ │ └── vcode-instruction.png ├── partial_aggregation │ ├── README.md │ └── figures │ │ ├── decomposable_function_1.png │ │ ├── decomposable_function_2.png │ │ ├── decomposable_function_3.png │ │ ├── decomposable_function_4.png │ │ ├── execution_plan_1.png │ │ └── execution_plan_2.png ├── programming-language │ ├── About_programming_language.md │ ├── Glossary │ │ ├── README.md │ │ ├── basic_concepts.md │ │ ├── images │ │ │ └── 1920px-Tree_edges.svg.png │ │ ├── program_analysis.md │ │ ├── programming_paradigm.md │ │ └── type.md │ ├── Nominative_and_structure_type.md │ ├── PL_Design.md │ ├── ProgrammingParadigms.md │ ├── README.md │ ├── SSA │ │ ├── README.md │ │ ├── SSA.md │ │ ├── Simple_and_Efficient_Construction_of_Static_Single_Assignment_Form.md │ │ └── images │ │ │ ├── SSA_example1.1.png │ │ │ ├── SSA_example1.2.png │ │ │ └── SSA_example1.3.png │ ├── Types.md │ └── 
abstract_binding_tree.md ├── tensor_operations │ ├── README.md │ ├── concepts.tex │ ├── constructs.tex │ ├── images │ │ ├── mm_example.png │ │ ├── nested_tensorarray.png │ │ ├── tensor.png │ │ └── transformer.png │ ├── ir.tex │ ├── item_access.tex │ ├── main.pdf │ ├── main.tex │ ├── nn.tex │ ├── optimization.tex │ ├── shape_operation.tex │ ├── structure.tex │ ├── tensorarray_creation.tex │ ├── transformer.tex │ └── vectorization.tex └── type-systems │ ├── README.md │ ├── basic-concepts.md │ └── notations │ ├── README.md │ ├── bussproofs.sty │ ├── contents │ └── kinding.tex │ ├── formal-grammar.sty │ ├── main.pdf │ ├── main.tex │ └── structure.tex ├── reinforcement_learning ├── README.md └── basic_concepts │ ├── basic_concepts_about_reinforcement_learning.pdf │ └── basic_concepts_about_reinforcement_learning.ppt ├── text_generation_for_gitchat ├── README.md ├── pic │ ├── pic1.one-hot.png │ ├── pic10.highway.png │ ├── pic11.generate_text_from_language_model.png │ ├── pic12.encoder_decoder.png │ ├── pic13.neural_turing_machine.png │ ├── pic2.word_embedding.png │ ├── pic3.rnn.png │ ├── pic4.bp_through_all_nodes.png │ ├── pic5.bp_through_shortcut.png │ ├── pic6.lstm.png │ ├── pic7.gru.png │ ├── pic8.deep_rnn.png │ └── pic9.residual_block.png ├── text_generation.pdf └── trans_2_html.sh └── tiled_efficient_attention ├── README.md ├── README.pdf ├── figures ├── cal_p.png ├── chunk_form_parallelism.png ├── chunk_recurrent.png ├── first_kv.png ├── fused_chunk_gla_fwd_kernel.png ├── fwd_decay_cumsum.png ├── fwd_inner_chunk.png ├── gated_linear_attention.pptx ├── gated_linear_attention_layer.png ├── gla_data_accessed.png ├── gla_equation.png ├── last_decay.png └── ~$gated_linear_attention.pptx ├── main.py └── model ├── __init__.py ├── chunk.py ├── chunk_fuse.py ├── chunk_util.py ├── configuration.py ├── gla.py ├── naive.py ├── recurrent_fuse.py └── utils.py /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | - repo: 
https://github.com/pre-commit/mirrors-yapf.git 2 | sha: v0.16.0 3 | hooks: 4 | - id: yapf 5 | files: \.py$ 6 | - repo: https://github.com/pre-commit/pre-commit-hooks 7 | sha: a11d9314b22d8f8c7556443875b731ef05965464 8 | hooks: 9 | - id: check-merge-conflict 10 | - id: check-symlinks 11 | - id: detect-private-key 12 | files: (?!.*paddle)^.*$ 13 | - id: end-of-file-fixer 14 | files: \.md$ 15 | - id: trailing-whitespace 16 | files: \.md$ 17 | - repo: https://github.com/Lucas-C/pre-commit-hooks 18 | sha: v1.0.1 19 | hooks: 20 | - id: forbid-crlf 21 | files: \.md$ 22 | - id: remove-crlf 23 | files: \.md$ 24 | - id: forbid-tabs 25 | files: \.md$ 26 | - id: remove-tabs 27 | files: \.md$ 28 | -------------------------------------------------------------------------------- /.style.yapf: -------------------------------------------------------------------------------- 1 | [style] 2 | based_on_style = pep8 3 | column_limit = 80 4 | -------------------------------------------------------------------------------- /Fluid/developer's_guide_for_Fluid/Developer's_Guide_to_Paddle_Fluid.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/Fluid/developer's_guide_for_Fluid/Developer's_Guide_to_Paddle_Fluid.pdf -------------------------------------------------------------------------------- /Fluid/developer's_guide_for_Fluid/images/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/Fluid/developer's_guide_for_Fluid/images/1.png -------------------------------------------------------------------------------- /Fluid/developer's_guide_for_Fluid/images/2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/Fluid/developer's_guide_for_Fluid/images/2.png -------------------------------------------------------------------------------- /Fluid/developer's_guide_for_Fluid/images/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/Fluid/developer's_guide_for_Fluid/images/3.png -------------------------------------------------------------------------------- /Fluid/developer's_guide_for_Fluid/images/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/Fluid/developer's_guide_for_Fluid/images/4.png -------------------------------------------------------------------------------- /Fluid/developer's_guide_for_Fluid/images/LoDTensor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/Fluid/developer's_guide_for_Fluid/images/LoDTensor.png -------------------------------------------------------------------------------- /Fluid/developer's_guide_for_Fluid/images/compile_run_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/Fluid/developer's_guide_for_Fluid/images/compile_run_time.png -------------------------------------------------------------------------------- /Fluid/developer's_guide_for_Fluid/images/executor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/Fluid/developer's_guide_for_Fluid/images/executor.png 
-------------------------------------------------------------------------------- /Fluid/developer's_guide_for_Fluid/images/fluid-compiler.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/Fluid/developer's_guide_for_Fluid/images/fluid-compiler.png -------------------------------------------------------------------------------- /Fluid/developer's_guide_for_Fluid/images/fluid_examples.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/Fluid/developer's_guide_for_Fluid/images/fluid_examples.png -------------------------------------------------------------------------------- /Fluid/developer's_guide_for_Fluid/images/fluid_module_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/Fluid/developer's_guide_for_Fluid/images/fluid_module_1.png -------------------------------------------------------------------------------- /Fluid/developer's_guide_for_Fluid/images/fluid_module_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/Fluid/developer's_guide_for_Fluid/images/fluid_module_2.png -------------------------------------------------------------------------------- /Fluid/developer's_guide_for_Fluid/images/graph_construction_example_all.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/Fluid/developer's_guide_for_Fluid/images/graph_construction_example_all.png 
-------------------------------------------------------------------------------- /Fluid/developer's_guide_for_Fluid/images/layer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/Fluid/developer's_guide_for_Fluid/images/layer.png -------------------------------------------------------------------------------- /Fluid/developer's_guide_for_Fluid/images/operator1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/Fluid/developer's_guide_for_Fluid/images/operator1.png -------------------------------------------------------------------------------- /Fluid/developer's_guide_for_Fluid/images/operator2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/Fluid/developer's_guide_for_Fluid/images/operator2.png -------------------------------------------------------------------------------- /Fluid/developer's_guide_for_Fluid/images/place.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/Fluid/developer's_guide_for_Fluid/images/place.png -------------------------------------------------------------------------------- /Fluid/developer's_guide_for_Fluid/images/print_fluid_program.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/Fluid/developer's_guide_for_Fluid/images/print_fluid_program.png -------------------------------------------------------------------------------- 
/Fluid/developer's_guide_for_Fluid/images/program_desc1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/Fluid/developer's_guide_for_Fluid/images/program_desc1.png -------------------------------------------------------------------------------- /Fluid/developer's_guide_for_Fluid/images/program_desc2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/Fluid/developer's_guide_for_Fluid/images/program_desc2.png -------------------------------------------------------------------------------- /Fluid/developer's_guide_for_Fluid/images/raw_input.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/Fluid/developer's_guide_for_Fluid/images/raw_input.png -------------------------------------------------------------------------------- /Fluid/developer's_guide_for_Fluid/images/scope_variable_tensor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/Fluid/developer's_guide_for_Fluid/images/scope_variable_tensor.png -------------------------------------------------------------------------------- /Fluid/developer's_guide_for_Fluid/images/sorted_input.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/Fluid/developer's_guide_for_Fluid/images/sorted_input.png -------------------------------------------------------------------------------- /Fluid/developer's_guide_for_Fluid/images/transpiler.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/Fluid/developer's_guide_for_Fluid/images/transpiler.png -------------------------------------------------------------------------------- /Fluid/developer's_guide_for_Fluid/images/user_interface.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/Fluid/developer's_guide_for_Fluid/images/user_interface.png -------------------------------------------------------------------------------- /Fluid/nmt_on_fluid/images/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/Fluid/nmt_on_fluid/images/1.png -------------------------------------------------------------------------------- /Fluid/nmt_on_fluid/images/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/Fluid/nmt_on_fluid/images/2.png -------------------------------------------------------------------------------- /Fluid/nmt_on_fluid/images/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/Fluid/nmt_on_fluid/images/3.png -------------------------------------------------------------------------------- /Fluid/nmt_on_fluid/images/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/Fluid/nmt_on_fluid/images/4.png 
-------------------------------------------------------------------------------- /Fluid/nmt_on_fluid/images/attention.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/Fluid/nmt_on_fluid/images/attention.png -------------------------------------------------------------------------------- /Fluid/nmt_on_fluid/images/raw_input.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/Fluid/nmt_on_fluid/images/raw_input.png -------------------------------------------------------------------------------- /Fluid/nmt_on_fluid/images/sorted_input.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/Fluid/nmt_on_fluid/images/sorted_input.png -------------------------------------------------------------------------------- /Fluid/nmt_on_fluid/images/user_interface.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/Fluid/nmt_on_fluid/images/user_interface.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | My learning notes. 
2 | -------------------------------------------------------------------------------- /TeXNotes/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/.gitignore -------------------------------------------------------------------------------- /TeXNotes/00_templates/assignment/contents.tex: -------------------------------------------------------------------------------- 1 | 从global向shared memory传输数据和从shared memory向register file传输数据总觉得有某种微妙的不同。 2 | 3 | 我们首先规定\colorbox{hl}{text}“拷贝”:是将数据$D$ 从place $A$完全地移动到place $B$存储为$D'$。$D'$可以是$D$的一个permutation,也就是说$D'$中的元素和$D$中的元素一一对应,元素数目不变,而顺序可以不相同。 4 | 5 | global memory是外存,shared memory,cache和RF是片上存储,后者的capacity总是小于前者。 6 | 7 | 给定一个AccessMap将其完全地翻译成实现,不是一个单纯的copy macro kernel问题。这里面涉及了(1)分数据(根据capacity决定一次空间执行的数据块的大小);(2)拷贝;(3)时间上重复执行的执行顺序问题; 8 | 9 | 非常朴素地想,计算过程涉及到数据和多线程两种要素。对于数据,我们总是提供逻辑和物理两种视角: 10 | \begin{enumerate} 11 | \setlength{\itemsep}{-0.1cm} 12 | \item high-dimensional array-like的\colorbox{hl}{逻辑视角};能够用\textcolor{blue}{高维逻辑indices寻址},逻辑视角能够改善可编程性,并且隔离与hardware强相关的实现选择问题 13 | \item \colorbox{hl}{物理视角} 14 | \item Layout是logical high-dimensional indices和物理寻址之间的映射函数 15 | \end{enumerate} 16 | 17 | \begin{figure}[h] 18 | \centering 19 | \includegraphics[width=0.8\textwidth]{figures/shared_2_rf_with_ldmatrix.pdf} 20 | \caption{使用ldmatrix指令从shared memory向register file加载数据。} 21 | \end{figure} 22 | 23 | 使用ldmatrix从shared memory加载数据到每个线程thread local的寄存器,warp中的每32线程构成一个 $2 \times 2$的线程tile,每个tile内部8线程, 24 | \textcolor{red}{调用ldmatrix的时候每个线程都需要传入一个shared memory指针},然后单线程读取shared memory中连续128 bits。ldmatrix一次执行最大读取 $16 \times 16$大小的半精度矩阵。 25 | 26 | 在实现中,每个线程都需要正确地计算出自己要读取的shared memory位置的指针偏移。 27 | 28 | ldmatrix的一次执行32个线程能一次性读取$16 \times 16$大小的$2D$ tile,一次执行单线程数据tile大小$1 \times 8$,如果将$n$次执行ldmatrix的结果都保留在thread local的寄存器上,单线程数据块的大小是$1 \times \left( n \times 8 \right)$。 29 | 30 | 
所以我们将目标的layout配置成$(1, n \times 8)$ 31 | 32 | \newpage 33 | 34 | \begin{enumerate} 35 | \setlength{\itemsep}{-0.1cm} 36 | \item 第一个嵌套层级:一个shared memory数据块要转化为$(m, n)$次对copy\_2d\_tile\_s2r在时间上的调用,要用for循环issue出去。每次调用处理一个小分块 $\mathcal{T}$。 37 | \item 第二个嵌套层级:一个copy\_2d\_tile\_s2r 38 | \end{enumerate} -------------------------------------------------------------------------------- /TeXNotes/00_templates/assignment/figures/figure1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/00_templates/assignment/figures/figure1.png -------------------------------------------------------------------------------- /TeXNotes/00_templates/assignment/references.bib: -------------------------------------------------------------------------------- 1 | @article{lamport1974parallel, 2 | title={The parallel execution of do loops}, 3 | author={Lamport, Leslie}, 4 | journal={Communications of the ACM}, 5 | volume={17}, 6 | number={2}, 7 | pages={83--93}, 8 | year={1974}, 9 | publisher={ACM New York, NY, USA} 10 | } 11 | -------------------------------------------------------------------------------- /TeXNotes/00_templates/slides/slides.tex: -------------------------------------------------------------------------------- 1 | \documentclass {beamer} 2 | 3 | \input {structure.tex} 4 | 5 | \title[] {Ying's Slides} 6 | \subtitle{A short story} 7 | \author {Ying Cao} 8 | \institute {} 9 | \date {\today} 10 | 11 | \begin {document} 12 | 13 | \AtBeginSection[] 14 | { 15 | \begin{frame} 16 | \frametitle{Table of Contents} 17 | \tableofcontents[currentsection] 18 | \end{frame} 19 | } 20 | 21 | \begin {frame} 22 | \titlepage 23 | \end {frame} 24 | 25 | \section {Section 1} 26 | 27 | \begin {frame} {Title frame 1} 28 | 29 | In this slide, some important text will be 30 | \alert{highlighted} because it's important. 31 | Please, don't abuse it. 
32 | 33 | \begin {block} {Block blue} 34 | \begin {itemize} 35 | \item Item 1 36 | \item Item 2 37 | \item Item 3 38 | \end {itemize} 39 | \end {block} 40 | \end {frame} 41 | 42 | \section {Section 2} 43 | \begin {frame} {Title Frame 2} 44 | \[\ support(X \to Y) = p(X \cup Y) = \frac { {n(X \cup Y)}} {N}\] 45 | \end {frame} 46 | 47 | \end {document} -------------------------------------------------------------------------------- /TeXNotes/Efficient_attention_and_RNNs/contents/miscellany.tex: -------------------------------------------------------------------------------- 1 | 若存在可逆矩阵$P$,使得一个关于矩阵$A$的如下等式成立: 2 | $$A = PDP^{-1}$$ 3 | 4 | 则称符合这样关系的矩阵$A$与$D$是相似矩阵,记作:$A \sim D$,则$A$的幂可以通过求矩阵$D$的幂求得 5 | 6 | $$A^{m} = (PDP^{-1})^{m} = (PDP^{-1})(PDP^{-1})\dots(PDP^{-1})=PD^{m}P^{-1}$$ 7 | 8 | \textcolor{red}{如果我们能够得出$D$是一个很简单的矩阵,例如对角矩阵,那么就可以很简单的计算出$A$的幂值}。 9 | 然而,一般的矩阵在实数域不一定能对角化,但几乎所有矩阵都能在复数域对角化\cite{lru-kexue}。 10 | 于是$A$总能写成: 11 | 12 | \begin{align*} 13 | A=P\Lambda P^{-1}, \quad & A^{m} = P\Lambda^{m} P^{-1} 14 | \end{align*} -------------------------------------------------------------------------------- /TeXNotes/Efficient_attention_and_RNNs/contents/parallel_rnn.tex: -------------------------------------------------------------------------------- 1 | \subsection{典型代表} 2 | 3 | \subsubsection{RWKV\cite{peng2023rwkv}} 4 | 5 | \subsubsection{LRU(Linear Recurrent Unit)\cite{orvieto2023resurrecting}} -------------------------------------------------------------------------------- /TeXNotes/Efficient_attention_and_RNNs/figures/SSM-overview.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/Efficient_attention_and_RNNs/figures/SSM-overview.pdf -------------------------------------------------------------------------------- /TeXNotes/Efficient_attention_and_RNNs/figures/attention-train.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/Efficient_attention_and_RNNs/figures/attention-train.pdf -------------------------------------------------------------------------------- /TeXNotes/Efficient_attention_and_RNNs/figures/attention.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/Efficient_attention_and_RNNs/figures/attention.pdf -------------------------------------------------------------------------------- /TeXNotes/Efficient_attention_and_RNNs/figures/cond_branchs.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/Efficient_attention_and_RNNs/figures/cond_branchs.pdf -------------------------------------------------------------------------------- /TeXNotes/Efficient_attention_and_RNNs/figures/figures.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/Efficient_attention_and_RNNs/figures/figures.pptx -------------------------------------------------------------------------------- /TeXNotes/Efficient_attention_and_RNNs/figures/mamba-mixer.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/Efficient_attention_and_RNNs/figures/mamba-mixer.pdf -------------------------------------------------------------------------------- /TeXNotes/Efficient_attention_and_RNNs/figures/mamba-model.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/Efficient_attention_and_RNNs/figures/mamba-model.pdf -------------------------------------------------------------------------------- /TeXNotes/Efficient_attention_and_RNNs/figures/mamba-ssm.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/Efficient_attention_and_RNNs/figures/mamba-ssm.pdf -------------------------------------------------------------------------------- /TeXNotes/Efficient_attention_and_RNNs/figures/rnn_layer1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/Efficient_attention_and_RNNs/figures/rnn_layer1.pdf -------------------------------------------------------------------------------- /TeXNotes/Efficient_attention_and_RNNs/figures/rnn_layer2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/Efficient_attention_and_RNNs/figures/rnn_layer2.pdf -------------------------------------------------------------------------------- /TeXNotes/Efficient_attention_and_RNNs/figures/scan_step.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/Efficient_attention_and_RNNs/figures/scan_step.pdf -------------------------------------------------------------------------------- /TeXNotes/Efficient_attention_and_RNNs/figures/signal_flow_structure_of_stacked_rnn.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/Efficient_attention_and_RNNs/figures/signal_flow_structure_of_stacked_rnn.pdf -------------------------------------------------------------------------------- /TeXNotes/Efficient_attention_and_RNNs/figures/stacked_rnns1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/Efficient_attention_and_RNNs/figures/stacked_rnns1.pdf -------------------------------------------------------------------------------- /TeXNotes/Efficient_attention_and_RNNs/figures/stacked_rnns2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/Efficient_attention_and_RNNs/figures/stacked_rnns2.pdf -------------------------------------------------------------------------------- /TeXNotes/Efficient_attention_and_RNNs/main.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/Efficient_attention_and_RNNs/main.pdf -------------------------------------------------------------------------------- /TeXNotes/Efficient_attention_and_RNNs/main.tex: -------------------------------------------------------------------------------- 1 | %!TEX program = xelatex 2 | \documentclass[UTF8]{ctexart} 3 | 4 | \input{structure.tex} 5 | \input{define_language.tex} 6 | 7 | \title{RNNs and scan} 8 | 9 | \begin{document} 10 | 11 | \tableofcontents 12 | \thispagestyle{empty} 13 | \newpage 14 | \setcounter{page}{1} 15 | 16 | \bibliographystyle{plain} 17 | 18 | \noindent 19 | \linespread{1.2} 20 | \selectfont 21 | 
\setlength{\topskip}{0ex} 22 | \setlength{\parskip}{1ex} 23 | \setlength{\lineskip}{1em} 24 | 25 | \section{Transformer和线性RNN} 26 | \input{contents/linear_rnn.tex} 27 | 28 | \section{SSM(state-space model)} 29 | \input{contents/ssm.tex} 30 | 31 | \newpage 32 | \section{并行RNN} 33 | \input{contents/parallel_rnn.tex} 34 | 35 | \newpage 36 | \section{General non-linear recurrence 的并行计算问题} 37 | \input{contents/stacked_rnns.tex} 38 | \input{contents/loop_bounds.tex} 39 | 40 | \begin{appendices} 41 | \input{contents/miscellany.tex} 42 | \end{appendices} 43 | 44 | \newpage 45 | \bibliography{references.bib} 46 | \end{document} 47 | -------------------------------------------------------------------------------- /TeXNotes/Flash_Attention/figures/attention_offset-a.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/Flash_Attention/figures/attention_offset-a.pdf -------------------------------------------------------------------------------- /TeXNotes/Flash_Attention/figures/attention_offset-b.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/Flash_Attention/figures/attention_offset-b.pdf -------------------------------------------------------------------------------- /TeXNotes/Flash_Attention/figures/fuse_reduce.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/Flash_Attention/figures/fuse_reduce.pdf -------------------------------------------------------------------------------- /TeXNotes/Flash_Attention/figures/logsoftmax_expression_tree.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/Flash_Attention/figures/logsoftmax_expression_tree.pdf -------------------------------------------------------------------------------- /TeXNotes/Flash_Attention/main.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/Flash_Attention/main.pdf -------------------------------------------------------------------------------- /TeXNotes/Flash_Attention/main.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | 3 | \input{structure.tex} % Include the file specifying the document structure and custom commands 4 | \title{Flash Attention} 5 | % \author{Ying Cao} 6 | % \date{\today} 7 | 8 | \begin{document} 9 | \bibliographystyle{plain} 10 | 11 | % \maketitle % Print the title 12 | % \tableofcontents 13 | 14 | \noindent 15 | \linespread{1.2} 16 | \selectfont 17 | \setlength{\topskip}{0ex} 18 | \setlength{\parskip}{1ex} 19 | \setlength{\lineskip}{1em} 20 | 21 | %--------------------------------------------------------------- 22 | % unnumbered section 23 | %--------------------------------------------------------------- 24 | 25 | \section{Flash Attention} 26 | \input{contents/flash_attention.tex} 27 | 28 | \newpage 29 | \section{Online Normalized Softmax} 30 | \input{contents/online_softmax.tex} 31 | 32 | \newpage 33 | \section{CTA Offset} 34 | \input{contents/CTA_offset.tex} 35 | 36 | \newpage 37 | \section{I/O Complexity Analysis} 38 | \input{contents/IO_complexity.tex} 39 | 40 | \newpage 41 | \section{Fuse Consecutive Aggregations} 42 | \input{contents/fuse_reduce.tex} 43 | 44 | \bibliography{references.bib} 45 | \end{document} 46 | -------------------------------------------------------------------------------- /TeXNotes/Flash_Attention/references.bib: 
-------------------------------------------------------------------------------- 1 | @article{DBLP:journals/corr/abs-2112-05682, 2 | author = {Markus N. Rabe and 3 | Charles Staats}, 4 | title = {\href{https://arxiv.org/pdf/2112.05682.pdf}{Self-attention Does Not Need O(n\({}^{\mbox{2}}\)) Memory}}, 5 | journal = {CoRR}, 6 | volume = {abs/2112.05682}, 7 | year = {2021}, 8 | url = {https://arxiv.org/abs/2112.05682}, 9 | eprinttype = {arXiv}, 10 | eprint = {2112.05682}, 11 | timestamp = {Tue, 14 Dec 2021 14:21:31 +0100}, 12 | biburl = {https://dblp.org/rec/journals/corr/abs-2112-05682.bib}, 13 | bibsource = {dblp computer science bibliography, https://dblp.org} 14 | } 15 | 16 | @misc{lse-trick, 17 | author = {Gregory Gundersen}, 18 | year = {2020}, 19 | howpublished = "\url{https://gregorygundersen.com/blog/2020/02/09/log-sum-exp/}", 20 | urldate = {February 9, 2020}, 21 | title = {The Log-Sum-Exp Trick}, 22 | note = "[Online; accessed 17-April-2023]" 23 | } -------------------------------------------------------------------------------- /TeXNotes/Formalize_Flash_Attention/FlashAttention_formalization.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/Formalize_Flash_Attention/FlashAttention_formalization.pdf -------------------------------------------------------------------------------- /TeXNotes/Formalize_Flash_Attention/figures/Transformer-block.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/Formalize_Flash_Attention/figures/Transformer-block.pdf -------------------------------------------------------------------------------- /TeXNotes/Formalize_Flash_Attention/figures/Transformer-block.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/Formalize_Flash_Attention/figures/Transformer-block.png -------------------------------------------------------------------------------- /TeXNotes/Formalize_Flash_Attention/figures/attention.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/Formalize_Flash_Attention/figures/attention.pptx -------------------------------------------------------------------------------- /TeXNotes/Formalize_Flash_Attention/figures/attention_expression_tree1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/Formalize_Flash_Attention/figures/attention_expression_tree1.pdf -------------------------------------------------------------------------------- /TeXNotes/Formalize_Flash_Attention/figures/attention_expression_tree2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/Formalize_Flash_Attention/figures/attention_expression_tree2.pdf -------------------------------------------------------------------------------- /TeXNotes/Formalize_Flash_Attention/figures/attention_expression_tree3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/Formalize_Flash_Attention/figures/attention_expression_tree3.pdf -------------------------------------------------------------------------------- /TeXNotes/Formalize_Flash_Attention/figures/fused_mha.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/Formalize_Flash_Attention/figures/fused_mha.pdf -------------------------------------------------------------------------------- /TeXNotes/Formalize_Flash_Attention/figures/fused_transformer_block.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/Formalize_Flash_Attention/figures/fused_transformer_block.pdf -------------------------------------------------------------------------------- /TeXNotes/Formalize_Flash_Attention/figures/map_and_aggregate.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/Formalize_Flash_Attention/figures/map_and_aggregate.pdf -------------------------------------------------------------------------------- /TeXNotes/Formalize_Flash_Attention/figures/multi-scale-attn1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/Formalize_Flash_Attention/figures/multi-scale-attn1.pdf -------------------------------------------------------------------------------- /TeXNotes/Formalize_Flash_Attention/figures/multi-scale-attn2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/Formalize_Flash_Attention/figures/multi-scale-attn2.pdf -------------------------------------------------------------------------------- /TeXNotes/Formalize_Flash_Attention/figures/transformer.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/Formalize_Flash_Attention/figures/transformer.png -------------------------------------------------------------------------------- /TeXNotes/Formalize_Flash_Attention/main.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/Formalize_Flash_Attention/main.pdf -------------------------------------------------------------------------------- /TeXNotes/Formalize_Flash_Attention/main.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | 3 | \input{structure.tex} % Include the file specifying the document structure and custom commands 4 | \title{Flash Attention} 5 | % \author{Ying Cao} 6 | % \date{\today} 7 | 8 | \begin{document} 9 | \bibliographystyle{plain} 10 | 11 | \maketitle % Print the title 12 | \tableofcontents 13 | 14 | \clearpage 15 | \noindent 16 | \linespread{1.2} 17 | \selectfont 18 | \setlength{\topskip}{0ex} 19 | \setlength{\parskip}{1ex} 20 | \setlength{\lineskip}{1em} 21 | 22 | %--------------------------------------------------------------- 23 | % unnumbered section 24 | %--------------------------------------------------------------- 25 | 26 | \noindent $::$ is read as "has type". 27 | 28 | \noindent $\rightarrow$ is read as "maps to". 
29 | 30 | \section{Background: The Computational Process of Reduce and Map}\label{sec1} 31 | \input{contents/background.tex} 32 | 33 | \section{A Generalized \textit{Broadcast-and-then-Aggregate} Operation} 34 | \input{contents/map_and_then_aggreage.tex} 35 | 36 | \section{Block Execution of a Chain of \textit{Broadcast-and-then-Aggregate}} 37 | \input{contents/fused_chained_map_and_then_aggregate.tex} 38 | 39 | \section{The Transformer Block} 40 | \input{contents/beyond_flash_attention.tex} 41 | \clearpage 42 | \begin{appendices} 43 | \input{contents/welford_algorithm.tex} 44 | \input{contents/backward.tex} 45 | \end{appendices} 46 | 47 | \clearpage 48 | \bibliographystyle{abbrv} 49 | \bibliography{references.bib} 50 | 51 | \end{document} -------------------------------------------------------------------------------- /TeXNotes/LLM_inference/images/BERT-performance-breakdown.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/LLM_inference/images/BERT-performance-breakdown.pdf -------------------------------------------------------------------------------- /TeXNotes/LLM_inference/images/ILP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/LLM_inference/images/ILP.png -------------------------------------------------------------------------------- /TeXNotes/LLM_inference/images/LLM-inference-graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/LLM_inference/images/LLM-inference-graph.png -------------------------------------------------------------------------------- /TeXNotes/LLM_inference/images/LLM.xlsx: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/LLM_inference/images/LLM.xlsx -------------------------------------------------------------------------------- /TeXNotes/LLM_inference/images/MHA-variable-seq-length.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/LLM_inference/images/MHA-variable-seq-length.pdf -------------------------------------------------------------------------------- /TeXNotes/LLM_inference/images/SBI-GEMM.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/LLM_inference/images/SBI-GEMM.pdf -------------------------------------------------------------------------------- /TeXNotes/LLM_inference/images/block-schedule-algorithm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/LLM_inference/images/block-schedule-algorithm.png -------------------------------------------------------------------------------- /TeXNotes/LLM_inference/images/block-schedule.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/LLM_inference/images/block-schedule.pdf -------------------------------------------------------------------------------- /TeXNotes/LLM_inference/images/byte-transformer-overview2.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/LLM_inference/images/byte-transformer-overview2.pdf -------------------------------------------------------------------------------- /TeXNotes/LLM_inference/images/bytetransformer-padding-free-input-batch.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/LLM_inference/images/bytetransformer-padding-free-input-batch.pdf -------------------------------------------------------------------------------- /TeXNotes/LLM_inference/images/bytetransformer_overview.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/LLM_inference/images/bytetransformer_overview.pdf -------------------------------------------------------------------------------- /TeXNotes/LLM_inference/images/different-batch-size-in-pipeline-parallelism.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/LLM_inference/images/different-batch-size-in-pipeline-parallelism.png -------------------------------------------------------------------------------- /TeXNotes/LLM_inference/images/fused_transformer_block-deepspeed-inference.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/LLM_inference/images/fused_transformer_block-deepspeed-inference.pdf -------------------------------------------------------------------------------- /TeXNotes/LLM_inference/images/grouped-gemm.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/LLM_inference/images/grouped-gemm.pdf -------------------------------------------------------------------------------- /TeXNotes/LLM_inference/images/grouped-mha.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/LLM_inference/images/grouped-mha.pdf -------------------------------------------------------------------------------- /TeXNotes/LLM_inference/images/llm-inference.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/LLM_inference/images/llm-inference.pdf -------------------------------------------------------------------------------- /TeXNotes/LLM_inference/images/llm-inference.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/LLM_inference/images/llm-inference.pptx -------------------------------------------------------------------------------- /TeXNotes/LLM_inference/images/logo.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/LLM_inference/images/logo.jpeg -------------------------------------------------------------------------------- /TeXNotes/LLM_inference/images/memory-system.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/LLM_inference/images/memory-system.pdf -------------------------------------------------------------------------------- 
/TeXNotes/LLM_inference/images/pipeline-schedule-deep-speed-inference.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/LLM_inference/images/pipeline-schedule-deep-speed-inference.png -------------------------------------------------------------------------------- /TeXNotes/LLM_inference/images/two-stages-in-llm-inference.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/LLM_inference/images/two-stages-in-llm-inference.pdf -------------------------------------------------------------------------------- /TeXNotes/LLM_inference/images/variable-length-mha.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/LLM_inference/images/variable-length-mha.pdf -------------------------------------------------------------------------------- /TeXNotes/LLM_inference/images/zero-padding-algorithm.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/LLM_inference/images/zero-padding-algorithm.pdf -------------------------------------------------------------------------------- /TeXNotes/LLM_inference/llm_inference.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/LLM_inference/llm_inference.pdf -------------------------------------------------------------------------------- /TeXNotes/Parallel_Execution_of_DO_Loops/example.tex: 
-------------------------------------------------------------------------------- 1 | This is the implementation of Grid LSTM. 2 | 3 | \begin{lstlisting}[language=Python] 4 | # data parallelism: iterate over samples in a batch. 5 | for sample_id in range(0, batch_size, 1): 6 | x = src_array_batch[sample_id] 7 | y = trg_array_batch[sample_id] 8 | 9 | src_length = x.size()[0] 10 | trg_length = y.size()[0] 11 | 12 | # dim 1: stack Grid LSTM Cell to form depth. 13 | for d in range(0, depth, 1): 14 | # dim 2: iterate over source sequence length. 15 | for i in range(1, src_length + 1, 1): 16 | # dim 3: iterate over target sequence length. 17 | for j in range(1, trg_length + 1, 1): 18 | cell_x = cells[d][0] 19 | cell_y = cells[d][1] 20 | 21 | output_d = outputs[sample_id][d] 22 | 23 | if d == 0: 24 | x_t = x[i - 1, :].view(1, input_dim) 25 | y_t = y[j - 1, :].view(1, input_dim) 26 | else: 27 | x_t = outputs[sample_id][d - 1][i][j][0][0] 28 | y_t = outputs[sample_id][d - 1][i][j][1][0] 29 | states_x = output_d[i][j - 1][0] 30 | states_y = output_d[i - 1][j][1] 31 | 32 | h_x_prev, c_x_prev = states_x 33 | h_y_prev, c_y_prev = states_y 34 | 35 | h = torch.cat((h_x_prev, h_y_prev), dim=1) 36 | h_x, c_x = cell_x(x_t, (h, c_x_prev)) 37 | h_y, c_y = cell_y(y_t, (h, c_y_prev)) 38 | 39 | output_d[i][j][0].append(h_x) # hidden for direction x 40 | output_d[i][j][0].append(c_x) # cell for direction x 41 | 42 | output_d[i][j][1].append(h_y) # hidden for direction y 43 | output_d[i][j][1].append(c_y) # cell for direction y 44 | 45 | \end{lstlisting} 46 | -------------------------------------------------------------------------------- /TeXNotes/Parallel_Execution_of_DO_Loops/main.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/TeXNotes/Parallel_Execution_of_DO_Loops/main.pdf 
-------------------------------------------------------------------------------- /TeXNotes/Parallel_Execution_of_DO_Loops/main.tex: -------------------------------------------------------------------------------- 1 | \documentclass {article} 2 | 3 | \input{structure.tex} % Include the file specifying the document structure and custom commands 4 | \title{Note for 5 | \textit{\href{https://www.microsoft.com/en-us/research/wp-content/uploads/2016/12/The-Parallel-Execution-of-DO-Loops.pdf} 6 | {The Parallel Execution of DO Loops}}} 7 | \author{Ying Cao} 8 | \date{\today} 9 | 10 | \begin{document} 11 | 12 | \maketitle 13 | \tableofcontents 14 | 15 | \section{Basics} 16 | \input{basics.tex} 17 | 18 | \section{The hyperplane method} 19 | \input{hyperplane.tex} 20 | 21 | { 22 | \small 23 | \raggedright 24 | \bibliographystyle{ieeetr} 25 | % or, abbrv, acm, alpha, apalike, ieeetr, plain, siam, unsrt 26 | \begin{spacing}{1} 27 | \bibliography{references.bib} 28 | \end{spacing} 29 | } 30 | \end{document} 31 | -------------------------------------------------------------------------------- /TeXNotes/Parallel_Execution_of_DO_Loops/references.bib: -------------------------------------------------------------------------------- 1 | @article{lamport1974parallel, 2 | title={The parallel execution of do loops}, 3 | author={Lamport, Leslie}, 4 | journal={Communications of the ACM}, 5 | volume={17}, 6 | number={2}, 7 | pages={83--93}, 8 | year={1974}, 9 | publisher={ACM New York, NY, USA} 10 | } 11 | @Misc{EliBendersky, 12 | howpublished = {\url{https://eli.thegreenplace.net/2018/affine-transformations/}}, 13 | note = {Accessed February 4, 2020}, 14 | title = {Affine transformations}, 15 | author = {Eli Bendersky} 16 | } 17 | @Misc{Diophantinewiki, 18 | howpublished = {\url{https://en.wikipedia.org/wiki/Diophantine_equation}}, 19 | note = {Accessed February 4, 2020}, 20 | title = {Diophantine equation}, 21 | author = {} 22 | } 23 | @inproceedings{irigoin1988supernode, 24 | 
title={Supernode partitioning}, 25 | author={Irigoin, Fran{\c{c}}ois and Triolet, Remi}, 26 | booktitle={Proceedings of the 15th ACM SIGPLAN-SIGACT symposium on Principles of programming languages}, 27 | pages={319--329}, 28 | year={1988} 29 | } 30 | @article{wolf1991loop, 31 | title={A loop transformation theory and an algorithm to maximize parallelism}, 32 | author={Wolf, Michael E and Lam, Monica S}, 33 | journal={IEEE Transactions on Parallel \& Distributed Systems}, 34 | number={4}, 35 | pages={452--471}, 36 | year={1991}, 37 | publisher={IEEE} 38 | } 39 | @book{mordell1969diophantine, 40 | title={Diophantine equations}, 41 | author={Mordell, Louis Joel}, 42 | year={1969}, 43 | publisher={Academic Press} 44 | } 45 | -------------------------------------------------------------------------------- /code_reading/README.md: -------------------------------------------------------------------------------- 1 | [TBD] 2 | -------------------------------------------------------------------------------- /code_reading/pet/README.md: -------------------------------------------------------------------------------- 1 | - [Clang AST](clang_pulgin.md) 2 | - [basics](basics.md) 3 | - [extract SCoP](extract_scop.md) 4 | - [tree to SCoP](tree2scop.md) 5 | -------------------------------------------------------------------------------- /code_reading/pet/clang_pulgin.md: -------------------------------------------------------------------------------- 1 | See [this example](https://github.com/llvm/llvm-project/blob/master/clang/examples/PrintFunctionNames/PrintFunctionNames.cpp) to implement a Clang plugin. 2 | 3 | # References 4 | 5 | 1. [How to write RecursiveASTVisitor based ASTFrontendActions](https://clang.llvm.org/docs/RAVFrontendAction.html) 6 | 1. [Clang tutorial part 1: introduction](https://kevinaboos.wordpress.com/2013/07/23/clang-tutorial-part-i-introduction/) 7 | 1. 
[Clang Tutorial Part II: LibTooling Example](https://kevinaboos.wordpress.com/2013/07/23/clang-tutorial-part-ii-libtooling-example/) 8 | -------------------------------------------------------------------------------- /engineering_a_compiler/scanner.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | - [Scanner](#scanner) 4 | 5 | 6 | 7 | # Scanner 8 | -------------------------------------------------------------------------------- /julia_learning_notes/Basics/03_JPL_metaprogramming.md: -------------------------------------------------------------------------------- 1 | # [Metaprogramming](https://docs.julialang.org/en/release-0.4/manual/metaprogramming/) 2 | 3 | * Julia represents its own code as a data structure of the language itself. 4 | * allow sophisticated code generation without extra build steps 5 | * allow true Lisp-style macros _**operating at the level of abstract syntax trees**_. 6 | * powerful [reflection](https://en.wikipedia.org/wiki/Reflection_%28computer_programming%29) capabilities 7 | 8 | ## Program representation 9 | 10 | * every Julia program starts life as a string 11 | * parse (I understand this function as lexical analysis) each string into an object called an expression, represented by Julia's type `Expr`. 12 | * `Expr` objects contain three parts: 13 | 1. `Symbol` 14 | * In the context of an expression, symbols are used to indicate access to variables. 15 | * when an expression is evaluated, a symbol is replaced with the value bound to that symbol in the appropriate scope. 16 | 2. the expression arguments 17 | 3. the expression result type 18 | -------------------------------------------------------------------------------- /julia_learning_notes/Basics/Good_references.md: -------------------------------------------------------------------------------- 1 | 1. [The Julia Language Challenge](https://nextjournal.com/sdanisch/the-julia-challenge) 2 | 1. 
[The Julia Challenge in C++](https://medium.com/@wolfv/the-julia-challenge-in-c-21272d36c002) 3 | -------------------------------------------------------------------------------- /julia_learning_notes/Basics/Type_inference/README.md: -------------------------------------------------------------------------------- 1 | # Inference Convergence Algorithm 2 | 3 | 1. [Inference Convergence](https://juliacomputing.com/blog/2016/04/04/inference-convergence.html) 4 | 1. [Inference Convergence Algorithm in Julia - Revisited](https://juliacomputing.com/blog/2017/05/15/inference-converage2.html) 5 | 1. [Notes](Type_inference.pdf) 6 | -------------------------------------------------------------------------------- /julia_learning_notes/Basics/Type_inference/Type_inference.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/julia_learning_notes/Basics/Type_inference/Type_inference.pdf -------------------------------------------------------------------------------- /julia_learning_notes/Basics/WhyJulia/README.md: -------------------------------------------------------------------------------- 1 | 1. [Why Julia](https://ucidatascienceinitiative.github.io/IntroToJulia/Html/WhyJulia) 2 | 1. [Notes](whyJulia.pdf) 3 | 4 | --- 5 | 6 | ### What is [Type-stable](https://docs.julialang.org/en/v1.2-dev/manual/faq/#man-type-stability-1) 7 | 8 | _**The type of the output is predictable from the types of the inputs.**_ In particular, it means that the type of the output cannot vary depending on the values of the inputs. 
9 | 10 | The following code is type-unstable: 11 | 12 | ```julia 13 | function unstable(flag::Bool) 14 | if flag 15 | return 1 16 | else 17 | return 1.0 18 | end 19 | end 20 | ``` 21 | 22 | _**Julia can't predict, at compile time, the return type of a type-unstable function, which makes generating fast machine code difficult.**_ 23 | -------------------------------------------------------------------------------- /julia_learning_notes/Basics/WhyJulia/whyJulia.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/julia_learning_notes/Basics/WhyJulia/whyJulia.pdf -------------------------------------------------------------------------------- /julia_learning_notes/Basics/images/gpu_julia_packages.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/julia_learning_notes/Basics/images/gpu_julia_packages.png -------------------------------------------------------------------------------- /julia_learning_notes/README.md: -------------------------------------------------------------------------------- 1 | Learning notes on the Julia programming language. 
2 | -------------------------------------------------------------------------------- /julia_learning_notes/Zygote/code_snippets/hello_world_test.jl: -------------------------------------------------------------------------------- 1 | xs = [fill(1.1, 3), fill(2.2, 3)]; 2 | 3 | # using Pkg 4 | # Pkg.activate(".") 5 | 6 | # zero(x::Array{Float64,1}) = [zero(x) for x in x] 7 | # 8 | # function case1(xs) 9 | # h = xs[1][1] 10 | # # sum(h) 11 | # end 12 | # 13 | # Zygote.gradient(case1, xs) 14 | 15 | 16 | using Pkg 17 | Pkg.activate(".") 18 | using Zygote 19 | 20 | function case1(xs) 21 | h = xs[1] 22 | # for i in 2:length(xs) 23 | # h = h .* xs[i] 24 | # end 25 | sum(h) 26 | end 27 | @show case1(xs) 28 | 29 | Zygote.gradient(case1, xs) 30 | -------------------------------------------------------------------------------- /julia_learning_notes/experiments/README.md: -------------------------------------------------------------------------------- 1 | For testing only. The code is unoptimized and ugly. 2 | -------------------------------------------------------------------------------- /julia_learning_notes/experiments/lstm_test/README.md: -------------------------------------------------------------------------------- 1 | Extremely naive implementation. The code is not optimized; it is ugly and redundant. 2 | For testing ONLY. 3 | -------------------------------------------------------------------------------- /julia_learning_notes/experiments/lstm_test/naive_cpu_test/README.md: -------------------------------------------------------------------------------- 1 | The code has only been tested under Julia v0.7-beta. 
2 | 3 | - [`@code_lowered`](docs/code_lowered.txt) 4 | - [`@code_typed`](docs/code_typed.txt) 5 | - [`@code_llvm`](docs/code_llvm.txt) 6 | - [`@code_native`](docs/code_native.txt) 7 | -------------------------------------------------------------------------------- /julia_learning_notes/experiments/lstm_test/naive_cpu_test/Recurrent/common.jl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env julia 2 | 3 | abstract type NN 4 | end 5 | 6 | mutable struct Param 7 | n::Integer # input size 8 | d::Integer # output size 9 | w::AbstractArray{AbstractFloat} # learnable weight matrix 10 | dw::AbstractArray{AbstractFloat} # gradients of learnable weight matrix 11 | 12 | Param(n::Integer) = new(n, n, randn(n, n), randn(n, n)) 13 | Param(n::Integer, d::Integer) = new(n, d, randn(n, d), randn(n, d)) 14 | Param(n::Integer, d::Integer, w::Array, dw::Array) = new(n, d, w, dw) 15 | end 16 | 17 | randParam(n::Integer, d::Integer, std::Real=0.1) = Param( 18 | n, d, randn(n, d) * std, zeros(n, d)) 19 | onesParam(n::Integer, d::Integer) = Param(n, d, ones(n, d), zeros(n, d)) 20 | 21 | function softmax(m::Param) 22 | out = Param(m.n, m.d) 23 | maxval = maximum(m.w, 2) 24 | out.w .= exp.(m.w .- maxval) 25 | out.w ./= sum(out.w, 2) 26 | return out 27 | end 28 | 29 | σ(x) = 1.0 / (1.0 + exp(-x)) 30 | -------------------------------------------------------------------------------- /julia_learning_notes/experiments/lstm_test/naive_cpu_test/Recurrent/recurrent.jl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env julia 2 | module Recurrent 3 | 4 | export Param, randParam, onesParam, softmax, sigmoid 5 | export LSTMCell, LSTM_forward 6 | 7 | include("common.jl") 8 | include("lstm.jl") 9 | 10 | end # module 11 | -------------------------------------------------------------------------------- /julia_learning_notes/experiments/lstm_test/naive_cpu_test/cpu_test.jl: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env julia 2 | include("Recurrent/recurrent.jl") 3 | using .Recurrent 4 | 5 | srand(1) 6 | 7 | const batch_size = 2 8 | const seq_len = 3 9 | const input_dim = 4 10 | const hidden_dim = 4 11 | 12 | rand_inputs = randn(batch_size * seq_len, input_dim) 13 | 14 | lstm_cell = LSTMCell(input_dim, hidden_dim) 15 | cell_state, hidden_states = LSTM_forward(rand_inputs, lstm_cell, 16 | input_dim, hidden_dim, seq_len) 17 | 18 | println("cell states :") 19 | display(cell_state) 20 | 21 | println("\nhidden states :") 22 | display(hidden_states) 23 | -------------------------------------------------------------------------------- /julia_learning_notes/experiments/lstm_test/naive_gpu_test/Recurrent/recurrent.jl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env julia 2 | module Recurrent 3 | 4 | using CuArrays 5 | 6 | include("lstm.jl") 7 | 8 | export LSTMCell 9 | export σ, lstm_forward 10 | 11 | 12 | end # module 13 | -------------------------------------------------------------------------------- /julia_learning_notes/experiments/lstm_test/naive_gpu_test/gpu_test.jl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env julia 2 | include("Recurrent/recurrent.jl") 3 | 4 | using .Recurrent 5 | using CuArrays: CuArray 6 | 7 | srand(1) 8 | 9 | const batch_size = 512 10 | const seq_len = 10 11 | const input_dim = 512 12 | const hidden_dim = 512 13 | 14 | rand_inputs_d = CuArray(randn(batch_size * seq_len, input_dim)) 15 | lstm_cell = LSTMCell(input_dim, hidden_dim) 16 | 17 | 18 | for i = 1 : 10 19 | cell_states, hidden_states = lstm_forward(rand_inputs_d, lstm_cell, seq_len) 20 | 21 | # println("cell states : ") 22 | # display(cell_states) 23 | 24 | # println("hidden_states : ") 25 | # display(hidden_states) 26 | end 27 | 
-------------------------------------------------------------------------------- /paper_notes/Diffusion/README.md: -------------------------------------------------------------------------------- 1 | 2 | 1. "Controlling Text-to-Image Diffusion by Orthogonal Finetuning"[[PDF]](https://arxiv.org/pdf/2306.07280.pdf) 3 | 1. LoRA 4 | 1. ControlNet -------------------------------------------------------------------------------- /paper_notes/Template.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | - [Title](#title) 4 | - [My Takeaways and Some Thoughts](#my-takeaways-and-some-thoughts) 5 | - [Overall](#overall) 6 | - [Problem Proposed](#problem-proposed) 7 | - [Goal](#goal) 8 | - [Approach](#approach) 9 | - [Evaluation Methods in this Paper](#evaluation-methods-in-this-paper) 10 | - [Related Works Recommended for Further Reading](#related-works-recommended-for-further-reading) 11 | - [Detail](#detail) 12 | - [Challenges](#challenges) 13 | - [Proposed Solutions](#proposed-solutions) 14 | - [Reference](#reference) 15 | 16 | 17 | 18 | # Title 19 | 20 | [link](https://arxiv.org/pdf/1812.01329.pdf) 21 | 22 | ## My Takeaways and Some Thoughts 23 | 24 | ## Overall 25 | 26 | ### Problem Proposed 27 | 28 | ### Goal 29 | 30 | ### Approach 31 | 32 | ### Evaluation Methods in this Paper 33 | 34 | ## Related Works Recommended for Further Reading 35 | 36 | ## Detail 37 | 38 | ### Challenges 39 | 40 | ### Proposed Solutions 41 | 42 | ## Reference 43 | -------------------------------------------------------------------------------- /paper_notes/auto-diff/BP_and_implicit_function_theorem/README.md: -------------------------------------------------------------------------------- 1 | # Backpropagation is not Just the Chain-Rule 2 | 3 | ## Backprop and the Adjoint Method 4 | 5 | 6 | 7 | ## Reference 8 | 9 | 1. [Backprop is not just the chain rule](https://timvieira.github.io/blog/post/2017/08/18/backprop-is-not-just-the-chain-rule/) 10 | 1. 
[A new trick for calculating Jacobian vector products](https://j-towns.github.io/2017/06/12/A-new-trick.html) 11 | 1. [Mechanics of Lagrangians](http://www.argmin.net/2016/05/31/mechanics-of-lagrangians/) 12 | 1. [Mates of Costate](http://www.argmin.net/2016/05/18/mates-of-costate/) 13 | 1. [Black-box optimization](https://timvieira.github.io/blog/post/2018/03/16/black-box-optimization/) 14 | -------------------------------------------------------------------------------- /paper_notes/auto-diff/SCT_AD/README.md: -------------------------------------------------------------------------------- 1 | 1. [Tapenade](Tapenade.md) 2 | - Project webpage: https://www-sop.inria.fr/tropics/tapenade.html 3 | - Paper: [The Tapenade Automatic Differentiation tool: principles, model, and specification](https://hal.inria.fr/hal-00913983/document) 4 | - [Slides](http://www-sop.inria.fr/tropics/Laurent.Hascoet/slidesLesHouches.pdf) 5 | 1. [Myia](Automatic_Differentiation_in_Myia.md) 6 | - Github Project: https://github.com/mila-udem/myia 7 | - Paper: [Automatic Differentiation in Myia](https://openreview.net/pdf?id=S1hcluzAb) 8 | 1. [Tangent](Tagent.md) 9 | - Github Project: https://github.com/google/tangent 10 | - [Tangent: Source-to-Source Debuggable Derivatives](https://ai.googleblog.com/2017/11/tangent-source-to-source-debuggable.html) 11 | 1. [JAX](JAX.md) 12 | - Github Project: https://github.com/google/jax 13 | - [Compiling machine learning programs via high-level tracing](https://www.sysml.cc/doc/146.pdf) 14 | 1. [Zygote](Differentiating_SSA_form_program.md) 15 | - Github Project: https://github.com/FluxML/Zygote.jl 16 | - Paper: [Don't Unroll Adjoint: Differentiating SSA-Form Programs](https://arxiv.org/pdf/1810.07951.pdf) 17 | 18 | 1. [DLVM: A modern compiler infrastructure for deep learning systems with adjoint code generation in a domain-specific IR](https://arxiv.org/pdf/1711.03016.pdf) 19 | 20 | --- 21 | 22 | ### Some Related Research Work 23 | 24 | 1. 
[JANUS: Fast and Flexible Deep Learning via Symbolic Graph Execution of Imperative Programs](JANUS.md) 25 | -------------------------------------------------------------------------------- /paper_notes/auto-diff/images/expression_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/auto-diff/images/expression_graph.png -------------------------------------------------------------------------------- /paper_notes/auto-diff/images/mix_forward_and_reverse_mode_AD.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/auto-diff/images/mix_forward_and_reverse_mode_AD.png -------------------------------------------------------------------------------- /paper_notes/auto-diff/images/multidimensional_dual_numbers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/auto-diff/images/multidimensional_dual_numbers.png -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/CFG-optimizations/README.md: -------------------------------------------------------------------------------- 1 | # Reading List 2 | 3 | - [ ] [Optimizing control flow in loops using interval and dependence analysis](https://www.ics.uci.edu/~givargis/pubs/J18.pdf) 4 | - [ ] [Control Flow Analysis Dragon Book Section 8.4](http://www.cs.ecu.edu/karl/5220/spr16/Notes/Optimization/controlflow.html) 5 | - [ ] [Assignment 2: Control Flow Optimization](http://aggregate.org/OC/s18a2.html) 6 | - [ ] [Lecture 4: Control Flow Optimization, COS 598C - Advanced 
Compilers](https://www.cs.princeton.edu/courses/archive/spr04/cos598C/lectures/04-ControlFlow-3x1.pdf) 7 | -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/CFG-optimizations/images/discussion-on-cfg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/CFG-optimizations/images/discussion-on-cfg.png -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Code-optimizations/images/dmxpy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Code-optimizations/images/dmxpy.png -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Code-optimizations/images/dmxpy.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Code-optimizations/images/dmxpy.pptx -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Code-optimizations/images/excerpt-from-dmxpy-in-linpack.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Code-optimizations/images/excerpt-from-dmxpy-in-linpack.png -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Code-optimizations/images/manual-optimizations-for-dmxpy.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Code-optimizations/images/manual-optimizations-for-dmxpy.png -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Code-optimizations/images/simple-version-of-dmxpy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Code-optimizations/images/simple-version-of-dmxpy.png -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Code-optimizations/images/strength-reduction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Code-optimizations/images/strength-reduction.png -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Code-optimizations/local-optimizatioins.md: -------------------------------------------------------------------------------- 1 | # Local Optimizations 2 | 3 | Programmers will protest that they do not write code that contains redundant computations. In practice, redundancy elimination finds many opportunities. ***Translation from source code to IR elaborates many details, such as address calculations, and introduces redundant expressions***. 4 | 5 | ## Local Value Numbering (LVN) 6 | 7 | ***Local value numbering*** is one of the oldest and most powerful redundancy-elimination techniques. 
8 | -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Compiler_and_Interpreter.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Compiler_and_Interpreter.pdf -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Dependence_analysis/README.md: -------------------------------------------------------------------------------- 1 | TBD 2 | -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Dependence_analysis/dependence_abstraction/contents.tex: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Dependence_analysis/dependence_abstraction/contents.tex -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Dependence_analysis/dependence_abstraction/main.tex: -------------------------------------------------------------------------------- 1 | \documentclass {article} 2 | 3 | \input{structure.tex} % Include the file specifying the document structure and custom commands 4 | \title{Dependence Abstraction} 5 | \author{Ying Cao} 6 | \date{\today} 7 | 8 | \begin{document} 9 | 10 | \maketitle 11 | \tableofcontents 12 | \newpage 13 | \input{contents.tex} 14 | 15 | { 16 | \small 17 | \raggedright 18 | \bibliographystyle{ieeetr} 19 | % or, abbrv, acm, alpha, apalike, ieeetr, plain, siam, unsrt 20 | \begin{spacing}{1} 21 | \bibliography{references.bib} 22 | \end{spacing} 23 | } 24 | \end{document} 25 | -------------------------------------------------------------------------------- 
/paper_notes/compiler-stuffs/Dependence_analysis/dependence_abstraction/references.bib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Dependence_analysis/dependence_abstraction/references.bib -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Intermediate-Representations/images/AST-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Intermediate-Representations/images/AST-example.png -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Intermediate-Representations/images/DAG-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Intermediate-Representations/images/DAG-example.png -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Intermediate-Representations/images/IR-level-of-abstraction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Intermediate-Representations/images/IR-level-of-abstraction.png -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Intermediate-Representations/images/an-simple-example-of-dependency-graph.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Intermediate-Representations/images/an-simple-example-of-dependency-graph.png -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Intermediate-Representations/images/dependence-graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Intermediate-Representations/images/dependence-graph.png -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Intermediate-Representations/images/different-levels-of-abstraction-for-an-array-subscript-reference.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Intermediate-Representations/images/different-levels-of-abstraction-for-an-array-subscript-reference.png -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Intermediate-Representations/images/naming-leads-to-different-translations.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Intermediate-Representations/images/naming-leads-to-different-translations.png -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Intermediate-Representations/images/one-address-code.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Intermediate-Representations/images/one-address-code.png -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Intermediate-Representations/images/three-address-code.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Intermediate-Representations/images/three-address-code.png -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Loop_analysis/Denpendence_analysis/Basics/Terminology.md: -------------------------------------------------------------------------------- 1 | # Schedule 2 | 3 | # Polyhedra/Polyhedron 4 | 5 | # Dependence system 6 | 7 | # Lexicographic order 8 | 9 | If $R_1$ is a definition and $R_2$ a use, lexico-positive points of $\Pi$ correspond to data dependences from $S_1$ to $S_2$ (from write to read), while lexico-negative points correspond to anti dependences from $S_2$ to $S_1$ (from read to write). 
10 | 11 | # Legality of Unimodular Transformations 12 | 13 | # Full permutability 14 | 15 | # Wavefront transformation 16 | -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Loop_analysis/Denpendence_analysis/Basics/images/a_single_rnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Loop_analysis/Denpendence_analysis/Basics/images/a_single_rnn.png -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Loop_analysis/Denpendence_analysis/Basics/images/bi_directional_rnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Loop_analysis/Denpendence_analysis/Basics/images/bi_directional_rnn.png -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Loop_analysis/Denpendence_analysis/Basics/images/bi_directional_rnn.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Loop_analysis/Denpendence_analysis/Basics/images/bi_directional_rnn.pptx -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Loop_analysis/Denpendence_analysis/Basics/images/different_dependences.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Loop_analysis/Denpendence_analysis/Basics/images/different_dependences.png
-------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Loop_analysis/Denpendence_analysis/Basics/images/dilated_rnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Loop_analysis/Denpendence_analysis/Basics/images/dilated_rnn.png -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Loop_analysis/Denpendence_analysis/Basics/images/example-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Loop_analysis/Denpendence_analysis/Basics/images/example-01.png -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Loop_analysis/Denpendence_analysis/Basics/images/grid_rnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Loop_analysis/Denpendence_analysis/Basics/images/grid_rnn.png -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Loop_analysis/Denpendence_analysis/Basics/images/polyhedral_representation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Loop_analysis/Denpendence_analysis/Basics/images/polyhedral_representation.png -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Loop_analysis/Denpendence_analysis/Basics/images/stack_rnn.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Loop_analysis/Denpendence_analysis/Basics/images/stack_rnn.png -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Loop_analysis/Denpendence_analysis/Basics/images/wh.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Loop_analysis/Denpendence_analysis/Basics/images/wh.png -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Loop_analysis/Denpendence_analysis/Basics/images/workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Loop_analysis/Denpendence_analysis/Basics/images/workflow.png -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Loop_analysis/Denpendence_analysis/dependence_analysis.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Loop_analysis/Denpendence_analysis/dependence_analysis.pdf -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Loop_analysis/Polyhedral_representation_in_Pet/references.bib: -------------------------------------------------------------------------------- 1 | @article{lamport1974parallel, 2 | title={The parallel execution of do loops}, 3 | author={Lamport, Leslie}, 4 | journal={Communications of the ACM}, 5 | 
volume={17}, 6 | number={2}, 7 | pages={83--93}, 8 | year={1974}, 9 | publisher={ACM New York, NY, USA} 10 | } 11 | @article{wolf1991loop, 12 | title={A loop transformation theory and an algorithm to maximize parallelism}, 13 | author={Wolf, Michael E and Lam, Monica S}, 14 | journal={IEEE Transactions on Parallel \& Distributed Systems}, 15 | number={4}, 16 | pages={452--471}, 17 | year={1991}, 18 | publisher={IEEE} 19 | } 20 | @book{banerjee2013loop, 21 | title={Loop parallelization}, 22 | author={Banerjee, Utpal}, 23 | year={2013}, 24 | publisher={Springer Science \& Business Media} 25 | } 26 | -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Loop_analysis/Polyhedral_representation_in_Pet/section1.tex: -------------------------------------------------------------------------------- 1 | \begin{frame}{Definitions and Notations in isl} 2 | \begin{block}{Named Integer Tuples} 3 | A \textit{named integer tuple} consists of an identifier (name) and a sequence of integer values. 4 | The identifier may be omitted and the sequence of integers may have a zero length. 
5 | \end{block} 6 | 7 | \textbf{Notation}: 8 | 9 | \end{frame} 10 | -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Loop_analysis/Polyhedral_representation_in_Pet/slides.tex: -------------------------------------------------------------------------------- 1 | \documentclass[10pt,aspectratio=43,mathserif]{beamer} 2 | 3 | \input{structure.tex} 4 | 5 | \title[] {Represent a Polyhedral Model using \textit{isl}} 6 | \author{Ying Cao} 7 | \institute {} 8 | \date{\today} 9 | 10 | \begin {document} 11 | % \begin{changemargin}{0.2cm}{0.2cm} 12 | 13 | \begin {frame} 14 | \titlepage 15 | \end {frame} 16 | 17 | % \AtBeginSection[] 18 | % { 19 | % \begin{frame} 20 | % \frametitle{Outlines} 21 | % \tableofcontents[ 22 | % %currentsection, 23 | % % currentsubsection, 24 | % hideallsubsections, 25 | % sectionstyle= hide, % show 26 | % % subsectionstyle=shaded % show 27 | % ] 28 | % \end{frame} 29 | % } 30 | 31 | \section{Definitions and notations in isl} 32 | \input{section1.tex} 33 | 34 | % \section{Issues, Next Plans} 35 | % \input{plans.tex} 36 | 37 | \section{References} 38 | \begin{frame}[allowframebreaks]{References} 39 | \bibliographystyle{ieeetr} 40 | \bibliography{references.bib} 41 | \end{frame} 42 | 43 | % \end{changemargin} 44 | \end {document} 45 | -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Loop_analysis/The_parallel_execution_of_do_loops.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Loop_analysis/The_parallel_execution_of_do_loops.pdf -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Loop_analysis/auto-vectorization/README.md: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Loop_analysis/auto-vectorization/README.md -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Polyhedral_compilatioin/A_Performance_Vocabulary_for_Affine_Loop_Transformations/contents.tex: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Polyhedral_compilatioin/A_Performance_Vocabulary_for_Affine_Loop_Transformations/contents.tex -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Polyhedral_compilatioin/A_Performance_Vocabulary_for_Affine_Loop_Transformations/main.tex: -------------------------------------------------------------------------------- 1 | \documentclass {article} 2 | 3 | \input{structure.tex} % Include the file specifying the document structure and custom commands 4 | \title{Note for 5 | \textit{\href{https://arxiv.org/pdf/1811.06043.pdf}{A performance vocabulary for affine loop transformations}}} 6 | \author{Ying Cao} 7 | \date{\today} 8 | 9 | \begin{document} 10 | 11 | \maketitle 12 | \tableofcontents 13 | \newpage 14 | \input{contents.tex} 15 | 16 | { 17 | \small 18 | \raggedright 19 | \bibliographystyle{ieeetr} 20 | % or, abbrv, acm, alpha, apalike, ieeetr, plain, siam, unsrt 21 | \begin{spacing}{1} 22 | \bibliography{references.bib} 23 | \end{spacing} 24 | } 25 | \end{document} 26 | -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Polyhedral_compilatioin/A_Performance_Vocabulary_for_Affine_Loop_Transformations/references.bib: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Polyhedral_compilatioin/A_Performance_Vocabulary_for_Affine_Loop_Transformations/references.bib -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Polyhedral_compilatioin/Data_dependence_and_PIP/contents.tex: -------------------------------------------------------------------------------- 1 | \section{Formulation of the data dependence problem} 2 | 3 | \section{PIP and its output} 4 | 5 | \section{What can we do when we have complete and accurate data dependence?} 6 | -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Polyhedral_compilatioin/Data_dependence_and_PIP/main.tex: -------------------------------------------------------------------------------- 1 | \documentclass {article} 2 | 3 | \input{structure.tex} % Include the file specifying the document structure and custom commands 4 | \title{Data dependence and PIP} 5 | \author{Ying Cao} 6 | \date{\today} 7 | 8 | \begin{document} 9 | 10 | \maketitle 11 | \tableofcontents 12 | \newpage 13 | \input{contents.tex} 14 | 15 | { 16 | \small 17 | \raggedright 18 | \bibliographystyle{ieeetr} 19 | % or, abbrv, acm, alpha, apalike, ieeetr, plain, siam, unsrt 20 | \begin{spacing}{1} 21 | \bibliography{references.bib} 22 | \end{spacing} 23 | } 24 | \end{document} 25 | -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Polyhedral_compilatioin/Data_dependence_and_PIP/references.bib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Polyhedral_compilatioin/Data_dependence_and_PIP/references.bib -------------------------------------------------------------------------------- 
/paper_notes/compiler-stuffs/Polyhedral_compilatioin/Mathmatical_foundations/concepts.md: -------------------------------------------------------------------------------- 1 | # [Partially Ordered Set](https://en.wikipedia.org/wiki/Partially_ordered_set) 2 | 3 | A _**partially ordered set**_ (_**also poset**_) formalizes and generalizes the intuitive concept of an _ordering_, _sequencing_, or _arrangement_ of the elements of a set. A poset consists of _**a set together with a binary relation**_ indicating that, for certain pairs of elements in the set, one of the elements precedes the other in the ordering. The _**relation itself is called a "partial order"**_. 4 | 5 | The word partial in the names "partial order" and "partially ordered set" is used as an indication that _**not every pair of elements needs to be comparable**_. That is, there may be pairs of elements for which neither element precedes the other in the poset. Partial orders thus generalize total orders, in which every pair is comparable. 6 | 7 | # [Lattice](https://en.wikipedia.org/wiki/Lattice_(order)) 8 | 9 | A lattice consists of a partially ordered set in which every two elements have a unique supremum (also called a least upper bound or join) and a unique infimum (also called a greatest lower bound or meet). 10 | 11 | An example is given by the natural numbers, partially ordered by divisibility, for which the unique supremum is the least common multiple and the unique infimum is the greatest common divisor.
12 | -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Polyhedral_compilatioin/More_Legal_Transformations_for_Locality/images/SCoP_decomposition.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Polyhedral_compilatioin/More_Legal_Transformations_for_Locality/images/SCoP_decomposition.png -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Polyhedral_compilatioin/More_Legal_Transformations_for_Locality/images/a_skewing_transformation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Polyhedral_compilatioin/More_Legal_Transformations_for_Locality/images/a_skewing_transformation.png -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Polyhedral_compilatioin/More_Legal_Transformations_for_Locality/images/farkas_lemma.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Polyhedral_compilatioin/More_Legal_Transformations_for_Locality/images/farkas_lemma.png -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Polyhedral_compilatioin/More_Legal_Transformations_for_Locality/main.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Polyhedral_compilatioin/More_Legal_Transformations_for_Locality/main.pdf 
-------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Polyhedral_compilatioin/More_Legal_Transformations_for_Locality/main.tex: -------------------------------------------------------------------------------- 1 | \documentclass {article} 2 | 3 | \input{structure.tex} % Include the file specifying the document structure and custom commands 4 | \title{Note for 5 | \textit{\href{https://hal.inria.fr/inria-00001056/document}{More Legal Transformations for Locality}}} 6 | \author{Ying Cao} 7 | \date{\today} 8 | 9 | \begin{document} 10 | 11 | \maketitle 12 | \tableofcontents 13 | \newpage 14 | \input{contents.tex} 15 | 16 | { 17 | \small 18 | \raggedright 19 | \bibliographystyle{ieeetr} 20 | % or, abbrv, acm, alpha, apalike, ieeetr, plain, siam, unsrt 21 | \begin{spacing}{1} 22 | \bibliography{references.bib} 23 | \end{spacing} 24 | } 25 | \end{document} 26 | -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Polyhedral_compilatioin/More_Legal_Transformations_for_Locality/references.bib: -------------------------------------------------------------------------------- 1 | @Misc{Bas12, 2 | author={Bastoul, C\'{e}dric}, 3 | title={\textit{Contributions to High-Level Program Optimization}. {H}abilitation {T}hesis. 
{P}aris-{S}ud {U}niversity, {F}rance}, 4 | month=dec, 5 | year=2012, 6 | } 7 | 8 | @PhdThesis{TBas, 9 | author={Bastoul, C\'{e}dric}, 10 | title={Improving Data Locality in Static Control Programs}, 11 | school={University Paris 6, Pierre et Marie Curie, France}, 12 | month=dec, 13 | year=2004, 14 | } 15 | 16 | @article{xue1997tiling, 17 | title={\href{http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.26.9719&rep=rep1&type=pdf}{On tiling as a loop transformation}}, 18 | author={Xue, Jingling}, 19 | journal={Parallel Processing Letters}, 20 | volume={7}, 21 | number={04}, 22 | pages={409--424}, 23 | year={1997}, 24 | publisher={World Scientific} 25 | } 26 | @inproceedings{griebl1998code, 27 | title={\href{https://www.infosun.fim.uni-passau.de/publications/docs/GLW98pact.pdf}{Code generation in the polytope model}}, 28 | author={Griebl, Martin and Lengauer, Christian and Wetzel, Sabine}, 29 | booktitle={Proceedings. 1998 International Conference on Parallel Architectures and Compilation Techniques (Cat. No. 98EX192)}, 30 | pages={106--111}, 31 | year={1998}, 32 | organization={IEEE} 33 | } 34 | @inproceedings{lengauer1993loop, 35 | title={\href{http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.29.8716&rep=rep1&type=pdf}{Loop parallelization in the polytope model}}, 36 | author={Lengauer, Christian}, 37 | booktitle={International Conference on Concurrency Theory}, 38 | pages={398--416}, 39 | year={1993}, 40 | organization={Springer} 41 | } 42 | -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Polyhedral_compilatioin/Polyhedral_representation/2d_puls_1.tex: -------------------------------------------------------------------------------- 1 | \section{the $2\mathbf{d}+1$ representation} 2 | 3 | Refer to paper \cite{girbal2006semi}\cite{vasilache2007scalable}\cite{bastoul2011openscop} for details. 
4 | 5 | \subsection {Classical loop transformations} 6 | 7 | \begin{enumerate} 8 | \item only modifies the iteration domain but does not affect the order in which 9 | statement instances are executed or the way arrays are accessed; 10 | \begin{enumerate} 11 | \item loop unrolling 12 | \item strip-mining 13 | \end{enumerate} 14 | \item modifies both the iteration domain and the schedule; 15 | \begin{enumerate} 16 | \item tiling: a combination of strip-mining and loop interchange; 17 | \end{enumerate} 18 | \item modifies schedule; 19 | \begin{enumerate} 20 | \item shifting/pipelining 21 | \end{enumerate} 22 | \item modifies array subscripts 23 | \begin{enumerate} 24 | \item privatization 25 | \end{enumerate} 26 | \item only modifies the array declarations (data layout) 27 | \begin{enumerate} 28 | \item padding 29 | \end{enumerate} 30 | \end{enumerate} 31 | 32 | \subsection{Polyhedral model} 33 | 34 | The polyhedral representation is a semantics-based representation instead of 35 | syntax-based representation. It clearly separates the four different types of 36 | actions performed by program transformations: 37 | 38 | \begin{enumerate} 39 | \item modification of the iteration domain (loop bounds and strides); 40 | \item modification of the schedule of each individual statement; 41 | \item modification of access functions (array subscripts) 42 | \item modification of the data layout (array declarations) 43 | \end{enumerate} 44 | 45 | Loop transformations are expressed as ``syntax-free'' function compositions. 46 | 47 | Arbitrarily complex compositions of classical transformations can be captured 48 | in one single transformation step of the polyhedral model.
49 | -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Polyhedral_compilatioin/Polyhedral_representation/main.tex: -------------------------------------------------------------------------------- 1 | \documentclass {article} 2 | 3 | \input{structure.tex} % Include the file specifying the document structure and custom commands 4 | \title{\href{https://link.springer.com/content/pdf/10.1007/s10766-006-0012-3.pdf}{Semi-Automatic Composition of Loop Transformations for Deep Parallelism and Memory Hierarchies}} 5 | \author{Ying Cao} 6 | \date{\today} 7 | 8 | \begin{document} 9 | 10 | \maketitle 11 | \tableofcontents 12 | \newpage 13 | \input{2d_puls_1.tex} 14 | \input{schedule_tree.tex} 15 | 16 | { 17 | \small 18 | \raggedright 19 | \bibliographystyle{ieeetr} 20 | % or, abbrv, acm, alpha, apalike, ieeetr, plain, siam, unsrt 21 | \begin{spacing}{1} 22 | \bibliography{references.bib} 23 | \end{spacing} 24 | } 25 | \end{document} 26 | -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Polyhedral_compilatioin/Polyhedral_representation/schedule_tree.tex: -------------------------------------------------------------------------------- 1 | \section{Schedule Trees} 2 | Refer to paper \cite{grosser2014decoupled} \cite{verdoolaege2014schedule} for details. 3 | -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Polyhedral_compilatioin/README.md: -------------------------------------------------------------------------------- 1 | # Polyhedral Compilation 2 | 3 | ## Polyhedral Program Representation 4 | 5 | 1. Bastoul, Cédric, et al. "[Putting polyhedral loop transformations to work](https://hal.inria.fr/file/index/docid/71681/filename/RR-4902.pdf)." International Workshop on Languages and Compilers for Parallel Computing. Springer, Berlin, Heidelberg, 2003.
6 | - Clan project: http://icps.u-strasbg.fr/~bastoul/development/clan/#DL 7 | - [Clan - a polyhedral representation extractor for high level programs](http://icps.u-strasbg.fr/~bastoul/development/clan/docs/clan.html) 8 | 9 | >_This is the paper for Clan, a tool to extract polyhedral program representation from C programs._ 10 | > 11 | >_This paper answers the question of what a polyhedral program representation looks like and helps get quick understandings and intuitions of basic concepts of polyhedral compilation and its workflow._ 12 | > 13 | >_The paper goes through basic concepts, the whole workflow with polyhedral program representation as a focus, and inputs, outputs, requirements and formulations for each step without digging into whys and hows._ 14 | 15 | 1. Verdoolaege, Sven, and Tobias Grosser. "[Polyhedral extraction tool](https://www.grosser.es/publications/grosser-2012-Polyhedral-Extraction-Tool-IMPACT.pdf)." Second International Workshop on Polyhedral Compilation Techniques (IMPACT’12), Paris, France. 2012. 16 | - Pet project: https://github.com/Meinersbur/pet 17 | - The doctoral thesis of Pet's author. Chapter 9 of: Grosser, Tobias. [A decoupled approach to high-level loop optimization: tile shapes, polyhedral building blocks and low-level compilers](https://tel.archives-ouvertes.fr/tel-01144563/document). Diss. 2014. 18 | 19 | 1.
[Polyhedral Process Networks](https://www.semanticscholar.org/paper/Polyhedral-Process-Networks-Verdoolaege/e8f64c573a680cddb6ede148c1778b94afb70830) 20 | -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Polyhedral_compilatioin/Tiramisu/tiramisu_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Polyhedral_compilatioin/Tiramisu/tiramisu_overview.png -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Polyhedral_compilatioin/images/introduction-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Polyhedral_compilatioin/images/introduction-01.png -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Polyhedral_compilatioin/images/introduction-02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Polyhedral_compilatioin/images/introduction-02.png -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Polyhedral_compilatioin/images/introduction-03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Polyhedral_compilatioin/images/introduction-03.png -------------------------------------------------------------------------------- 
/paper_notes/compiler-stuffs/Polyhedral_compilatioin/images/introduction-04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Polyhedral_compilatioin/images/introduction-04.png -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Polyhedral_compilatioin/isl/contents.tex: -------------------------------------------------------------------------------- 1 | Refer to this tutorial\cite{verdoolaege2016presburger} for the first-hand knowledge. 2 | 3 | Concepts are highlighted in \textcolor{vr}{violet red}, and their corresponding isl implementations are highlighted in \textcolor{og}{olive green}. 4 | 5 | \input{sets_and_maps.tex} 6 | \input{presburger_sets_and_relations.tex} 7 | \input{pw_quasi_affine.tex} 8 | -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Polyhedral_compilatioin/isl/main.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Polyhedral_compilatioin/isl/main.pdf -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Polyhedral_compilatioin/isl/main.tex: -------------------------------------------------------------------------------- 1 | \documentclass {article} 2 | 3 | \input{structure.tex} % Include the file specifying the document structure and custom commands 4 | \title{\textit{isl} Basics} 5 | \author{Ying Cao} 6 | \date{\today} 7 | 8 | \begin{document} 9 | 10 | \maketitle 11 | \tableofcontents 12 | \newpage 13 | \input{contents.tex} 14 | 15 | { 16 | \small 17 | \raggedright 18 | \bibliographystyle{ieeetr} 19 | % or, abbrv, acm, alpha, apalike, ieeetr, plain, siam,
unsrt 20 | \begin{spacing}{1} 21 | \bibliography{references.bib} 22 | \end{spacing} 23 | } 24 | \end{document} 25 | -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Polyhedral_compilatioin/isl/references.bib: -------------------------------------------------------------------------------- 1 | @article{verdoolaege2016presburger, 2 | title={\href{https://lirias.kuleuven.be/retrieve/361209}{Presburger formulas and polyhedral compilation}}, 3 | author={Verdoolaege, Sven}, 4 | year={2016} 5 | } 6 | -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Polyhedral_compilatioin/polyhedral_background_01.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Polyhedral_compilatioin/polyhedral_background_01.pdf -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Polyhedral_compilatioin/polyhedral_background_02.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Polyhedral_compilatioin/polyhedral_background_02.pdf -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/Polyhedral_compilatioin/polyhedral_background_03.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/Polyhedral_compilatioin/polyhedral_background_03.pdf -------------------------------------------------------------------------------- /paper_notes/compiler-stuffs/intermediate-code-generation.md: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/compiler-stuffs/intermediate-code-generation.md -------------------------------------------------------------------------------- /paper_notes/data_processing_systems/README.md: -------------------------------------------------------------------------------- 1 | 1. Murray D G, Schwarzkopf M, Smowton C, et al. [Ciel: a universal execution engine for distributed data-flow computing](https://web.kaust.edu.sa/Faculty/MarcoCanini/classes/CS345/S19/papers/ciel.pdf)[C]//Proc. 8th ACM/USENIX Symposium on Networked Systems Design and Implementation. 2011: 113-126. 2 | 1. Malewicz, Grzegorz, et al. "[Pregel: a system for large-scale graph processing](https://www.researchgate.net/profile/James-Dehnert/publication/221257383_Pregel_A_system_for_large-scale_graph_processing/links/00b7d537c615821fa4000000/Pregel-A-system-for-large-scale-graph-processing.pdf)." Proceedings of the 2010 ACM SIGMOD International Conference on Management of data. 2010. 3 | 1. Rocklin M. [Dask: Parallel computation with blocked algorithms and task scheduling](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.825.5314&rep=rep1&type=pdf)[C]//Proceedings of the 14th python in science conference. Austin, TX: SciPy, 2015, 126. 4 | 1. Power R, Li J. [Piccolo: Building Fast, Distributed Programs with Partitioned Tables](https://static.usenix.org/events/osdi10/tech/full_papers/Power.pdf)[C]//OSDI. 2010, 10: 293-306. 
5 | -------------------------------------------------------------------------------- /paper_notes/data_processing_systems/figures/ciel_cluster_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/data_processing_systems/figures/ciel_cluster_architecture.png -------------------------------------------------------------------------------- /paper_notes/data_processing_systems/figures/dynamic-task-graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/data_processing_systems/figures/dynamic-task-graph.png -------------------------------------------------------------------------------- /paper_notes/dataflow-architectures/README.md: -------------------------------------------------------------------------------- 1 | # Reading List 2 | 3 | ## Dataflow architecture 4 | 5 | - [ ] [Timely Dataflow: A model](https://static.googleusercontent.com/media/research.google.com/zh-CN//pubs/archive/43546.pdf) 6 | - [ ] [Dataflow computers: their history and future](https://csrl.cse.unt.edu/kavi/Research/encyclopedia-dataflow.pdf) 7 | - [ ] [Reducing control overhead in dataflow architectures](http://arcade.cs.columbia.edu/ws-pact06.pdf) 8 | 9 | ## Scheduled Dataflow 10 | 11 | - [ ] [Scheduled dataflow: execution paradigm, architecture, and performance evaluation](https://www.researchgate.net/profile/Roberto_Giorgi/publication/3044386_Scheduled_Dataflow_Execution_paradigm_architecture_and_performance_evaluation/links/0912f50c049bbceb3c000000/Scheduled-Dataflow-Execution-paradigm-architecture-and-performance-evaluation.pdf?origin=publication_detail) 12 | 13 | ## Some slides 14 | 15 | - [ ] [Dataflow architectures](https://homes.cs.washington.edu/~kstrauss/presentations/df-class.pdf) 16 | - 
[ ] [Computer Architecture: Dataflow (Part I)](https://www.archive.ece.cmu.edu/~ece740/f13/lib/exe/fetch.php?media=onur-740-fall13-module5.2.1-dataflow-part1.pdf) 17 | 18 | ## Miscellanea 19 | 20 | 1. [mueller's publicatioins](https://arcb.csc.ncsu.edu/~mueller/publications.html#mueller91) 21 | 1. [ECE 4530 Hardware/Software Codesign](https://schaumont.dyn.wpi.edu/ece4530f19/) 22 | 1. [dMazeRunner: Executing Perfectly Nested Loops on Dataflow Accelerators](https://dl.acm.org/doi/pdf/10.1145/3358198) 23 | -------------------------------------------------------------------------------- /paper_notes/dataflow-architectures/images/Manchester-dynamic-dataflow-machine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/dataflow-architectures/images/Manchester-dynamic-dataflow-machine.png -------------------------------------------------------------------------------- /paper_notes/dataflow-architectures/images/dataflow-accumulator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/dataflow-architectures/images/dataflow-accumulator.png -------------------------------------------------------------------------------- /paper_notes/dataflow-architectures/images/dataflow-graph-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/dataflow-architectures/images/dataflow-graph-1.png -------------------------------------------------------------------------------- /paper_notes/dataflow-architectures/images/ill-formed_multi-rate-dataflow.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/dataflow-architectures/images/ill-formed_multi-rate-dataflow.png -------------------------------------------------------------------------------- /paper_notes/dataflow-architectures/images/img1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/dataflow-architectures/images/img1.png -------------------------------------------------------------------------------- /paper_notes/dataflow-architectures/images/img2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/dataflow-architectures/images/img2.png -------------------------------------------------------------------------------- /paper_notes/dataflow-architectures/images/paper-screenshot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/dataflow-architectures/images/paper-screenshot-1.png -------------------------------------------------------------------------------- /paper_notes/dataflow-architectures/images/periodic_admissible_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/dataflow-architectures/images/periodic_admissible_schedule.png -------------------------------------------------------------------------------- /paper_notes/dataflow-architectures/images/solve_G.q.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/dataflow-architectures/images/solve_G.q.png -------------------------------------------------------------------------------- /paper_notes/dataflow-architectures/images/static-dataflow-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/dataflow-architectures/images/static-dataflow-architecture.png -------------------------------------------------------------------------------- /paper_notes/dataflow-architectures/images/topology_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/dataflow-architectures/images/topology_matrix.png -------------------------------------------------------------------------------- /paper_notes/dataflow-architectures/images/two-input_add_actor_and_two-output_duplicate_actor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/dataflow-architectures/images/two-input_add_actor_and_two-output_duplicate_actor.png -------------------------------------------------------------------------------- /paper_notes/dl-compiler/Glow/Glow.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/dl-compiler/Glow/Glow.pdf -------------------------------------------------------------------------------- /paper_notes/dl-compiler/Glow/images/low-level-glow-ir.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/dl-compiler/Glow/images/low-level-glow-ir.png -------------------------------------------------------------------------------- /paper_notes/dl-compiler/MLIR/MLIR.md: -------------------------------------------------------------------------------- 1 | # MLIR 2 | 3 | "flow-sensitive" type checking. 4 | 5 | 6 | # Reference 7 | 8 | 1. [Multi-Level Intermediate Representation Overview](https://github.com/tensorflow/mlir#multi-level-intermediate-representation-overview) 9 | 1. [MLIR Tutorial: Building a Compiler with MLIR](https://llvm.org/devmtg/2019-04/slides/Tutorial-AminiVasilacheZinenko-MLIR.pdf) 10 | 1. [A discussion about MLIR from the TVM community](https://discuss.tvm.ai/t/google-lasted-work-mlir-primer/1721/2). 11 | 1. [A Reddit discussion about MLIR](https://www.reddit.com/r/ProgrammingLanguages/comments/at0alm/mlir_primer_a_compiler_infrastructure_for_the_end/) 12 | 1. [2019 EuroLLVM Developers’ Meeting: “MLIR: Multi-Level Intermediate Representation”](https://www.youtube.com/watch?v=qzljG6DKgic&feature=youtu.be) 13 | -------------------------------------------------------------------------------- /paper_notes/dl-compiler/MLIR/README.md: -------------------------------------------------------------------------------- 1 | # References 2 | 3 | 1. [The GitHub project](https://github.com/tensorflow/mlir) 4 | 1. Two videos on YouTube 5 | 1. 
[2019 EuroLLVM Developers’ Meeting: Mehdi & Vasilache & Zinenko “Building a Compiler with MLIR”](https://www.youtube.com/watch?v=cyICUIZ56wQ) 6 | -------------------------------------------------------------------------------- /paper_notes/dl-compiler/MLIR/swift_for_tensorflow.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/dl-compiler/MLIR/swift_for_tensorflow.pdf -------------------------------------------------------------------------------- /paper_notes/dl-compiler/TVM/TVM.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/dl-compiler/TVM/TVM.pdf -------------------------------------------------------------------------------- /paper_notes/dl-compiler/XLA/XLA.md: -------------------------------------------------------------------------------- 1 | # XLA: The TensorFlow compiler framework 2 | 3 | * [TensorFlow, Compiled!](https://autodiff-workshop.github.io/slides/JeffDean.pdf) 4 | 5 | ## Goals 6 | 7 | 1. Improved execution speed. 8 | 1. Improved tensor buffer memory usage. 9 | 1. Make the performance of low-level Ops be the same as that of hand-written fused implementations. 10 | 1. Improved mobile footprint. Eliminate the TensorFlow runtime. 11 | 1. Improved portability. 12 | * It should be relatively easy to write a new back-end for novel hardware. 13 | 14 | ## XLA 15 | 16 | * The semantics of operations are _**high level**_. This preserves enough information to allow sophisticated scheduling and optimization. 17 | 18 |  19 | 20 | * XLA program = static, decomposed TF ops 21 | * math-looking _**primitive ops**_ 22 | * make _**macro-ops by composition**_ 23 | 24 | ### A key question: why write every new macro-op? 25 | 26 | * Why write every new macro-op in C++? 
27 | * Why can't we compose new operators out of existing TF ops? 28 | 29 | ### Compilation benefits 30 | 31 | 1. Eliminates op dispatch overhead. 32 | 1. Fuses ops. 33 | * reduce memory access 34 | 1. Memory usage analysis 35 | * reuse memory 36 | * update in-place 37 | 1. Models to executables: reduce executable size by generating what you need. 38 | -------------------------------------------------------------------------------- /paper_notes/dl-compiler/figures/sm-and-sub-core-of-volta.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/dl-compiler/figures/sm-and-sub-core-of-volta.png -------------------------------------------------------------------------------- /paper_notes/dl-models/beyond_transformer/README.md: -------------------------------------------------------------------------------- 1 | 1. [VanillaNet: the Power of Minimalism in 2 | Deep Learning](https://arxiv.org/pdf/2305.12972.pdf) 3 | 1. 
[RWKV: Reinventing RNNs for the Transformer Era](https://arxiv.org/abs/2305.13048) 4 | -------------------------------------------------------------------------------- /paper_notes/dl-models/miscellanea/Geometric_deep_learning.md: -------------------------------------------------------------------------------- 1 | # [Geometric Deep Learning: Grids, Groups, Graphs, Geodesics, and Gauges](https://arxiv.org/pdf/2104.13478.pdf) 2 | -------------------------------------------------------------------------------- /paper_notes/dl-models/miscellanea/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/dl-models/miscellanea/README.md -------------------------------------------------------------------------------- /paper_notes/dl-models/nlp/RNN-modeling/CW-RNN/A_Clockwork_RNN.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/dl-models/nlp/RNN-modeling/CW-RNN/A_Clockwork_RNN.pdf -------------------------------------------------------------------------------- /paper_notes/dl-models/nlp/RNN-modeling/GridLSTM/GridLSTM.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/dl-models/nlp/RNN-modeling/GridLSTM/GridLSTM.pdf -------------------------------------------------------------------------------- /paper_notes/dl-models/nlp/RNN-modeling/HM-LSTM/Hierarchical_multiscale_RNN.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/dl-models/nlp/RNN-modeling/HM-LSTM/Hierarchical_multiscale_RNN.pdf 
-------------------------------------------------------------------------------- /paper_notes/dl-models/nlp/RNN-modeling/How_Much_Attention_Do_You_Need.md: -------------------------------------------------------------------------------- 1 | # [How Much Attention Do You Need?](http://aclweb.org/anthology/P18-1167) 2 | 3 | 1. The performance of recurrent and convolutional models can be very close to the Transformer performance by borrowing concepts from the Transformer architecture, but not using self-attention. 4 | 1. Self-attention is much more important for the encoder side than for the decoder side. 5 | * On the encoder side, self-attention can be replaced by an RNN or CNN without a loss in performance in most settings. 6 | * One surprising experimental result is that even a model without any target side self-attention performs well. 7 | 1. Source attention on lower encoder layers brings no additional benefit. 8 | 1. The largest gains come from multiple attention mechanisms and residual feed-forward layers. 
9 | -------------------------------------------------------------------------------- /paper_notes/dl-models/nlp/RNN-modeling/MD-LSTM/MD-LSTM.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/dl-models/nlp/RNN-modeling/MD-LSTM/MD-LSTM.pdf -------------------------------------------------------------------------------- /paper_notes/dl-models/nlp/RNN-modeling/Mogrifier-LSTM/images/MogrifierLSTM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/dl-models/nlp/RNN-modeling/Mogrifier-LSTM/images/MogrifierLSTM.png -------------------------------------------------------------------------------- /paper_notes/dl-models/nlp/RNN-modeling/Mogrifier-LSTM/main.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/dl-models/nlp/RNN-modeling/Mogrifier-LSTM/main.pdf -------------------------------------------------------------------------------- /paper_notes/dl-models/nlp/RNN-modeling/Mogrifier-LSTM/main.tex: -------------------------------------------------------------------------------- 1 | \documentclass {article} 2 | 3 | \input{structure.tex} % Include the file specifying the document structure and custom commands 4 | \title{Note for 5 | \textit{\href{https://arxiv.org/abs/1909.01792}{Mogrifier LSTM}}} 6 | \author{Ying Cao} 7 | \date{\today} 8 | 9 | \begin{document} 10 | 11 | \maketitle 12 | \tableofcontents 13 | 14 | \begin{info}[Codes information] 15 | 16 | \begin{itemize} 17 | \item Currently, the authors of this paper only release their \href{https://github.com/RMichaelSwan/MogrifierLSTM}{experimental codes} 18 | on the github. 
19 | \item The final code is not released yet. When the code is available, it should 20 | be at \href{https://github.com/deepmind/lamb}{https://github.com/deepmind/lamb}. 21 | \end{itemize} 22 | \end{info} 23 | 24 | \input{contents.tex} 25 | 26 | { 27 | \small 28 | \raggedright 29 | \bibliographystyle{ieeetr} 30 | % or, abbrv, acm, alpha, apalike, ieeetr, plain, siam, unsrt 31 | \begin{spacing}{1} 32 | \bibliography{references.bib} 33 | \end{spacing} 34 | } 35 | \end{document} 36 | -------------------------------------------------------------------------------- /paper_notes/dl-models/nlp/RNN-modeling/Mogrifier-LSTM/references.bib: -------------------------------------------------------------------------------- 1 | @inproceedings{foerster2017input, 2 | title={Input switched affine networks: an RNN architecture designed for interpretability}, 3 | author={Foerster, Jakob N and Gilmer, Justin and Sohl-Dickstein, Jascha and Chorowski, Jan and Sussillo, David}, 4 | booktitle={Proceedings of the 34th International Conference on Machine Learning-Volume 70}, 5 | pages={1136--1145}, 6 | year={2017}, 7 | organization={JMLR. 
org} 8 | } 9 | @article{ha2016hypernetworks, 10 | title={Hypernetworks}, 11 | author={Ha, David and Dai, Andrew and Le, Quoc V}, 12 | journal={arXiv preprint arXiv:1609.09106}, 13 | year={2016} 14 | } 15 | @article{krause2016multiplicative, 16 | title={Multiplicative LSTM for sequence modelling}, 17 | author={Krause, Ben and Lu, Liang and Murray, Iain and Renals, Steve}, 18 | journal={arXiv preprint arXiv:1609.07959}, 19 | year={2016} 20 | } 21 | -------------------------------------------------------------------------------- /paper_notes/dl-models/nlp/RNN-modeling/Neural_Speed_Reading_via_Skim_RNN/Neural_Speed_Reading_via_Skim_RNN.md: -------------------------------------------------------------------------------- 1 | [The Gumbel-Max Trick for Discrete Distributions](https://hips.seas.harvard.edu/blog/2013/04/06/the-gumbel-max-trick-for-discrete-distributions/) 2 | 3 | [Categorical Variational Autoencoders using Gumbel-Softmax](https://blog.evjang.com/2016/11/tutorial-categorical-variational.html) 4 | -------------------------------------------------------------------------------- /paper_notes/dl-models/nlp/RNN-modeling/ON-LSTM/ON-LSTM.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/dl-models/nlp/RNN-modeling/ON-LSTM/ON-LSTM.pdf -------------------------------------------------------------------------------- /paper_notes/dl-models/nlp/RNN-modeling/Quasi-Recurrent_neural_network/Quasi-Recurrent_neural_network.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/dl-models/nlp/RNN-modeling/Quasi-Recurrent_neural_network/Quasi-Recurrent_neural_network.pdf -------------------------------------------------------------------------------- 
/paper_notes/dl-models/nlp/RNN-modeling/RNN_Variants_Slides_190820/RNN_Variants.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/dl-models/nlp/RNN-modeling/RNN_Variants_Slides_190820/RNN_Variants.pdf -------------------------------------------------------------------------------- /paper_notes/dl-models/nlp/RNN-modeling/RNN_Variants_Slides_190820/images/CudnnLSTM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/dl-models/nlp/RNN-modeling/RNN_Variants_Slides_190820/images/CudnnLSTM.png -------------------------------------------------------------------------------- /paper_notes/dl-models/nlp/RNN-modeling/RNN_Variants_Slides_190820/images/Recurrent_neural_network_unfold.svg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcy-seso/LearningNotes/6b63c89e2580ae1503dee7a0b0d3456566281334/paper_notes/dl-models/nlp/RNN-modeling/RNN_Variants_Slides_190820/images/Recurrent_neural_network_unfold.svg.png -------------------------------------------------------------------------------- /paper_notes/dl-models/nlp/RNN-modeling/Sliced_Recurrent_Neural_Networks/Sliced_Recurrent_Neural_Networks.md: -------------------------------------------------------------------------------- 1 | # [Sliced Recurrent Neural Networks](https://arxiv.org/abs/1807.02291) 2 | 3 | ## Model Structure 4 | 5 | 2 hyperparameters of SRNN 6 | 7 | 1. slice number $n$ 8 | 1. slicing times $k$ 9 | 10 | The input sequence is $X = [x_1, x_2, \dots, x_T]$ whose length is $T$. 11 | 12 | 1. Slice $X$ into $n$ subsequences of equal length. 13 | 1. Repeat the above process $k$ times until a pre-defined minimum sequence length is obtained. 14 | 1. 
Apply RNN function to each subsequence. 15 | 16 |
17 |
18 |
2 |
3 | Fig. The original GPT model. (from [wiki page](https://en.wikipedia.org/wiki/Generative_pre-trained_transformer))
4 |
8 |
9 |
17 |
18 |
4 |
5 |
8 |
9 |
10 |
11 |
12 |
19 |
20 |
21 |
6 |
7 |