├── .gitignore
├── README.md
├── section1_token_level_generation
    ├── LICENSE.md
    ├── README.md
    ├── constrained_decoding.ipynb
    └── prompt_boundaries_and_token_healing.ipynb
├── section2_metageneration
    ├── README.md
    ├── mbpp
    │   ├── mbpp_demo.ipynb
    │   └── mbpp_utils.py
    └── treefinement
    │   ├── treefinement-background.png
    │   ├── treefinement.ipynb
    │   ├── treefinement_vs_parallel.png
    │   └── utils.py
└── section3
    ├── README.md
    ├── Speculative_Decoding_Demo.ipynb
    ├── __init__.py
    ├── speculative_decoding_utils.py
    ├── speculative_decoding_vllm_benchmark.py
    ├── vllm_benchmark
        ├── Avg_Latency.png
        ├── Draft_Acceptance_Rate.png
        ├── Scoring_Time.png
        ├── System_Efficiency.png
        ├── Time_Per_Proposal.png
        └── Verification_Time.png
    ├── vllm_benchmark_h100
        ├── Average Time per Proposal Token vs Number of Speculative Tokens.png
        ├── Draft Acceptance Rate vs Number of Speculative Tokens.png
        ├── Runtime vs Number of Speculative Tokens.png
        ├── Scoring Time vs Number of Speculative Tokens.png
        ├── System Efficiency vs Number of Speculative Tokens.png
        ├── Verification Time vs Number of Speculative Tokens.png
        ├── easy_metrics_gsm.csv
        └── hard_metrics_gsm.csv
    └── vllm_benchmark_h100_wbase
        ├── Average Time per Proposal Token vs Number of Speculative Tokens.png
        ├── Draft Acceptance Rate vs Number of Speculative Tokens.png
        ├── Runtime vs Number of Speculative Tokens.png
        ├── Scoring Time vs Number of Speculative Tokens.png
        ├── System Efficiency vs Number of Speculative Tokens.png
        ├── Verification Time vs Number of Speculative Tokens.png
        ├── easy_metrics_gsm.csv
        └── hard_metrics_gsm.csv


/.gitignore:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/.gitignore


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/README.md


--------------------------------------------------------------------------------
/section1_token_level_generation/LICENSE.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/section1_token_level_generation/LICENSE.md


--------------------------------------------------------------------------------
/section1_token_level_generation/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/section1_token_level_generation/README.md


--------------------------------------------------------------------------------
/section1_token_level_generation/constrained_decoding.ipynb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/section1_token_level_generation/constrained_decoding.ipynb


--------------------------------------------------------------------------------
/section1_token_level_generation/prompt_boundaries_and_token_healing.ipynb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/section1_token_level_generation/prompt_boundaries_and_token_healing.ipynb


--------------------------------------------------------------------------------
/section2_metageneration/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/section2_metageneration/README.md


--------------------------------------------------------------------------------
/section2_metageneration/mbpp/mbpp_demo.ipynb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/section2_metageneration/mbpp/mbpp_demo.ipynb


--------------------------------------------------------------------------------
/section2_metageneration/mbpp/mbpp_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/section2_metageneration/mbpp/mbpp_utils.py


--------------------------------------------------------------------------------
/section2_metageneration/treefinement/treefinement-background.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/section2_metageneration/treefinement/treefinement-background.png


--------------------------------------------------------------------------------
/section2_metageneration/treefinement/treefinement.ipynb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/section2_metageneration/treefinement/treefinement.ipynb


--------------------------------------------------------------------------------
/section2_metageneration/treefinement/treefinement_vs_parallel.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/section2_metageneration/treefinement/treefinement_vs_parallel.png


--------------------------------------------------------------------------------
/section2_metageneration/treefinement/utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/section2_metageneration/treefinement/utils.py


--------------------------------------------------------------------------------
/section3/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/section3/README.md


--------------------------------------------------------------------------------
/section3/Speculative_Decoding_Demo.ipynb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/section3/Speculative_Decoding_Demo.ipynb


--------------------------------------------------------------------------------
/section3/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/section3/speculative_decoding_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/section3/speculative_decoding_utils.py


--------------------------------------------------------------------------------
/section3/speculative_decoding_vllm_benchmark.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/section3/speculative_decoding_vllm_benchmark.py


--------------------------------------------------------------------------------
/section3/vllm_benchmark/Avg_Latency.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/section3/vllm_benchmark/Avg_Latency.png


--------------------------------------------------------------------------------
/section3/vllm_benchmark/Draft_Acceptance_Rate.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/section3/vllm_benchmark/Draft_Acceptance_Rate.png


--------------------------------------------------------------------------------
/section3/vllm_benchmark/Scoring_Time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/section3/vllm_benchmark/Scoring_Time.png


--------------------------------------------------------------------------------
/section3/vllm_benchmark/System_Efficiency.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/section3/vllm_benchmark/System_Efficiency.png


--------------------------------------------------------------------------------
/section3/vllm_benchmark/Time_Per_Proposal.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/section3/vllm_benchmark/Time_Per_Proposal.png


--------------------------------------------------------------------------------
/section3/vllm_benchmark/Verification_Time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/section3/vllm_benchmark/Verification_Time.png


--------------------------------------------------------------------------------
/section3/vllm_benchmark_h100/Average Time per Proposal Token vs Number of Speculative Tokens.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/section3/vllm_benchmark_h100/Average Time per Proposal Token vs Number of Speculative Tokens.png


--------------------------------------------------------------------------------
/section3/vllm_benchmark_h100/Draft Acceptance Rate vs Number of Speculative Tokens.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/section3/vllm_benchmark_h100/Draft Acceptance Rate vs Number of Speculative Tokens.png


--------------------------------------------------------------------------------
/section3/vllm_benchmark_h100/Runtime vs Number of Speculative Tokens.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/section3/vllm_benchmark_h100/Runtime vs Number of Speculative Tokens.png


--------------------------------------------------------------------------------
/section3/vllm_benchmark_h100/Scoring Time vs Number of Speculative Tokens.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/section3/vllm_benchmark_h100/Scoring Time vs Number of Speculative Tokens.png


--------------------------------------------------------------------------------
/section3/vllm_benchmark_h100/System Efficiency vs Number of Speculative Tokens.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/section3/vllm_benchmark_h100/System Efficiency vs Number of Speculative Tokens.png


--------------------------------------------------------------------------------
/section3/vllm_benchmark_h100/Verification Time vs Number of Speculative Tokens.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/section3/vllm_benchmark_h100/Verification Time vs Number of Speculative Tokens.png


--------------------------------------------------------------------------------
/section3/vllm_benchmark_h100/easy_metrics_gsm.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/section3/vllm_benchmark_h100/easy_metrics_gsm.csv


--------------------------------------------------------------------------------
/section3/vllm_benchmark_h100/hard_metrics_gsm.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/section3/vllm_benchmark_h100/hard_metrics_gsm.csv


--------------------------------------------------------------------------------
/section3/vllm_benchmark_h100_wbase/Average Time per Proposal Token vs Number of Speculative Tokens.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/section3/vllm_benchmark_h100_wbase/Average Time per Proposal Token vs Number of Speculative Tokens.png


--------------------------------------------------------------------------------
/section3/vllm_benchmark_h100_wbase/Draft Acceptance Rate vs Number of Speculative Tokens.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/section3/vllm_benchmark_h100_wbase/Draft Acceptance Rate vs Number of Speculative Tokens.png


--------------------------------------------------------------------------------
/section3/vllm_benchmark_h100_wbase/Runtime vs Number of Speculative Tokens.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/section3/vllm_benchmark_h100_wbase/Runtime vs Number of Speculative Tokens.png


--------------------------------------------------------------------------------
/section3/vllm_benchmark_h100_wbase/Scoring Time vs Number of Speculative Tokens.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/section3/vllm_benchmark_h100_wbase/Scoring Time vs Number of Speculative Tokens.png


--------------------------------------------------------------------------------
/section3/vllm_benchmark_h100_wbase/System Efficiency vs Number of Speculative Tokens.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/section3/vllm_benchmark_h100_wbase/System Efficiency vs Number of Speculative Tokens.png


--------------------------------------------------------------------------------
/section3/vllm_benchmark_h100_wbase/Verification Time vs Number of Speculative Tokens.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/section3/vllm_benchmark_h100_wbase/Verification Time vs Number of Speculative Tokens.png


--------------------------------------------------------------------------------
/section3/vllm_benchmark_h100_wbase/easy_metrics_gsm.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/section3/vllm_benchmark_h100_wbase/easy_metrics_gsm.csv


--------------------------------------------------------------------------------
/section3/vllm_benchmark_h100_wbase/hard_metrics_gsm.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-l3/neurips2024-inference-tutorial-code/HEAD/section3/vllm_benchmark_h100_wbase/hard_metrics_gsm.csv


--------------------------------------------------------------------------------