├── VLA.png ├── vlas.png ├── Plot_script ├── .gitkeep ├── plots │ ├── .gitkeep │ ├── forest_plot.png │ ├── VLA_FEB_Hist.png │ ├── factor_analysis.png │ ├── scale_analysis_4panel.png │ ├── decoder_analysis_2panel.png │ ├── encoder_analysis_4panel.png │ ├── encoder_domain_faceted.png │ ├── merged_decoder_encoder_6panel.png │ ├── scale_analysis_adjusted_4panel.png │ ├── domain_component_analysis_4panel.png │ ├── vla_fusion_theory_visualization_3panel.pdf │ ├── vla_fusion_theory_visualization_3panel.png │ ├── factor_loadings.csv │ ├── factor_analysis.svg │ ├── decoder_analysis_2panel.svg │ ├── forest_plot.svg │ ├── scale_analysis_4panel.svg │ └── scale_analysis_adjusted_4panel.svg ├── requirements.txt ├── README.md ├── dataset_plot.py └── top75.csv ├── benchmarkdataset.png ├── dataset_plot.py └── README.md /VLA.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Muhayyuddin/VLAs/HEAD/VLA.png -------------------------------------------------------------------------------- /vlas.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Muhayyuddin/VLAs/HEAD/vlas.png -------------------------------------------------------------------------------- /Plot_script/.gitkeep: -------------------------------------------------------------------------------- 1 | # Placeholder file to maintain empty directory structure -------------------------------------------------------------------------------- /Plot_script/plots/.gitkeep: -------------------------------------------------------------------------------- 1 | # Placeholder file to maintain empty directory structure -------------------------------------------------------------------------------- /benchmarkdataset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Muhayyuddin/VLAs/HEAD/benchmarkdataset.png -------------------------------------------------------------------------------- /Plot_script/plots/forest_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Muhayyuddin/VLAs/HEAD/Plot_script/plots/forest_plot.png -------------------------------------------------------------------------------- /Plot_script/plots/VLA_FEB_Hist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Muhayyuddin/VLAs/HEAD/Plot_script/plots/VLA_FEB_Hist.png -------------------------------------------------------------------------------- /Plot_script/plots/factor_analysis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Muhayyuddin/VLAs/HEAD/Plot_script/plots/factor_analysis.png -------------------------------------------------------------------------------- /Plot_script/plots/scale_analysis_4panel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Muhayyuddin/VLAs/HEAD/Plot_script/plots/scale_analysis_4panel.png -------------------------------------------------------------------------------- /Plot_script/plots/decoder_analysis_2panel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Muhayyuddin/VLAs/HEAD/Plot_script/plots/decoder_analysis_2panel.png -------------------------------------------------------------------------------- 
/Plot_script/plots/encoder_analysis_4panel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Muhayyuddin/VLAs/HEAD/Plot_script/plots/encoder_analysis_4panel.png -------------------------------------------------------------------------------- /Plot_script/plots/encoder_domain_faceted.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Muhayyuddin/VLAs/HEAD/Plot_script/plots/encoder_domain_faceted.png -------------------------------------------------------------------------------- /Plot_script/plots/merged_decoder_encoder_6panel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Muhayyuddin/VLAs/HEAD/Plot_script/plots/merged_decoder_encoder_6panel.png -------------------------------------------------------------------------------- /Plot_script/plots/scale_analysis_adjusted_4panel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Muhayyuddin/VLAs/HEAD/Plot_script/plots/scale_analysis_adjusted_4panel.png -------------------------------------------------------------------------------- /Plot_script/plots/domain_component_analysis_4panel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Muhayyuddin/VLAs/HEAD/Plot_script/plots/domain_component_analysis_4panel.png -------------------------------------------------------------------------------- /Plot_script/plots/vla_fusion_theory_visualization_3panel.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Muhayyuddin/VLAs/HEAD/Plot_script/plots/vla_fusion_theory_visualization_3panel.pdf -------------------------------------------------------------------------------- /Plot_script/plots/vla_fusion_theory_visualization_3panel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Muhayyuddin/VLAs/HEAD/Plot_script/plots/vla_fusion_theory_visualization_3panel.png -------------------------------------------------------------------------------- /Plot_script/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.24.0 2 | pandas>=2.0.0 3 | matplotlib>=3.7.0 4 | seaborn>=0.12.0 5 | scipy>=1.10.0 6 | statsmodels>=0.14.0 7 | scikit-learn>=1.3.0 8 | -------------------------------------------------------------------------------- /Plot_script/plots/factor_loadings.csv: -------------------------------------------------------------------------------- 1 | ,Factor1_Architecture,Factor2_Scale,Factor3_Performance 2 | Fusion Depth,0.394,0.183,-0.222 3 | Vision Model Size,0.669,0.175,-0.051 4 | Language Model Size,0.706,-0.153,0.099 5 | Task Difficulty,0.148,-0.519,-0.143 6 | Sensor Modalities,0.185,0.114,0.263 7 | Dataset Size,-0.03,0.143,-0.24 8 | -------------------------------------------------------------------------------- /Plot_script/README.md: -------------------------------------------------------------------------------- 1 | # VLA Models Evaluation & Visualization 2 | 3 | Comprehensive analysis and visualization suite for Vision-Language-Action (VLA) models evaluation. 4 | 5 | ## Quick Start 6 | 7 | ### 1. 
Create Virtual Environment (Recommended) 8 | ```bash 9 | python3 -m venv .venv 10 | source .venv/bin/activate # On Windows: .venv\Scripts\activate 11 | ``` 12 | 13 | ### 2. Install Dependencies 14 | ```bash 15 | pip install -r requirements.txt 16 | ``` 17 | 18 | ### 3. Run Analysis 19 | ```bash 20 | python final_plots.py 21 | ``` 22 | 23 | ## Files 24 | 25 | - **`final_plots.py`** - Main script generating all visualizations 26 | - **`new_vla_models.csv`** - Dataset with 101 VLA models and evaluation metrics 27 | - **`top75.csv`** - Subset with VLA-FEB component scores (CMAS, E_fusion, R2S, GI) 28 | 29 | ## Output 30 | 31 | Plots are saved to: 32 | - `plots/` - Publication-ready figures (PNG/SVG/PDF) 33 | - Main plots include: forest plot, encoder analysis, domain analysis, fusion theory, VLA-FEB histogram 34 | 35 | ## Key Metrics 36 | 37 | - **VLA-FEB Score**: Composite metric combining Cross-Modal Alignment (CMAS), Fusion Energy (E_fusion), Real-to-Sim Transfer (R2S), and Generalization Index (GI) 38 | - **Adjusted Success**: Normalized task success rates (0-1 scale) 39 | - **Generalization Index**: Multi-task capability measure 40 | - **Difficulty Index**: Task complexity metric 41 | 42 | ## Requirements 43 | 44 | - Python 3.10+ 45 | - See `requirements.txt` for package dependencies 46 | -------------------------------------------------------------------------------- /dataset_plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from matplotlib.lines import Line2D 4 | 5 | # 1) Define per-dataset attributes (updated with corrected values and new datasets) 6 | # Descriptions: 7 | # T: Number of distinct tasks / skill types (higher is broader) 8 | # S: Scene diversity (number of unique environments) 9 | # D: Task difficulty (normalized, 0–1, higher is more challenging) 10 | # L: Task/episode length or complexity (normalized, higher = longer) 11 | # M: Number of modalities (vision, lang, proprioception, depth, audio, etc.) 
12 | # Q: List of quality/success scores (per modality or overall) 13 | # A: Average annotation or benchmark score (0–1) 14 | # R: Real-robot validation (1 = yes, 0 = sim-only) 15 | dataset_attrs = { 16 | "DROID": {"T":10, "S":5, "D":0.2, "L":1.0, "M":3, "Q":[0.9,0.8,0.85], "A":0.9, "R":1}, 17 | "Open X-Embodiment": {"T":15, "S":20, "D":0.5, "L":2.0, "M":4, "Q":[0.8,0.8,0.9,0.7], "A":0.8, "R":1}, 18 | "ALFRED": {"T":30, "S":10, "D":0.8, "L":3.0, "M":4, "Q":[0.9,0.9,0.9,0.9], "A":0.95, "R":1}, 19 | "RLBench": {"T":8, "S":6, "D":0.3, "L":1.5, "M":3, "Q":[0.7,0.8,0.7], "A":0.7, "R":0}, 20 | "TEACh": {"T":12, "S":4, "D":0.6, "L":2.5, "M":3, "Q":[0.8,0.85,0.8], "A":0.85, "R":0}, 21 | "DialFRED": {"T":25, "S":10, "D":0.75, "L":3.0, "M":4, "Q":[0.85,0.9,0.9,0.85], "A":0.9, "R":1}, 22 | "EmbodiedQA": {"T":5, "S":2, "D":0.1, "L":1.0, "M":2, "Q":[0.7,0.7], "A":0.6, "R":0}, 23 | "R2R": {"T":6, "S":3, "D":0.2, "L":1.2, "M":2, "Q":[0.8,0.75], "A":0.7, "R":0}, 24 | "Ego4D": {"T":20, "S":0, "D":0.4, "L":1.0, "M":3, "Q":[0.9,0.9,0.8], "A":0.9, "R":0}, 25 | "CVDN": {"T":15, "S":5, "D":0.5, "L":2.0, "M":3, "Q":[0.85,0.8,0.8], "A":0.85, "R":0}, 26 | "CALVIN": {"T":35, "S":15, "D":0.9, "L":3.5, "M":4, "Q":[0.9,0.85,0.9,0.9], "A":0.9, "R":1}, 27 | "RoboSpatial": {"T":4, "S":1, "D":0.1, "L":0.5, "M":2, "Q":[0.6,0.65], "A":0.5, "R":0}, 28 | "CoVLA": {"T":18, "S":8, "D":0.7, "L":2.5, "M":4, "Q":[0.85,0.8,0.85,0.8], "A":0.9, "R":1}, 29 | "AgiBot World": {"T":30, "S":25, "D":0.95, "L":4.0, "M":3, "Q":[0.8,0.8,0.8], "A":0.8, "R":1}, 30 | "RoboData": {"T":25, "S":12, "D":0.7, "L":2.5, "M":4, "Q":[0.85,0.9,0.9,0.8], "A":0.9, "R":1}, 31 | "Interleave-VLA": {"T":18, "S":8, "D":0.6, "L":2.0, "M":4, "Q":[0.8,0.85,0.8,0.75], "A":0.85, "R":1}, 32 | "Iref-VLA": {"T":22, "S":10, "D":0.65, "L":3.0, "M":5, "Q":[0.9,0.9,0.85,0.9,0.8], "A":0.9, "R":1}, 33 | "RH20T": {"T":10, "S":4, "D":0.3, "L":1.5, "M":2, "Q":[0.75,0.8], "A":0.8, "R":0}, 34 | "Robo360": {"T":30, "S":15, "D":0.8, "L":3.5, "M":5, "Q":[0.9,0.85,0.9,0.85,0.9],"A":0.95, "R":1}, 35 | "REASSEMBLE": {"T":28, "S":12, "D":0.7, "L":3.0, "M":4, "Q":[0.8,0.8,0.85,0.8], "A":0.9, "R":1}, 36 | "RoboCerebra": {"T":12, "S":6, "D":0.4, "L":2.0, "M":3, "Q":[0.85,0.9,0.85], "A":0.9, "R":0}, 37 | "TLA": {"T":35, "S":18, "D":0.85, "L":3.8, "M":4, "Q":[0.9,0.9,0.9,0.85], "A":0.9, "R":1}, 38 | "Kaiwu": {"T":30, "S":20, "D":0.7, "L":4.0, "M":7, "Q":[0.9]*7, "A":0.9, "R":1}, # Source: arXiv:2503.05231 39 | "RefSpatial-Bench": {"T":2, "S":3, "D":1.0, "L":4.0, "M":2, "Q":[0.4696, 0.0582, 0.2287, 0.2191, 0.4577, 0.47, 0.52, 0.52, 0.2421, 0.0431, 0.0927, 0.1285, 0.1474, 0.48, 0.53, 0.54], "A":0.9, "R":1}, # Source: arXiv:2506.04308 40 | } 41 | 42 | # 2) Weights 43 | α1, α2, α3, α4 = 1.0, 1.0, 1.0, 1.0 44 | β1, β2, β3, β4 = 1.0, 1.0, 1.0, 1.0 45 | 46 | # 3) Compute raw task & modality scores 47 | c_task_raw = {} 48 | c_mod_raw = {} 49 | for name, a in dataset_attrs.items(): 50 | T, S, D, L = a["T"], a["S"], a["D"], a["L"] 51 | c_task_raw[name] = α1 * np.log1p(T) + α2 * S + α3 * D + α4 * L 52 | M = a["M"] 53 | Qm = np.mean(a["Q"]) 54 | A, R = a["A"], a["R"] 55 | c_mod_raw[name] = β1 * M + β2 * Qm + β3 * A + β4 * R 56 | 57 | # 4a) Normalize task to [1,5] 58 | def norm15(d): 59 | arr = np.array(list(d.values())) 60 | mn, mx = arr.min(), arr.max() 61 | return {k: 1 + 4*(v-mn)/(mx-mn) for k, v in d.items()} 62 | 63 | # 4b) Normalize modality to [2,5] 64 | def norm25(d): 65 | arr = np.array(list(d.values())) 66 | mn, mx = arr.min(), arr.max() 67 | return {k: 2 + 3*(v-mn)/(mx-mn) for k, v in 
d.items()} 68 | 69 | c_task = norm15(c_task_raw) 70 | c_mod = norm25(c_mod_raw) 71 | 72 | # 5) Point sizes by dataset scale 73 | raw_sizes = [ 74 | 5000, 500000, 25025, 10000, 15000, 18000, 9000, 21000, 360000, 75 | 15000, 23000, 5000, 12000, 20000, 20000, 15000, 10000, 5000, 76 | 25000, 18000, 12000, 22000, 8000, 11664 77 | ] 78 | names = list(dataset_attrs.keys()) 79 | smin, smax = min(raw_sizes), max(raw_sizes) 80 | sizes = {n: (800*(sz-smin)/(smax-smin) + 300)*6 for n, sz in zip(names, raw_sizes)} 81 | 82 | # 6) Color map 83 | colors = plt.cm.tab20(np.linspace(0, 1, len(names))) 84 | col_map = dict(zip(names, colors)) 85 | # override to distinguish R2R and Ego4D 86 | col_map["R2R"] = "tab:red" 87 | col_map["Ego4D"] = "tab:purple" 88 | 89 | # 7) Plot (no text labels on bubbles) 90 | fig, ax = plt.subplots(figsize=(12, 8)) 91 | for n in names: 92 | x, y = c_task[n], c_mod[n] 93 | ax.scatter( 94 | x, y, 95 | s=sizes[n], 96 | facecolor=col_map[n], 97 | edgecolor=col_map[n], 98 | alpha=0.7, 99 | linewidth=0.5, 100 | ) 101 | 102 | # 8) Axes settings 103 | ax.set_xlim(0.75, 5.25) 104 | ax.set_ylim(1.75, 5.25) 105 | ax.set_title("Dataset & Benchmark Landscape", fontsize=16, weight='bold') 106 | ax.set_xlabel("Task Complexity", fontsize=12) 107 | ax.set_ylabel("Modality Richness", fontsize=12) 108 | ax.set_xticks([1, 2, 3, 4, 5]) 109 | ax.set_xticklabels(["Very Low", "Low", "Medium", "High", "Very High"], rotation=25, ha="right") 110 | ax.set_yticks([2, 3, 4, 5]) 111 | ax.set_yticklabels(["Minimal", "Moderate", "Rich", "Comprehensive"]) 112 | ax.grid(True, linestyle='--', alpha=0.4) 113 | 114 | # 9) Legend 115 | handles = [ 116 | Line2D([], [], marker='o', color='w', 117 | markerfacecolor=col_map[n], markeredgecolor=col_map[n], 118 | markersize=6, label=n, linestyle='') 119 | for n in names 120 | ] 121 | ax.legend(handles=handles, 122 | loc='upper center', 123 | bbox_to_anchor=(0.5, -0.15), 124 | ncol=4, 125 | fontsize=8, 126 | frameon=True) 127 | 128 | plt.tight_layout() 129 | plt.show() 130 | -------------------------------------------------------------------------------- /Plot_script/dataset_plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from matplotlib.lines import Line2D 4 | 5 | # 1) Define per-dataset attributes (updated with corrected values and new datasets) 6 | # Descriptions: 7 | # T: Number of distinct tasks / skill types (higher is broader) 8 | # S: Scene diversity (number of unique environments) 9 | # D: Task difficulty (normalized, 0–1, higher is more challenging) 10 | # L: Task/episode length or complexity (normalized, higher = longer) 11 | # M: Number of modalities (vision, lang, proprioception, depth, audio, etc.) 
12 | # Q: List of quality/success scores (per modality or overall) 13 | # A: Average annotation or benchmark score (0–1) 14 | # R: Real-robot validation (1 = yes, 0 = sim-only) 15 | dataset_attrs = { 16 | "DROID": {"T":10, "S":5, "D":0.2, "L":1.0, "M":3, "Q":[0.9,0.8,0.85], "A":0.9, "R":1}, 17 | "Open X-Embodiment": {"T":15, "S":20, "D":0.5, "L":2.0, "M":4, "Q":[0.8,0.8,0.9,0.7], "A":0.8, "R":1}, 18 | "ALFRED": {"T":30, "S":10, "D":0.8, "L":3.0, "M":4, "Q":[0.9,0.9,0.9,0.9], "A":0.95, "R":1}, 19 | "RLBench": {"T":8, "S":6, "D":0.3, "L":1.5, "M":3, "Q":[0.7,0.8,0.7], "A":0.7, "R":0}, 20 | "TEACh": {"T":12, "S":4, "D":0.6, "L":2.5, "M":3, "Q":[0.8,0.85,0.8], "A":0.85, "R":0}, 21 | "DialFRED": {"T":25, "S":10, "D":0.75, "L":3.0, "M":4, "Q":[0.85,0.9,0.9,0.85], "A":0.9, "R":1}, 22 | "EmbodiedQA": {"T":5, "S":2, "D":0.1, "L":1.0, "M":2, "Q":[0.7,0.7], "A":0.6, "R":0}, 23 | "R2R": {"T":6, "S":3, "D":0.2, "L":1.2, "M":2, "Q":[0.8,0.75], "A":0.7, "R":0}, 24 | "Ego4D": {"T":20, "S":0, "D":0.4, "L":1.0, "M":3, "Q":[0.9,0.9,0.8], "A":0.9, "R":0}, 25 | "CVDN": {"T":15, "S":5, "D":0.5, "L":2.0, "M":3, "Q":[0.85,0.8,0.8], "A":0.85, "R":0}, 26 | "CALVIN": {"T":35, "S":15, "D":0.9, "L":3.5, "M":4, "Q":[0.9,0.85,0.9,0.9], "A":0.9, "R":1}, 27 | "RoboSpatial": {"T":4, "S":1, "D":0.1, "L":0.5, "M":2, "Q":[0.6,0.65], "A":0.5, "R":0}, 28 | "CoVLA": {"T":18, "S":8, "D":0.7, "L":2.5, "M":4, "Q":[0.85,0.8,0.85,0.8], "A":0.9, "R":1}, 29 | "AgiBot World": {"T":30, "S":25, "D":0.95, "L":4.0, "M":3, "Q":[0.8,0.8,0.8], "A":0.8, "R":1}, 30 | "RoboData": {"T":25, "S":12, "D":0.7, "L":2.5, "M":4, "Q":[0.85,0.9,0.9,0.8], "A":0.9, "R":1}, 31 | "Interleave-VLA": {"T":18, "S":8, "D":0.6, "L":2.0, "M":4, "Q":[0.8,0.85,0.8,0.75], "A":0.85, "R":1}, 32 | "Iref-VLA": {"T":22, "S":10, "D":0.65, "L":3.0, "M":5, "Q":[0.9,0.9,0.85,0.9,0.8], "A":0.9, "R":1}, 33 | "RH20T": {"T":10, "S":4, "D":0.3, "L":1.5, "M":2, "Q":[0.75,0.8], "A":0.8, "R":0}, 34 | "Robo360": {"T":30, "S":15, "D":0.8, "L":3.5, "M":5, "Q":[0.9,0.85,0.9,0.85,0.9],"A":0.95, "R":1}, 35 | "REASSEMBLE": {"T":28, "S":12, "D":0.7, "L":3.0, "M":4, "Q":[0.8,0.8,0.85,0.8], "A":0.9, "R":1}, 36 | "RoboCerebra": {"T":12, "S":6, "D":0.4, "L":2.0, "M":3, "Q":[0.85,0.9,0.85], "A":0.9, "R":0}, 37 | "TLA": {"T":35, "S":18, "D":0.85, "L":3.8, "M":4, "Q":[0.9,0.9,0.9,0.85], "A":0.9, "R":1}, 38 | "Kaiwu": {"T":30, "S":20, "D":0.7, "L":4.0, "M":7, "Q":[0.9]*7, "A":0.9, "R":1}, # Source: arXiv:2503.05231 39 | "RefSpatial-Bench": {"T":2, "S":3, "D":1.0, "L":4.0, "M":2, "Q":[0.4696, 0.0582, 0.2287, 0.2191, 0.4577, 0.47, 0.52, 0.52, 0.2421, 0.0431, 0.0927, 0.1285, 0.1474, 0.48, 0.53, 0.54], "A":0.9, "R":1}, # Source: arXiv:2506.04308 40 | } 41 | 42 | # 2) Weights 43 | α1, α2, α3, α4 = 1.0, 1.0, 1.0, 1.0 44 | β1, β2, β3, β4 = 1.0, 1.0, 1.0, 1.0 45 | 46 | # 3) Compute raw task & modality scores 47 | c_task_raw = {} 48 | c_mod_raw = {} 49 | for name, a in dataset_attrs.items(): 50 | T, S, D, L = a["T"], a["S"], a["D"], a["L"] 51 | c_task_raw[name] = α1 * np.log1p(T) + α2 * S + α3 * D + α4 * L 52 | M = a["M"] 53 | Qm = np.mean(a["Q"]) 54 | A, R = a["A"], a["R"] 55 | c_mod_raw[name] = β1 * M + β2 * Qm + β3 * A + β4 * R 56 | 57 | # 4a) Normalize task to [1,5] 58 | def norm15(d): 59 | arr = np.array(list(d.values())) 60 | mn, mx = arr.min(), arr.max() 61 | return {k: 1 + 4*(v-mn)/(mx-mn) for k, v in d.items()} 62 | 63 | # 4b) Normalize modality to [2,5] 64 | def norm25(d): 65 | arr = np.array(list(d.values())) 66 | mn, mx = arr.min(), arr.max() 67 | return {k: 2 + 3*(v-mn)/(mx-mn) for k, v in 
d.items()} 68 | 69 | c_task = norm15(c_task_raw) 70 | c_mod = norm25(c_mod_raw) 71 | 72 | # 5) Point sizes by dataset scale 73 | raw_sizes = [ 74 | 5000, 500000, 25025, 10000, 15000, 18000, 9000, 21000, 360000, 75 | 15000, 23000, 5000, 12000, 20000, 20000, 15000, 10000, 5000, 76 | 25000, 18000, 12000, 22000, 8000, 11664 77 | ] 78 | names = list(dataset_attrs.keys()) 79 | smin, smax = min(raw_sizes), max(raw_sizes) 80 | sizes = {n: (800*(sz-smin)/(smax-smin) + 300)*6 for n, sz in zip(names, raw_sizes)} 81 | 82 | # 6) Color map 83 | colors = plt.cm.tab20(np.linspace(0, 1, len(names))) 84 | col_map = dict(zip(names, colors)) 85 | # override to distinguish R2R and Ego4D 86 | col_map["R2R"] = "tab:red" 87 | col_map["Ego4D"] = "tab:purple" 88 | 89 | # 7) Plot (no text labels on bubbles) 90 | fig, ax = plt.subplots(figsize=(12, 8)) 91 | for n in names: 92 | x, y = c_task[n], c_mod[n] 93 | ax.scatter( 94 | x, y, 95 | s=sizes[n], 96 | facecolor=col_map[n], 97 | edgecolor=col_map[n], 98 | alpha=0.7, 99 | linewidth=0.5, 100 | ) 101 | 102 | # 8) Axes settings 103 | ax.set_xlim(0.75, 5.25) 104 | ax.set_ylim(1.75, 5.25) 105 | ax.set_title("Dataset & Benchmark Landscape", fontsize=16, weight='bold') 106 | ax.set_xlabel("Task Complexity", fontsize=12) 107 | ax.set_ylabel("Modality Richness", fontsize=12) 108 | ax.set_xticks([1, 2, 3, 4, 5]) 109 | ax.set_xticklabels(["Very Low", "Low", "Medium", "High", "Very High"], rotation=25, ha="right") 110 | ax.set_yticks([2, 3, 4, 5]) 111 | ax.set_yticklabels(["Minimal", "Moderate", "Rich", "Comprehensive"]) 112 | ax.grid(True, linestyle='--', alpha=0.4) 113 | 114 | # 9) Legend 115 | handles = [ 116 | Line2D([], [], marker='o', color='w', 117 | markerfacecolor=col_map[n], markeredgecolor=col_map[n], 118 | markersize=6, label=n, linestyle='') 119 | for n in names 120 | ] 121 | ax.legend(handles=handles, 122 | loc='upper center', 123 | bbox_to_anchor=(0.5, -0.15), 124 | ncol=4, 125 | fontsize=8, 126 | frameon=True) 127 | 128 | plt.tight_layout() 129 | plt.show() 130 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Vision–Language–Action (VLA) Models in Robotics 2 | 3 | This repository was developed alongside the paper [Vision Language Action Models in Robotic Manipulation: A Systematic Review](https://muhayyuddin.github.io/VLAs/) and provides a living catalog of: 4 | 5 | - **Dataset Benchmarking Code** 6 | Code to benchmark the datasets based on the task complexity and modality richness. 7 | 8 | - **VLA Models** 9 | Key vision–language–action models that are used in the review, with links to the original papers. 10 | - **Datasets** 11 | Major benchmarks and large‑scale collections used to train and evaluate VLA systems, including QA/navigation datasets, manipulation demonstrations, and multimodal embodiment data. 12 | - **Simulators** 13 | Widely adopted simulation platforms for generating VLA data—spanning photorealistic navigation, dexterous manipulation, multi‑robot coordination, and more—each linked to its official website. 14 | 15 | We aim to keep this list up to date as new VLA models, datasets, and simulation tools emerge. Contributions and pull requests adding recently published work or tooling are most welcome! 
16 | 
17 | ---
18 | 
19 | ## Table of Contents
20 | - [Dataset Benchmarking Code](#dataset-benchmarking-code)
21 | - [VLA Models](#vla-models)
22 | - [Datasets](#datasets)
23 | - [Simulators](#simulators)
24 | - [Reference for Citation](#reference)
25 | 
26 | 
27 | 
28 | ---
29 | # Dataset Benchmarking Code
30 | Benchmarking of VLA datasets by task complexity and modality richness. Each bubble represents a VLA dataset, positioned according to its normalized task-complexity score (x-axis) and its modality-richness score (y-axis). The bubble area is proportional to dataset scale, i.e., the number of annotated episodes or interactions. The task-complexity score aggregates (log-scaled) task count, scene diversity, task difficulty, and episode length; the modality-richness score aggregates the number of modalities, mean quality score, annotation score, and real-robot validation (see `dataset_plot.py` for the exact weights and normalization).
31 | 
32 | ![Dataset Benchmarking](https://github.com/Muhayyuddin/VLAs/blob/main/benchmarkdataset.png)
33 | 
34 | [Code](https://github.com/Muhayyuddin/VLAs/blob/main/dataset_plot.py)
35 | # VLA Models
36 | 
37 | ![VLA Models Trend](https://github.com/Muhayyuddin/VLAs/blob/main/VLA.png)
38 | The top row presents major VLA models introduced each year, alongside their associated institutions.
39 | The bottom row displays key datasets used to train and evaluate VLA models, grouped by release year.
40 | The figure highlights the increasing scale and diversity of datasets and institutional involvement,
41 | with contributions from academic labs (e.g., CMU, CNRS, UC, Peking University) and industrial labs
42 | (e.g., Google, NVIDIA, Microsoft), underscoring the rapid pace of VLA research.
43 | 
44 | 
45 | Below is the list of the VLAs reviewed in the paper:
46 | 
47 | [2022][CLIPort: What and where pathways for robotic manipulation](https://proceedings.mlr.press/v164/shridhar22a/shridhar22a.pdf)
48 | [2022][RT-1: Robotics transformer for real‑world control at scale](https://arxiv.org/abs/2212.06817)
49 | [2022][A Generalist Agent](https://arxiv.org/abs/2205.06175)
50 | [2022][VIMA: General Robot Manipulation with Multimodal Prompts](https://arxiv.org/abs/2210.03094)
51 | [2022][PERCEIVER-ACTOR: A Multi-Task Transformer for Robotic Manipulation](https://peract.github.io/paper/peract_corl2022.pdf)
52 | [2022][Do As I Can, Not As I Say: Grounding Language in Robotic Affordances](https://arxiv.org/abs/2204.01691) 53 | [2023][RoboAgent: Generalist Robot Agent with Semantic and Temporal Understanding](https://arxiv.org/abs/2310.08560) 54 | [2023][Robotic Task Generalization via Hindsight Trajectory Sketches](https://arxiv.org/abs/2311.01977) 55 | [2023][Learning fine‑grained bimanual manipulation with low‑cost hardware](https://arxiv.org/abs/2304.13705) 56 | [2023][Rt-2: Vision‑language‑action models transfer web knowledge to robotic control](Link TBD) 57 | [2023][Voxposer: Composable 3D value maps for robotic manipulation with language models](https://arxiv.org/abs/2307.05973) 58 | [2024][CLIP‑RT: Learning Language‑Conditioned Robotic Policies with Natural Language Supervision](https://arxiv.org/abs/2411.00508) 59 | [2023][Diffusion Policy: Visuomotor policy learning via action diffusion](https://arxiv.org/pdf/2303.04137) 60 | [2024][Octo: An open‑source generalist robot policy](https://arxiv.org/abs/2405.12213) 61 | [2024][Towards testing and evaluating vision‑language manipulation: An empirical study](https://arxiv.org/abs/2409.12894) 62 | [2024][NaVILA: Legged robot vision‑language‑action model for navigation](https://arxiv.org/abs/2412.04453) 63 | [2024][RoboNurse‑VLA: Real‑time voice‑to‑action pipeline for surgical instrument handover](https://arxiv.org/pdf/2409.19590) 64 | [2024][Mobility VLA: Multimodal instruction navigation with topological mapping](https://arxiv.org/pdf/2407.07775) 65 | [2024][ReVLA: Domain adaptation adapters for robotic foundation models](https://arxiv.org/pdf/2409.15250.pdf) 66 | [2024][Uni‑NaVid: Video‑based VLA unifying embodied navigation tasks](https://arxiv.org/pdf/2412.06224.pdf) 67 | [2024][RDT‑1B: 1.2B‑parameter diffusion foundation model for manipulation](https://arxiv.org/pdf/2410.07864.pdf) 68 | [2024][RoboMamba: Mamba‑based unified VLA with linear‑time inference](https://arxiv.org/pdf/2406.04339.pdf) 69 | [2024][Chain‑of‑Affordance: Sequential affordance reasoning for spatial planning](https://arxiv.org/pdf/2412.20451.pdf) 70 | [2024][Edge VLA:Self-Adapting Large Visual-Language Models to Edge Devices across Visual Modalities](https://arxiv.org/pdf/2403.04908) 71 | [2024][OpenVLA: LORA‑fine‑tuned open‑source VLA with high‑success transfer](https://arxiv.org/pdf/2406.09246.pdf) 72 | [2024][CogACT: Componentized diffusion action transformer for VLA](https://arxiv.org/pdf/2411.19650.pdf) 73 | [2024][ShowUI‑2B: GUI/web navigation via screenshot grounding and token selection](https://arxiv.org/pdf/2411.17465) 74 | [2024][HiRT: Hierarchical planning/control separation for VLA](https://arxiv.org/pdf/2410.05273) 75 | [2024][Pi‑0: General robot control flow model for open‑world tasks](https://arxiv.org/pdf/2410.24164.pdf)
76 | [2024][A3VLM: Articulation‑aware affordance grounding from RGB video](https://arxiv.org/pdf/2406.07549.pdf) 77 | [2024][SVLR: Modular “segment‑to‑action” pipeline using visual prompt retrieval](https://arxiv.org/pdf/2502.01071.pdf) 78 | [2024][Bi‑VLA: Dual‑arm instruction‑to‑action planner for recipe demonstrations](https://arxiv.org/pdf/2405.06039.pdf) 79 | [2024][QUAR‑VLA: Quadruped‑specific VLA with adaptive gait mapping](https://arxiv.org/pdf/2312.14457.pdf) 80 | [2024][3D‑VLA: Integrating 3D generative diffusion heads for world reconstruction](https://arxiv.org/pdf/2403.09631) 81 | [2024][RoboMM: MIM‑based multimodal decoder unifying 3D perception and language](https://arxiv.org/pdf/2412.07215.pdf) 82 | [2025][FAST: Frequency‑space action tokenization for faster inference](https://arxiv.org/pdf/2501.09747.pdf) 83 | [2025][OpenVLA‑OFT: Optimized fine‑tuning of OpenVLA with parallel decoding](https://arxiv.org/pdf/2502.19645.pdf) 84 | [2025][CoVLA: Autonomous driving VLA trained on annotated scene data](https://arxiv.org/pdf/2408.10845.pdf) 85 | [2025][ORION: Holistic end‑to‑end driving VLA with semantic trajectory control](https://arxiv.org/pdf/2503.19755.pdf) 86 | [2025][UAV‑VLA: Zero‑shot aerial mission VLA combining satellite/UAV imagery](https://arxiv.org/pdf/2501.05014.pdf) 87 | [2025][Combat VLA: Ultra‑fast tactical reasoning in 3D environments](https://arxiv.org/pdf/2503.09527.pdf) 88 | [2025][HybridVLA: Ensemble decoding combining diffusion and autoregressive policies](https://arxiv.org/pdf/2503.10631.pdf) 89 | [2025][NORA: Low‑overhead VLA with integrated visual reasoning and FAST decoding](https://arxiv.org/pdf/2504.19854.pdf) 90 | [2025][SpatialVLA: 3D spatial encoding and adaptive action discretization](https://arxiv.org/pdf/2501.15830.pdf) 91 | [2025][MoLe‑VLA: Selective layer activation for faster inference](https://arxiv.org/pdf/2503.20384.pdf) 92 | [2025][JARVIS‑VLA: Open‑world instruction following in 3D games with keyboard/mouse](https://arxiv.org/pdf/2503.16365.pdf) 93 | [2025][UP‑VLA: Unified understanding and prediction model for embodied agents](https://arxiv.org/pdf/2501.18867.pdf) 94 | [2025][Shake‑VLA: Modular bimanual VLA for cocktail‑mixing tasks](https://arxiv.org/pdf/2501.06919.pdf) 95 | [2025][MORE: Scalable mixture‑of‑experts RL for VLA models](https://arxiv.org/pdf/2503.08007.pdf) 96 | [2025][DexGraspVLA: Diffusion‑based dexterous grasping framework](https://arxiv.org/pdf/2502.20900.pdf) 97 | [2025][DexVLA: Cross‑embodiment diffusion expert for rapid adaptation](https://arxiv.org/pdf/2502.05855.pdf) 98 | [2025][Humanoid‑VLA: Hierarchical full‑body humanoid control VLA](https://arxiv.org/pdf/2502.14795.pdf) 99 | [2025][ObjectVLA: End‑to‑end open‑world object manipulation](https://arxiv.org/pdf/2502.19250.pdf) 100 | [2025][Gemini Robotics: Bringing AI into the Physical World](https://arxiv.org/pdf/2503.20020.pdf) 101 | [2025][ECoT: Robotic Control via Embodied Chain‑of‑Thought Reasoning](https://arxiv.org/pdf/2407.08693.pdf) 102 | [2025][OTTER: A Vision‑Language‑Action Model with Text‑Aware Visual Feature Extraction](https://arxiv.org/pdf/2503.03734.pdf) 103 | [2025][π‑0.5: A VLA Model with Open‑World Generalization](https://arxiv.org/pdf/2504.16054.pdf) 104 | [2025][OneTwoVLA: A Unified Model with Adaptive Reasoning](https://arxiv.org/pdf/2505.11917.pdf) 105 | [2025][Helix: A Vision-Language-Action Model for Generalist Humanoid Control](https://www.figure.ai/news/helix)
106 | [2025][SmolVLA: A Vision‑Language‑Action Model for Affordable and Efficient Robotics](https://arxiv.org/pdf/2506.01844.pdf) 107 | [2025][EF‑VLA: Vision‑Language‑Action Early Fusion with Causal Transformers](https://openreview.net/pdf/32c153a3b16174884cf62b285adbfbdcc57b163e.pdf) 108 | [2025][PD‑VLA: Accelerating vision‑language‑action inference via parallel decoding](https://arxiv.org/pdf/2503.02310.pdf) 109 | [2025][LeVERB: Humanoid Whole‑Body Control via Latent Verb Generation](https://arxiv.org/pdf/2506.13751.pdf) 110 | [2025][TLA: Tactile‑Language‑Action Model for High‑Precision Contact Tasks](https://arxiv.org/pdf/2503.08548.pdf) 111 | [2025][Interleave‑VLA: Enhancing VLM‑LLM interleaved instruction processing](https://arxiv.org/pdf/2505.02152.pdf) 112 | [2025][iRe‑VLA: Iterative reinforcement and supervised fine‑tuning for robust VLA](https://arxiv.org/pdf/2501.16664.pdf) 113 | [2025][TraceVLA: Visual trace prompting for spatio‑temporal manipulation cues](https://arxiv.org/pdf/2412.10345.pdf) 114 | [2025][OpenDrive VLA: End‑to‑End Driving with Semantic Scene Alignment](https://arxiv.org/pdf/2503.23463.pdf) 115 | [2025][V‑JEPA 2: Dual‑Stream Video JEPA for Predictive Robotic Planning](https://arxiv.org/pdf/2506.09985.pdf) 116 | [2025][Knowledge Insulating VLA: Insulation Layers for Modular VLA Training](https://arxiv.org/pdf/2505.23705.pdf) 117 | [2025][GR00T N1: Diffusion Foundation Model for Humanoid Control](https://arxiv.org/pdf/2503.14734.pdf) 118 | [2025][AgiBot World Colosseo: Unified Embodied Dataset Platform](https://arxiv.org/pdf/2503.06669.pdf) 119 | [2025][Hi Robot: Hierarchical Planning and Control for Complex Environments](https://arxiv.org/pdf/2502.19417.pdf) 120 | [2025][EnerVerse: World‑Model LLM for Long‑Horizon Manipulation](https://arxiv.org/pdf/2501.01895.pdf) 121 | [2024][FLaRe: Large-Scale RL Fine-Tuning for Adaptive Robotic Policies](https://arxiv.org/pdf/2409.16578.pdf) 122 | [2025][Beyond Sight: Sensor Fusion via Language-Grounded Attention](https://arxiv.org/pdf/2501.04693.pdf) 123 | [2025][GeoManip: Geometric Constraint Encoding for Robust Manipulation](https://arxiv.org/pdf/2501.09783.pdf) 124 | [2025][Universal Actions: Standardizing Action Dictionaries for Transfer](https://arxiv.org/pdf/2501.10105.pdf) 125 | [2025][RoboHorizon: Multi-View Environment Modeling with LLM Planning](https://arxiv.org/pdf/2501.06605.pdf) 126 | [2025][SAM2Act: Segmentation‑Augmented Memory for Object‑Centric Manipulation](https://arxiv.org/pdf/2501.18564.pdf) 127 | [2025][VLA‑Cache: Token Caching for Efficient VLA Inference](https://arxiv.org/pdf/2502.02175.pdf) 128 | [2025][Forethought VLA: Latent Alignment for Foresight‑Driven Policies](https://arxiv.org/pdf/2502.01828.pdf) 129 | [2024][GRAPE: Preference‑Guided Policy Adaptation via Feedback](https://arxiv.org/pdf/2409.16578.pdf) 130 | [2025][HAMSTER: Hierarchical Skill Decomposition for Multi‑Step Manipulation](https://arxiv.org/pdf/2502.05485.pdf) 131 | [2025][TempoRep VLA: Successor Representation for Temporal Planning](https://arxiv.org/pdf/2507.10672v1) 132 | [2025][ConRFT: Consistency Regularized Fine‑Tuning with Reinforcement](https://arxiv.org/pdf/2502.05450.pdf) 133 | [2025][RoboBERT: Unified Multimodal Transformer for Manipulation](https://arxiv.org/pdf/2502.07837.pdf) 134 | [2024][Diffusion Transformer Policy: Robust Multimodal Action Sampling](https://arxiv.org/pdf/2410.15959.pdf) 135 | [2025][GEVRM: Generative Video Modeling for Goal‑Oriented Planning](https://arxiv.org/pdf/2502.09268.pdf) 136 | 
[2025][SoFar: Successor‑Feature Orientation Representations](https://arxiv.org/pdf/2502.13143.pdf) 137 | [2025][ARM4R: Auto‑Regressive 4D Transition Modeling for Trajectories](https://arxiv.org/pdf/2502.13142.pdf) 138 | [2025][Magma: Foundation Multimodal Agent Model for Control](https://arxiv.org/pdf/2502.13130.pdf) 139 | [2025][An Atomic Skill Library: Modular Skill Composition for Robotics](https://arxiv.org/pdf/2501.15068.pdf) 140 | [2025][RoboBrain: Knowledge‑Grounded Policy Brain for Multimodal Tasks](https://arxiv.org/pdf/2502.21257.pdf) 141 | [2025][SafeVLA: Safety‑Aware Vision‑Language‑Action Policies](https://arxiv.org/pdf/2503.03480.pdf) 142 | [2025][CognitiveDrone: Embodied Reasoning VLA for UAV Planning](https://arxiv.org/pdf/2503.01378.pdf) 143 | [2025][VLAS: Voice‑Driven Vision‑Language‑Action Control](https://arxiv.org/pdf/2502.13508.pdf) 144 | [2025][ChatVLA: Conversational VLA for Interactive Control](https://arxiv.org/pdf/2502.14420.pdf) 145 | [2024][Diffusion‑VLA: Diffusion‑Based Policy for Generalizable Manipulation](https://arxiv.org/pdf/2412.03293.pdf) 146 | [2025][RoboRefer: Towards Spatial Referring with Reasoning in Vision-Language Models for Robotics](https://arxiv.org/pdf/2506.04308.pdf) 147 | [2025][Cross-Platform Scaling of Vision-Language-Action Models from Edge to Cloud GPUs ](https://arxiv.org/pdf/2509.11480) 148 | [2025][VOTE: Vision-Language-Action Optimization with Trajectory Ensemble Voting](https://arxiv.org/pdf/2507.05116) 149 | 150 | # Datasets 151 | [2018][EmbodiedQA: Embodied Question Answering](https://openaccess.thecvf.com/content_cvpr_2018/papers_backup/Das_Embodied_Question_Answering_CVPR_2018_paper.pdf) 152 | [2018][R2R: Vision‑and‑Language Navigation: Interpreting Visually‑Grounded Navigation Instructions in Real Environments](https://openaccess.thecvf.com/content_cvpr_2018/papers/Anderson_Vision-and-Language_Navigation_Interpreting_CVPR_2018_paper.pdf) 153 | [2020][ALFRED](https://arxiv.org/abs/1912.01734) 154 | [2020][RLBench: The Robot Learning Benchmark & Learning Environment](https://arxiv.org/pdf/1909.12271.pdf) 155 | [2019][Vision‑and‑Dialog Navigation](https://arxiv.org/abs/1907.04957) 156 | [2021][TEACh: Task‑driven Embodied Agents that Chat](https://arxiv.org/abs/2110.00534) 157 | [2022][DialFRED: Dialogue‑Enabled Agents for Embodied Instruction Following](https://arxiv.org/pdf/2202.13330.pdf) 158 | [2022][Ego4D: Around the World in 3,000 Hours of Egocentric Video](https://arxiv.org/abs/2110.07058) 159 | [2022][CALVIN: A Benchmark for Language‑Conditioned Long‑Horizon Robot Manipulation Tasks](https://arxiv.org/abs/2112.03227) 160 | [2024][DROID: A Large‑Scale In‑The‑Wild Robot Manipulation Dataset](https://droid-dataset.github.io/) 161 | [2025][Open X-Embodiment: Robotic Learning Datasets and RT‑X Models](https://arxiv.org/abs/2310.08864) 162 | [2025][RoboSpatial: Teaching Spatial Understanding via Vision‑Language Models for Robotics](https://arxiv.org/abs/2411.16537) 163 | [2024][CoVLA: Comprehensive Vision‑Language‑Action Dataset for Autonomous Driving](https://arxiv.org/abs/2408.10845) 164 | [2025][TLA: Tactile‑Language‑Action Model for Contact‑Rich Manipulation](https://arxiv.org/abs/2503.08548) 165 | [2023][BridgeData V2: A Dataset for Robot Learning at Scale](https://proceedings.mlr.press/v229/walke23a/walke23a.pdf) 166 | [2023][LIBERO: Benchmarking Knowledge Transfer for Lifelong Robot Learning](https://proceedings.neurips.cc/paper_files/paper/2023/file/8c3c666820ea055a77726d66fc7d447f-Paper-Datasets_and_Benchmarks.pdf) 167 
| [2025][Kaiwu: A Multimodal Manipulation Dataset and Framework for Robotic Perception and Interaction](https://arxiv.org/abs/2503.05231)
168 | [2025][PLAICraft: Large‑Scale Time‑Aligned Vision‑Speech‑Action Dataset for Embodied AI](https://arxiv.org/abs/2505.12707)
169 | [2025][AgiBot World Colosseo: A Large‑Scale Manipulation Dataset for Intelligent Embodied Systems](https://arxiv.org/abs/2503.06669)
170 | [2023][Robo360: A 3D Omnispective Multi‑Modal Robotic Manipulation Dataset](https://arxiv.org/abs/2312.06686)
171 | [2025][REASSEMBLE: A Multimodal Dataset for Contact‑Rich Robotic Assembly and Disassembly](https://arxiv.org/abs/2502.05086)
172 | [2025][RoboCerebra: A Large‑Scale Benchmark for Long‑Horizon Robotic Manipulation Evaluation](https://arxiv.org/abs/2506.06677)
173 | [2025][IRef‑VLA: A Benchmark for Interactive Referential Grounding with Imperfect Language in 3D Scenes](https://arxiv.org/abs/2503.17406)
174 | [2025][Interleave‑VLA: Enhancing Robot Manipulation with Interleaved Image‑Text Instructions](https://arxiv.org/abs/2406.07000)
175 | [2024][RoboMM: All‑in‑One Multimodal Large Model for Robotic Manipulation](https://arxiv.org/abs/2412.07215)
176 | [2024][All Robots in One: A New Standard and Unified Dataset for Versatile, General‑Purpose Embodied Agents](https://arxiv.org/abs/2408.10899)
177 | [2025][RoboRefer: Towards Spatial Referring with Reasoning in Vision-Language Models for Robotics](https://arxiv.org/pdf/2506.04308.pdf)
178 | 
179 | # Simulators
180 | [2017][AI2-THOR](https://ai2thor.allenai.org)
181 | [2019][Habitat](https://aihabitat.org)
182 | [2020][NVIDIA Isaac Sim](https://developer.nvidia.com/isaac-sim)
183 | [2004][Gazebo](http://gazebosim.org)
184 | [2016][PyBullet](https://pybullet.org)
185 | [2013][CoppeliaSim](https://www.coppeliarobotics.com)
186 | [2004][Webots](https://cyberbotics.com)
187 | [2018][Unity ML‑Agents](https://unity-technologies.github.io/ml-agents/)
188 | [2012][MuJoCo](https://mujoco.org)
189 | [2020][iGibson](https://svl.stanford.edu/igibson)
190 | [2023][UniSim](https://universal-simulator.github.io/unisim/)
191 | [2020][SAPIEN](https://sapien.ucsd.edu)
192 | 
193 | # Reference
194 | ```
195 | @article{din2025multimodal,
196 |   title={Multimodal Fusion with Vision-Language-Action Models for Robotic Manipulation: A Systematic Review},
197 |   author={Muhayy, Ud Din and Akram, Waseem and Saoud, Lyes Saad and Rosell, Jan and Hussain, Irfan},
198 |   journal={Information Fusion},
199 |   year={2025},
200 |   publisher={Elsevier}
201 | }
202 | ```
203 | 
204 | 
205 | 
--------------------------------------------------------------------------------
/Plot_script/plots/factor_analysis.svg:
--------------------------------------------------------------------------------
[Vector (SVG) version of the factor-analysis heatmap: loadings of the six model features (Fusion Depth, Vision Model Size, Language Model Size, Task Difficulty, Sensor Modalities, Dataset Size) on the three latent factors (Factor1_Architecture, Factor2_Scale, Factor3_Performance), with a "Factor Loading" colorbar; full SVG markup omitted — the underlying values are in factor_loadings.csv above.]
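The factor-analysis figure above is simply a rendering of `Plot_script/plots/factor_loadings.csv`, which is included earlier in this export. As a minimal sketch — not one of the repository's scripts — the heatmap could be rebuilt with the pandas/matplotlib versions pinned in `Plot_script/requirements.txt`; the output filename below is illustrative only:

```python
# Minimal sketch (not a repository script): rebuild a factor-loading heatmap
# similar to factor_analysis.png/.svg from the committed CSV of loadings.
import pandas as pd
import matplotlib.pyplot as plt

# Rows are model features, columns are the three latent factors.
loadings = pd.read_csv("Plot_script/plots/factor_loadings.csv", index_col=0)

fig, ax = plt.subplots(figsize=(6, 4))
im = ax.imshow(loadings.values, cmap="coolwarm", vmin=-1, vmax=1)

ax.set_xticks(range(loadings.shape[1]), loadings.columns, rotation=30, ha="right")
ax.set_yticks(range(loadings.shape[0]), loadings.index)
ax.set_xlabel("Latent Factors")
ax.set_ylabel("Model Features")

# Annotate each cell with its loading, as in the published figure.
for i in range(loadings.shape[0]):
    for j in range(loadings.shape[1]):
        ax.text(j, i, f"{loadings.iat[i, j]:.2f}", ha="center", va="center", fontsize=8)

fig.colorbar(im, ax=ax, label="Factor Loading")
fig.tight_layout()
fig.savefig("factor_analysis_rebuilt.png", dpi=300)  # illustrative output name
```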
--------------------------------------------------------------------------------
/Plot_script/plots/decoder_analysis_2panel.svg:
--------------------------------------------------------------------------------
[Vector (SVG) version of the two-panel decoder analysis: (a) Success Rate by Decoder Family (y-axis: Normalized Success) and (b) Generalization Index by Decoder Family (y-axis: Generalization Index), with decoder families diffusion, planner, autoregressive, and mlp on the x-axis; bar annotations 0.522, 0.345, 0.330, 0.284 in panel (a) and 0.271, 0.256, 0.253, 0.219 in panel (b). Full SVG markup omitted.]
--------------------------------------------------------------------------------
/Plot_script/plots/forest_plot.svg:
--------------------------------------------------------------------------------
[Vector (SVG) version of the forest plot of Standardized Coefficients (β) for predictors grouped as Architecture Design, Model Scale, Task Complexity, and Decoder Policy (including fusion depth, hierarchical, vision/language model size, C_task, C_mod, flow, and diffusion terms), with coefficient annotations -0.029, 0.223, 0.018, 0.022, -0.015, -0.006, 0.117, 0.222. Full SVG markup omitted.]
--------------------------------------------------------------------------------
/Plot_script/plots/scale_analysis_4panel.svg:
--------------------------------------------------------------------------------
[Vector (SVG) version of the scale-analysis panels, including Vision Model Scale Impact and Language Model Scale Impact (x-axis: Small/Medium/Large; y-axis: Normalized Success) and Fusion Depth Impact (x-axis: early/late/hierarch; y-axis: Normalized Success). Full SVG markup omitted.]
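The per-family averages behind `decoder_analysis_2panel` above can be recomputed directly from `Plot_script/top75.csv`, which is dumped next. A minimal sketch, assuming only the pandas version pinned in `requirements.txt` (this helper is not part of the repository):

```python
# Minimal sketch (not a repository script): average success and generalization
# per action-decoder family, i.e., the quantities shown in decoder_analysis_2panel.
import pandas as pd

df = pd.read_csv("Plot_script/top75.csv")

summary = (
    df.groupby("DecoderFamily")[["Adjusted_Success_0to1", "Generalization_Index_0to1"]]
    .mean()
    .sort_values("Adjusted_Success_0to1", ascending=False)
)
print(summary.round(3))
```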
-------------------------------------------------------------------------------- /Plot_script/top75.csv: -------------------------------------------------------------------------------- 1 | Model,End_to_End,Component_Focused,Main_Contribution,Dataset,Vision_Encoder,Language_Encoder,Action_Decoder,FusionDepth,FusionType,DecoderFamily,Domain,VisionParams,LLMParams,CTask,CMod,LogN,Success,Adjusted_Success_0to1,Success_Provenance,Difficulty_Index,Generalization_Index_0to1,flag_zero_shot,flag_cross_embodiment,flag_sim2real,GInew_0to1,CMAS_raw,E_fusion_raw,R2S_raw,GI_actual,CMAS,E_fusion,R2S,VLA_FEB_Score 2 | CLIPort,Yes,Yes,Pioneered the semantic grounding of visuomotor policies by integrating CLIP features into dense transport maps for precise pick-and-place.,Self-collected visuomotor demos,CLIP-ResNet50 + Transporter-ResNet,CLIP text encoder,LingUNet,early,additive,autoregressive,manipulation,7.0,7.0,0.7127598674935811,0.5237479914465233,3.580369559845169,0.1173249066721428,0.3525304888455273,proxy_existing,0.0,0.12,False,False,False,0.2115182933073163,0.3888888888888889,0.3733065489834531,2.0,0.2115182933073163,0.0,0.380879563654579,1.0,0.3980994642404738 3 | RT-1,Yes,Yes,Introduced a discretized action transformer for scalable multi-task kitchen manipulation.,Self-collected RT-1-Kitchen,EfficientNet CNN,Universal Sentence Encoder,Discretized action transformer head,late,additive,autoregressive,manipulation,7.0,7.5,0.7229760638914537,0.7925844786523562,4.914731676171138,0.0811252643996296,0.3151200916749392,proxy_existing,0.0,0.12,False,False,False,0.1890720550049635,0.4027777777777778,0.5730196066775404,2.0,0.1890720550049635,0.125,0.7089581308826494,1.0,0.5057575464719032 4 | Gato,Yes,Yes,"Demonstrated a unified tokenization scheme across vision, language, and control tasks, achieving zero-shot transfer across domains.",Self-collected multi-domain tasks,custom ViT,Sentence Piece tokenizer,Autoregressive Transformer,mid,additive,autoregressive,manipulation,8.0,7.5,0.8974576185571919,0.8359025511407572,3.3239110568563475,-0.1100576750589249,0.1299884343793348,proxy_existing,0.44999999999999996,0.12,True,False,True,0.4279930606276009,0.4305555555555556,0.7501871128926653,0.0,0.4279930606276009,0.375,1.0,0.0,0.4507482651569002 5 | VIMA,Yes,Yes,Handled six distinct vision-language grounding tasks via a prompt-based multimodal policy.,VIMA self-collected,Mask R-CNN,T5-base,Transformer policy head,late,additive,autoregressive,manipulation,7.0,8.0,0.7991721913128478,0.8380069757685371,6.399450830221941,0.0118810966338154,0.2435599420368611,proxy_existing,0.0,0.12,False,False,False,0.1461359652221166,0.4166666666666667,0.6697118711603943,2.0,0.1461359652221166,0.25,0.8677993201361028,1.0,0.5659838213395548 6 | PerAct,Yes,No,Uses voxel-based representation with language conditioning for high-precision manipulation; operates directly on point cloud voxels.,RLBench,Perceiver Transformer + voxel grid encoder,CLIP text encoder,Transformer voxel policy head,late,additive,autoregressive,manipulation,7.5,7.0,0.4591508171188656,0.6574843244062099,6.160410617474682,-0.0523324853515879,0.177198634161614,proxy_existing,0.0,0.12,False,False,False,0.1063191804969684,0.4027777777777778,0.3018844647939565,0.0,0.1063191804969684,0.125,0.2635509557013374,0.0,0.1237175340495764 7 | RoboAgent,Yes,No,MT-ACT: multi-task transformer policy with semantically augmented CVAE encoding and action-chunking for strong real-world generalization.,RoboSet teleop demos,Multi-view CNN encoder,Semantic transformer encoder,CVAE + 
Chunked trajectory predictor,late,additive,autoregressive,manipulation,7.5,7.5,0.8449101813042241,0.7252329205938499,4.560081470746022,0.1842666858742209,0.4217112428541688,proxy_existing,0.0,0.12,False,False,False,0.2530267457125013,0.4166666666666667,0.6127566784267418,2.0,0.2530267457125013,0.25,0.7742361938988714,1.0,0.5693157349028432 8 | RT-Trajectory,Yes,No,Conditioned policies on user-sketched trajectories to improve generalization to novel layouts and paths.,RT-1 dataset,EfficientNet-B3,,Sketch-conditioned behavioral cloning policy,late,additive,autoregressive,manipulation,7.0,7.5,0.6418766670542628,0.7605357478913903,3.8338340004908176,0.0241846148409261,0.2834099680141348,proxy_existing,0.0,0.12,False,False,False,0.1700459808084808,0.4027777777777778,0.4881701510321467,2.0,0.1700459808084808,0.125,0.5695717123898609,1.0,0.4661544232995854 9 | ACT,Yes,Yes,Applied temporal ensembling to achieve smooth bimanual manipulation with 0.1 mm precision.,self-collected demos on ALOHA,ResNet-18,none,CVAE-Transformer head,late,additive,autoregressive,manipulation,7.0,7.5,0.5831591106725926,0.7786948181947049,6.689238755674514,-0.1450580825488966,0.0813716811514936,proxy_existing,0.0,0.12,False,False,False,0.0488230086908961,0.4027777777777778,0.4541029776637802,0.0,0.0488230086908961,0.125,0.5136078732836368,0.0,0.1718577204936332 10 | RT-2,Yes,Yes,"First large VLA co-finetuned on Internet VQA and robot data, unlocking emergent multi-robot zero-shot capabilities.",Internet VQA + RT-1-Kitchen,PaLI-X/PaLM-E ViT,PaLI-X/PaLM-E text encoder,Symbol-tuning transformer,late,additive,autoregressive,manipulation,8.0,7.5,0.8284117485771623,0.7932610098464953,3.5127330152894785,0.0846149942550634,0.3693478153018754,proxy_existing,0.5,0.64,True,True,False,0.6216086891811252,0.4305555555555556,0.6571467402450207,2.0,0.6216086891811252,0.375,0.8471579549928013,1.0,0.7109416610434817 11 | VoxPoser,Yes,Yes,Achieved zero-shot constraint-aware motion planning by composing a frozen VLM and LLM without additional training.,Self-collected motion demos+RLBench,OWL-ViT,GPT-4,MPC optimizer,late,additive,autoregressive,manipulation,8.0,7.5,0.3400593849515136,0.713772575414488,4.99268770790803,0.2066548795114432,0.5469016664199399,proxy_existing,0.3,0.44,True,False,False,0.578140999851964,0.4305555555555556,0.2427250629907086,2.0,0.578140999851964,0.375,0.1663668656584663,1.0,0.5298769663776075 12 | Diffusion Policy,Yes,Yes,Introduced diffusion-based policy modeling for multimodal visuomotor action distributions.,Self-collected demos,ResNet-18,,diffusion policy network,late,diffusion,diffusion,manipulation,7.0,7.5,0.6004810936459812,0.6980436623114735,5.503005149695753,0.5318835886685056,0.7809547486005879,proxy_existing,0.0,0.12,False,False,False,0.4685728491603527,0.4027777777777778,0.4191620217574396,1.4682813405760389,0.4685728491603527,0.125,0.4562086281682243,0.7341406702880194,0.445980536904149 13 | Octo,Yes,Yes,"First generalist diffusion policy trained on 4 M+ trajectories across 22 robot platforms, demonstrating broad transfer.",Open X-Embodiment,CNN encoder,T5-base,Diffusion Transformer head,late,diffusion,diffusion,manipulation,7.5,8.0,0.3227090520727584,0.8295345763188728,4.458287302375932,0.4658772778695578,0.7504740851197964,proxy_existing,0.15,0.8,False,False,True,0.5502844510718778,0.4305555555555556,0.2676983167854407,1.6108836398969508,0.5502844510718778,0.375,0.2073916710397382,0.8054418199484754,0.4845294855150229 14 | RevLA,Yes,No,Domain adaptation adapters to improve the generalization of 
robotic foundation models across visual domains.,Open X-Embodiment (OXE),DINO-v2 + SigLIP,LLama-7B,"Llama head, outputs 7 discrete action tokens",mid,additive,mlp,manipulation,8.0,8.0,0.7481729443270889,0.728190135386235,4.388740632873995,0.0352284522771958,0.281859914872038,proxy_existing,0.0,0.4,False,False,False,0.1691159489232228,0.4444444444444444,0.544812157621861,2.0,0.1691159489232228,0.5,0.6626203521805607,1.0,0.5829340752759459 15 | RDT-1B,Yes,Yes,1.2B-parameter diffusion foundation model excelling at bimanual manipulation and zero-shot generalization.,self-collected 6K ALOHA episodes,SigLIP,T5-XXL,Diffusion Transformer + MLP decoder,late,diffusion,diffusion,manipulation,8.0,8.0,0.6720586562785535,0.7670028742560422,5.147391144169445,0.5449403929944054,0.8785663095298731,proxy_existing,0.3,0.12,True,False,False,0.7771397857179239,0.4444444444444444,0.515470921034304,1.612224604423649,0.7771397857179239,0.5,0.6144200443414478,0.8061123022118245,0.6744180330677991 16 | RoboMamba,Yes,No,Mamba-based unified VLA with linear-time inference for real-time robotic reasoning.,SAPIEN sim benchmarks + real-world demos,Mamba VLM visual backbone,Mamba VLM text backbone,MLP policy head for SE(3) pose predicting,late,additive,mlp,manipulation,9.0,9.0,0.5989190983038765,0.7379112010598219,6.041328838185546,0.0706039411423014,0.304246865943765,proxy_existing,0.0,0.24,False,False,False,0.182548119566259,0.5,0.441949111167079,2.0,0.182548119566259,1.0,0.4936421126440558,1.0,0.6690475580525787 17 | Edge VLA,Yes,Yes,"Lightweight, edge-optimized VLA for low-power real-time inference.",OXE + Bridge robotics set,SigLIP + DINOV2,Qwen2,Non-autoregressive control head,late,additive,autoregressive,manipulation,8.0,9.0,0.8094491324648676,0.802111645826587,4.244493746892606,0.275430868818484,0.5159245576157531,proxy_existing,0.0,0.24,False,False,False,0.3095547345694518,0.4722222222222222,0.649268575854298,1.873154450076402,0.3095547345694518,0.75,0.834216102739682,0.9365772250382012,0.7075870155868338 18 | OpenVLA,Yes,Yes,LORA-fine-tuned open-source VLA achieving efficient transfer and high success.,OXE + DROID robot data,DINOv2 + SigLIP,Llama 2,Llama 2 output head (predicts discretized action tokens as output),late,additive,mlp,manipulation,8.0,8.0,0.8710812584877896,0.777229394845665,3.9601407565225455,0.1550862384637974,0.3915548087374619,proxy_existing,0.15,0.24,False,False,True,0.3349328852424771,0.4444444444444444,0.677029959395865,2.0,0.3349328852424771,0.5,0.8798211074486751,1.0,0.6786884981727881 19 | CogACT,Yes,Yes,"Componentized diffusion action transformer, +59.1% success over OpenVLA with specialized adaptation.",OXE subset + real trials,DINOv2 + SigLIP,LLaMA-2,Diffusion Transformer head,late,diffusion,diffusion,manipulation,8.0,8.0,0.4683348176971607,0.6784157112355569,4.188797063286583,0.5986942506852001,0.85,proxy_existing,0.0,0.24,False,False,False,0.51,0.4444444444444444,0.3177256984443941,1.41975640993242,0.51,0.5,0.2895741376563275,0.70987820496621,0.5023630856556345 20 | Pi-0,Yes,No,"General robot control flow model for high-frequency, open-world tasks.",Extended OXE called Pi-Cross-Embodiment,PaliGemma (SigLIP),PaliGemma (Gemma-2B),diffusion-based Flow matching action expert 
head,late,diffusion,diffusion,manipulation,8.0,8.0,0.8615764604594311,0.7188576246242815,6.057246465169008,0.3782902147459149,0.6222241987802719,proxy_existing,0.5,0.12,True,True,False,0.7733345192681632,0.4444444444444444,0.6193508077980628,1.6448329206670056,0.7733345192681632,0.5,0.7850686979986953,0.8224164603335028,0.7202049194000903 21 | HiRT,Yes,Yes,"Hierarchical planning/control separation, doubling execution speed and improving dynamic task success.",Self collected Real-world data,InstructBLIP,LLaMA-2,Latent-conditioned policy head (MLP),hierarch,additive,mlp,manipulation,8.0,8.0,0.7424706261681373,0.6183283851437078,3.651017809805437,0.3866988128803661,0.7088504756510811,proxy_existing,0.0,0.32,False,False,False,0.4253102853906486,0.4444444444444444,0.4590906632951819,1.83308159228919,0.4253102853906486,0.5,0.5218013923986502,0.916540796144595,0.5909131184834735 22 | QUAR-VLA,Yes,No,"Quadruped-specific VLA with adaptive gait and body command mapping, strong sim-to-real transfer.",QUART locomotion + manipulation,EfficientNet-B3,FiLM / VLM tokenizer,Transformer decoder (discrete tokens),late,additive,autoregressive,manipulation,7.0,7.5,0.8789528921693748,0.5517622731134857,6.378896212281935,-0.0875016671504077,0.1507957390644215,proxy_existing,0.15,0.24,False,False,True,0.1904774434386529,0.4027777777777778,0.4849730457430467,0.0,0.1904774434386529,0.125,0.5643196686094694,0.0,0.2199492780120305 23 | 3D-VLA,Yes,Yes,"Integrates 3D generative diffusion heads for world reconstruction, enabling planning in RGB+D and point-cloud spaces.",3D-language-action pairs,3D-aware transformer,3D-LLM,Multi-head diffusion planner,hierarch,diffusion,diffusion,manipulation,7.5,7.5,0.4384030747261169,0.7369067048032512,6.391823052221341,0.4038028464384491,0.7287100998409468,proxy_existing,0.0,0.32,False,False,False,0.4372260599045681,0.4166666666666667,0.3230621651720363,1.8046185317121648,0.4372260599045681,0.25,0.2983406168153762,0.9023092658560824,0.4719689856440067 24 | FAST,Yes,Yes,Frequency-space action tokenization for up to 15 times faster inference on general robot control.,DROID,PaliGemma (SigLIP),PaliGemma (Gemma-2B),FAST token generator,late,additive,mlp,manipulation,8.0,8.0,0.830380567853223,0.6192107215333876,5.878904525335356,0.073376766305158,0.3071124326763355,proxy_existing,0.0,0.12,False,False,False,0.1842674596058013,0.4444444444444444,0.5141805505676984,2.0,0.1842674596058013,0.5,0.6123002886341077,1.0,0.5741419370599772 25 | OpenVLA-OFT,Yes,Yes,"Optimized fine-tuning of OpenVLA with parallel chunked decoding, achieving 97.1 % success on LIBERO dataset and 26 time speed-up.",LIBERO,SigLIP + DINOv2,LLaMA-27B,Llama 2 Parallel chunking head,late,additive,autoregressive,manipulation,8.0,8.0,0.4998374778745145,0.7382447093624558,3.0696131738583787,-0.1304647427412237,0.8253499999999999,paper_numeric,0.0,0.12,False,False,False,0.4952099999999999,0.4444444444444444,0.3690023735819339,0.0,0.4952099999999999,0.5,0.3738088807701791,0.0,0.3422547201925447 26 | HybridVLA,Yes,No,Adaptive ensemble decoding that combines diffusion and autoregressive policies for robust multi-task generalization.,RT-X trajectories + synthetic task fusion,CLIP ViT + DINOV2,LLaMA-2,Diffusion policy 
head,late,diffusion,diffusion,manipulation,7.0,8.0,0.6050528944718216,0.461431006893817,5.769134126612075,0.2864995796461315,0.5273634791694917,proxy_existing,0.0,0.24,False,False,False,0.316418087501695,0.4166666666666667,0.279190166320151,1.8407129246781515,0.316418087501695,0.25,0.2262699035082603,0.9203564623390758,0.4282611133372578 27 | NORA,Yes,No,Low-overhead VLA with integrated visual reasoning and FAST token decoding for real-time performance.,OXE,Qwen-2.5-VL,Qwen-2.5-VL,FAST tokenizer head,late,additive,mlp,manipulation,9.0,9.0,0.3313439391961579,0.7295266372278755,5.418017977476206,0.1491638344176242,0.3854343204214691,proxy_existing,0.0,0.12,False,False,False,0.2312605922528814,0.5,0.2417242297276107,2.0,0.2312605922528814,1.0,0.1647227471076687,1.0,0.5989958348401374 28 | SpatialVLA,Yes,No,3D spatial encoding and adaptive action discretization to improve cross-robot manipulation generality.,OXE,SigLIP,PaliGemma (Gemma-2B),Adaptive action grid head,mid,additive,autoregressive,manipulation,8.0,8.0,0.7828334685797784,0.5129886648940307,3.2933772307593845,0.0140378213852738,0.2457888018949863,proxy_existing,0.0,0.12,False,False,False,0.1474732811369918,0.4444444444444444,0.4015846958811037,2.0,0.1474732811369918,0.5,0.4273334812159282,1.0,0.51870169058823 29 | MoLe-VLA,Yes,No,Selective layer activation in a multi-stage ViT yields 5.6 time faster inference and +8% task success.,RLBench + real-world trials,"DINOv2, SigLIP",LLaMA-2,Diffusion head,late,diffusion,diffusion,manipulation,8.0,8.0,0.3381384853626232,0.7531654120903784,5.878160849259938,0.0668179041229193,0.3003341988645883,proxy_existing,0.0,0.24,False,False,False,0.1802005193187529,0.4444444444444444,0.2546742116717564,2.0,0.1802005193187529,0.5,0.1859963261661426,1.0,0.4665492113712238 30 | UP-VLA,Yes,No,"Precise 3D spatial reasoning, achieving +33 % success on the CALVIN benchmark.",CALVIN,CLIP-ViT,Phi-1.5,MLP policy head,late,additive,mlp,manipulation,7.0,7.5,0.5181920159644303,0.6334131185569574,4.196988220821242,-0.0902076214084636,0.1380567036389008,proxy_existing,0.0,0.12,False,False,False,0.0828340221833404,0.4027777777777778,0.3282296208433464,0.0,0.0828340221833404,0.125,0.3068294531109721,0.0,0.1286658688235781 31 | Shake-VLA,Yes,Yes,Modular bimanual VLA achieving 100% success on cluttered cocktail-mixing tasks.,Cocktail mixing demos,"YOLOv8, EasyOCR","GPT-4o,Whisper-1",Bimanual arm controller,late,additive,autoregressive,humanoid,7.5,7.5,0.4448401886745534,0.6276009231406356,6.762219965790784,-0.0394644513160593,0.91,paper_numeric,0.0,0.12,False,False,False,0.546,0.4166666666666667,0.2791821130622042,0.0,0.546,0.25,0.2262566740211191,0.0,0.2555641685052798 32 | DexGraspVLA,Yes,No,Diffusion-based dexterous grasping with $\geq90\%$ zero-shot success across diverse objects.,Self-collected Dexterous grasp data,DINOv2,"Qwen-VL, Qwen2.5-VL",Diffusion policy head,late,diffusion,diffusion,manipulation,8.0,9.0,0.6385237394884777,0.7751252151945127,4.630556554737348,0.352775558805779,0.846,paper_numeric,0.3,0.12,True,False,False,0.7575999999999999,0.4722222222222222,0.4949358509778113,2.0,0.7575999999999999,0.75,0.5806860640006037,1.0,0.772071516000151 33 | DexVLA,Yes,No,Cross-embodiment diffusion expert enabling rapid adaptation without per-task tuning.,"OXE, RLBench","Qwen2-VL (ViT), ResNet-50","Qwen2-VL,DistilBERT",Diffusion Transformer 
head,late,diffusion,diffusion,manipulation,7.0,9.0,0.5241957388462013,0.7614303867871333,4.111684899578842,0.4168179284148171,0.6620405334163396,proxy_existing,0.2,0.24,False,True,False,0.5472243200498037,0.4444444444444444,0.3991385641818301,1.5883206750105938,0.5472243200498037,0.5,0.4233150990810048,0.7941603375052969,0.5661749391590263 34 | Humanoid-VLA,Yes,No,"Hierarchical VLA for full-body humanoid control, integrating perception and latent action planning.",Self-collected humanoid robot episodes,"Video Visual Encoder,Cross-Attention",Llama3-70B,Token-based Motion Decoder + RL Whole-Body Ctrlr,hierarch,additive,mlp,humanoid,7.5,8.0,0.7293618801983466,0.5517167898798925,4.90020348437308,0.3507757032082798,0.6671398782416281,proxy_existing,0.0,0.32,False,False,False,0.4002839269449769,0.4305555555555556,0.4024011952037944,1.901898769326965,0.4002839269449769,0.375,0.4286747852399314,0.9509493846634828,0.5387270242120977 35 | Gemini Robotics,Yes,Yes,"General-purpose VLA built on the Gemini 2.0 foundation, enabling long-horizon dexterous manipulation across diverse robot embodiments with zero-shot adaptability.",Self-collected ALOHA2 demos + web-scale VL Dataset,Gemini 2.0 vision component,Gemini 2.0 language component,Local zero-shot policy head,late,additive,mlp,manipulation,9.0,9.0,0.5024443708913685,0.4136220004578811,5.073136838466522,0.1122884523933304,0.4453939820120637,proxy_existing,0.3,0.8400000000000001,True,False,False,0.5172363892072382,0.5,0.2078220458068894,2.0,0.5172363892072382,1.0,0.109029944368401,1.0,0.6565665833939098 36 | ECoT,Yes,Yes,"Embodied chain-of-thought planning for interpretable, stepwise VLA control.",Bridge v2,"SigLIP, DINOv2",LLaMA-2 7B,Autoregressive VLA decoder with CoT module,hierarch,additive,autoregressive,manipulation,8.0,8.0,0.6121139121150764,0.4207910090872726,5.305822709931176,-0.163697706863153,0.0697808666961356,proxy_existing,0.0,0.32,False,False,False,0.0418685200176813,0.4444444444444444,0.2575720307552611,0.0,0.0418685200176813,0.5,0.1907567176196579,0.0,0.1831563094093348 37 | OTTER,Yes,Yes,Zero-shot generalization via a frozen CLIP backbone and causal transformer action decoding.,LIBERO,Frozen CLIP ViT,CLIP text encoder,Causal transformer delta-trajectory head,early,additive,autoregressive,manipulation,7.0,7.0,0.3873813757354211,0.4030068262562456,3.787310338808125,0.0114451996703417,0.2688504680641592,proxy_existing,0.3,0.12,True,False,False,0.4113102808384955,0.3888888888888889,0.1561173387859102,2.0,0.4113102808384955,0.0,0.0240920520029775,1.0,0.3588505832103682 38 | OneTwoVLA,Yes,Yes,Unified reasoning-acting framework that dynamically toggles between planning and control via decision tokens.,Self-collected 16K reasoning-augmented robot episodes,same as pi-0 vla,same as pi-0 vla,Diffusion policy head,late,diffusion,diffusion,manipulation,7.5,7.5,0.6857394471873495,0.4018821796952228,6.351023520346339,-0.1212285745536378,0.1190921010592182,proxy_existing,0.0,0.32,False,False,False,0.0714552606355309,0.4166666666666667,0.2755864637386491,0.0,0.0714552606355309,0.25,0.2203499221444665,0.0,0.1354512956949993 39 | Helix,Yes,Yes,"First 200 Hz VLA for full humanoid control on embedded systems, enabling zero-shot task transfer.",self-collected 200Hz teleop + sim logs,Pretrained VLM,Pretrained VLM,Fast transformer 
policy,hierarch,additive,autoregressive,humanoid,7.5,7.5,0.4485294028643533,0.8468204737367604,4.251979757421267,0.0445625847673235,0.3409583152292508,proxy_existing,0.44999999999999996,0.44,True,False,True,0.5545749891375504,0.4166666666666667,0.3798238814184579,2.0,0.5545749891375504,0.25,0.3915859096096872,1.0,0.5490402246868094 40 | Gemini Robotics On-Device,Yes,Yes,"On-device optimized variant of Gemini VLA, delivering low-latency dual-arm and humanoid control on embedded hardware.",Self-collected ALOHA2 + few-shot adaptation demos,Gemini SDK vision module,Gemini SDK language module,On-device optimized policy head,hierarch,additive,mlp,manipulation,9.0,9.0,0.6737207967711781,0.7512978084654224,4.58960465341014,0.0578192683198506,0.3269859209233182,proxy_existing,0.0,0.44,False,False,False,0.1961915525539909,0.5,0.5061649581317643,2.0,0.1961915525539909,1.0,0.5991326764998347,1.0,0.6988310572634564 41 | OE-VLA,Yes,Yes,Curriculum-tuned LLaVA backbone with interleaved multimodal prompting for improved generalization across vision-language-action tasks.,CALVIN,SigLIP-400M ViT,Qwen-1.5 language module,MLP token generator,mid,additive,mlp,manipulation,8.0,9.0,0.4023315440944844,0.5080506459769691,5.5938358251813005,-0.0791967599127408,0.149435840997341,proxy_existing,0.0,0.12,False,False,False,0.0896615045984045,0.4722222222222222,0.2044048008741142,0.0,0.0896615045984045,0.75,0.1034162662526238,0.0,0.2357694427127571 42 | SmolVLA,Yes,Yes,"Ultra-lightweight VLA trained on community-contributed robot demonstrations, capable of real-time inference on CPU.",22.9K community episodes,SigLIP (VLM-2) visual backbone,SmolVLM2 text backbone,Chunked flow-matching head,late,flow,flow,manipulation,8.0,7.5,0.4471432034565596,0.563694615968473,5.88563264010242,0.159064194319152,0.3956658136848474,proxy_existing,0.0,0.12,False,False,False,0.2373994882109084,0.4305555555555556,0.2520522163553582,2.0,0.2373994882109084,0.375,0.1816890441255245,1.0,0.4485221330841082 43 | EF-VLA,Yes,Yes,"Early fusion of fine-grained CLIP visual tokens into the language-action pipeline, boosting zero-shot generalization.",Self-collected real and simulated tasks,Frozen CLIP ViT,Frozen CLIP text encoder,causal transformer,early,additive,autoregressive,manipulation,7.0,7.0,0.5388591101212976,0.4685851285083851,3.5869546434921373,0.0809525659268737,0.348288376475499,proxy_existing,0.3,0.24,True,False,False,0.4589730258852994,0.3888888888888889,0.2525013653641023,2.0,0.4589730258852994,0.0,0.1824268835285263,1.0,0.4103499773534564 44 | PD-VLA,Yes,No,"First parallel decoding method with action chunking for VLA, achieving a 2.52 times speed-up without sacrificing control fidelity.",Chunked trajectory demonstrations,CLIP-ViT-Large-Patch14-336 (LLaVA),Vicuna-7B-v1.5 (LLaVA),Fixed-point token predictor,late,additive,mlp,manipulation,7.0,8.0,0.7531608497915556,0.5928706949108107,5.624121359444812,0.0230567439223281,0.2551093767123281,proxy_existing,0.0,0.12,False,False,False,0.1530656260273968,0.4166666666666667,0.4465269963955363,2.0,0.1530656260273968,0.25,0.5011624322667585,1.0,0.4760570145735388 45 | LeVERB,Yes,Yes,"Dual-process latent VLA for whole-body humanoid control, achieving 58.5 % success on sim-to-real humanoid demos.",sim-to-real humanoid demos,SigLIP ViT,SigLIP text encoder,Latent CVAE verb + transformer 
policy,hierarch,additive,autoregressive,humanoid,8.0,8.0,0.4677651620303111,0.8752518873022228,5.220993618710439,0.0662573231807932,0.593775,paper_numeric,0.15,0.32,False,False,True,0.456265,0.4444444444444444,0.4094123408812598,2.0,0.456265,0.5,0.4401923427549412,1.0,0.5991143356887353 46 | TLA,Yes,Yes,"First language-grounded tactile-action model for high-precision contact tasks, with 85 % success on peg-in-hole task.",TLA Data,ViT (Qwen2-VL),Qwen2-VL,Multimodal $,late,additive,autoregressive,manipulation,8.0,9.0,0.7669484351875,0.4149663161823027,4.457390751517612,0.03072178458818,0.7224999999999999,paper_numeric,0.0,0.12,False,False,False,0.4334999999999999,0.4722222222222222,0.3182577668515384,2.0,0.4334999999999999,0.75,0.290448192876846,1.0,0.6184870482192115 47 | iRe-VLA,Yes,Yes,Iterative RL and supervised fine-tuning pipeline for robust control and rapid generalization across embodiments.,"Franka-Kitchen, real Panda robot demos",BLIP-2 (pre-trained VLM),BLIP-2,MLP action head after token learner,mid,additive,mlp,manipulation,8.0,8.0,0.5783937533742227,0.5723963531739436,3.638683175672992,0.1105055446043245,0.3454830423131388,proxy_existing,0.2,0.4,False,True,False,0.3572898253878833,0.4444444444444444,0.3310704751299944,2.0,0.3572898253878833,0.5,0.3114962656610799,1.0,0.5421965227622407 48 | TraceVLA,Yes,Yes,"Visual trace prompting to incorporate spatio-temporal cues, boosting task success by 3.5 time over OpenVLA.",OXE + 150K trace-annotated demos,Phi-3-Vision with trace overlay,Phi-3 LLM,Quantized delta-motion tokens,late,additive,mlp,manipulation,7.5,7.5,0.8747681572690535,0.5723812407690863,3.399380958916897,-0.0145488699954366,0.2162459829274784,proxy_existing,0.0,0.24,False,False,False,0.129747589756487,0.4166666666666667,0.500700883242948,0.0,0.129747589756487,0.25,0.5901565690712678,0.0,0.2424760397069387 49 | V-JEPA 2,Yes,No,Dual-stream self-supervised video JEPA enabling predictive planning in vision-language-action tasks.,Droid video data,ViT (self-supervised),LLM for QA/alignment,Action-conditioned transformer predictor head,late,additive,autoregressive,manipulation,8.0,7.5,0.7273879472297227,0.78388197737624,4.94189810601112,0.023234551454188,0.2868293416385221,proxy_existing,0.0,0.32,False,False,False,0.1720976049831132,0.4305555555555556,0.5701863023940791,2.0,0.1720976049831132,0.375,0.7043037210980725,1.0,0.5628503315202964 50 | Knowledge Insulating VLA,Yes,No,"Implements insulation layers between vision-language and action modules, accelerating training and inference while maintaining generalization.",Multi-domain VL datasets,PaliGemma (SigLIP),PaliGemma (Gemma-2B) encoder,Diffusion Modular policy head,late,diffusion,diffusion,manipulation,8.0,8.0,0.8562742460021622,0.701619200843473,4.50872085098678,0.1539494004845021,0.3903799473948033,proxy_existing,0.0,0.12,False,False,False,0.2342279684368819,0.4444444444444444,0.6007784521828844,2.0,0.2342279684368819,0.5,0.7545589662333779,1.0,0.622196733667565 51 | GR00T N1,Yes,No,Self-collected Diffusion-based foundation model enabling unified humanoid control with policy tokenization.,Multi-modal humanoid demonstrations,SigLIP-2 ViT (Eagle-2 VLM),SmolLM2 (Eagle-2 VLM),Generative diffusion transformer based 
planner,hierarch,diffusion,diffusion,humanoid,8.0,9.0,0.8496329592839016,0.7549440183147114,4.811012121791057,0.4343490004782238,0.7641774729383877,proxy_existing,0.0,0.32,False,False,False,0.4585064837630326,0.4722222222222222,0.6414253203744082,1.759362798341929,0.4585064837630326,0.75,0.8213315970896437,0.8796813991709646,0.7273798700059102 52 | AgiBot World Colosseo,Yes,No,Integrates multiple embodied datasets into a unified platform for scalable training and evaluation of VLA models.,AgiBot World Data,PaliGemma (SigLIP),PaliGemma (Gemma-2B),Latent action planner + policy head,hierarch,additive,mlp,manipulation,8.0,8.0,0.7626189584378686,0.6755677872681651,6.665700127592268,0.0166972498376189,0.2792388307289179,proxy_existing,0.0,0.32,False,False,False,0.1675432984373507,0.4444444444444444,0.5152008022806237,2.0,0.1675432984373507,0.5,0.6139763068376907,1.0,0.5703799013187603 53 | Hi Robot,Yes,No,Hierarchical separation of planning and control for open-ended instruction following in complex environments.,Self-collected Instruction-following data,PaliGemma-3B (SigLIP),PaliGemma-3B (Gemma-2B),Flow-Matching Action Expert,hierarch,flow,flow,manipulation,8.0,8.0,0.58582712374957,0.6379285578026657,4.897091309085307,0.2584483445529367,0.595117218901334,proxy_existing,0.0,0.32,False,False,False,0.3570703313408003,0.4444444444444444,0.3737158521752469,2.0,0.3570703313408003,0.5,0.3815519463534172,1.0,0.5596555694235544 54 | EnerVerse,Yes,No,"World-model LLM for predictive future-space modeling, enabling long-horizon manipulation planning.",self-collected Synthetic task fusion data,Pretrained VAE + Diffusion Generator,Tokenized instruction prompt,Diffusion Policy Head,late,diffusion,diffusion,manipulation,7.5,7.5,0.5328958801276293,0.6145952370943397,5.688954969783967,0.177573106395872,0.4660330182882503,proxy_existing,0.0,0.32,False,False,False,0.2796198109729501,0.4166666666666667,0.3275152697936372,2.0,0.2796198109729501,0.25,0.3056559531325908,1.0,0.4588189410263852 55 | FLaRe,Yes,No,"Large-scale RL fine-tuning framework generating robust, adaptive robot policies across domains.",Multi-domain RL demonstrations,DinoV2,Transformer policy (language tokens),RL policy head,late,additive,mlp,manipulation,8.0,7.5,0.3776007567066373,0.880133513833798,6.637642281000272,-0.0427204702940124,0.1871321384268374,proxy_existing,0.0,0.12,False,False,False,0.1122792830561024,0.4305555555555556,0.3323390808265137,0.0,0.1122792830561024,0.375,0.3135802672987143,0.0,0.2002148875887041 56 | Beyond Sight,Yes,No,Fuses heterogeneous sensor modalities via language-grounded attention to improve VLA generalization.,self-collected Multi-sensor data,Multi-modal ViT,"Transformer (shared, task language input)",Transformer action head,late,additive,autoregressive,manipulation,8.0,7.5,0.5914017715629405,0.5134154999230425,4.789740088954402,-0.223796240162281,0.0,proxy_existing,0.0,0.12,False,False,False,0.0,0.4305555555555556,0.30363483620236,0.0,0.0,0.375,0.2664263778215356,0.0,0.1603565944553839 57 | GeoManip,Yes,No,"Encodes geometric constraints as model interfaces, enhancing robustness and precision in manipulation.",Self-collected Simulated geometry tasks,VLM (GPT-4o) + Grounding-DINO,GPT-4o,Constraint solver 
head,late,additive,autoregressive,manipulation,8.0,7.5,0.8592215756878294,0.7057043991085642,5.49446633883572,-0.0042860136644593,0.226852096892811,proxy_existing,0.0,0.12,False,False,False,0.1361112581356865,0.4305555555555556,0.6063564457718934,0.0,0.1361112581356865,0.375,0.7637222135734322,0.0,0.3187083679272797 58 | Universal Actions,Yes,No,Defines a universal action dictionary to standardize policy transfer and improve cross-task adaptability.,Self-collected Cross-domain manipulation demos,Shared VLM (LLaVA-OneVion-0.5B),LLaVA,Unified action tokenizer head,mid,additive,mlp,manipulation,7.5,7.5,0.5080775514611926,0.7479414564288672,6.123886852408592,0.049215753645793,0.2821433163291065,proxy_existing,0.15,0.12,False,False,True,0.2692859897974639,0.4166666666666667,0.3800122638186971,2.0,0.2692859897974639,0.25,0.3918953747426889,1.0,0.4777953411350382 59 | RoboHorizon,Yes,No,LLM-enhanced multi-view environment modeling for robust long-horizon task planning.,Self-collected Multi-view robot trajectories,Multi-view transformer (ViT),GPT‐based planner,DreamerV2 Actor-Critic RL Head,late,additive,autoregressive,manipulation,8.0,7.5,0.358775347039228,0.844277061541385,3.855277454890607,-0.0192404348836518,0.2375113101183754,proxy_existing,0.0,0.32,False,False,False,0.1425067860710252,0.4305555555555556,0.30290579575177,0.0,0.1425067860710252,0.375,0.2652287468341456,0.0,0.1956838832262927 60 | SAM2Act,Yes,No,Utilizes SAM-based segmentation prompts with memory-augmented VLA for improved object-centric manipulation.,SAM-labeled manipulation tasks,SAM2 segmentation encoder,CLIP text encoder,Memory-augmented policy head,late,additive,mlp,manipulation,7.5,7.0,0.3454384014982286,0.4094844458593771,6.974959026767888,-0.0011549529406109,0.2300878809473218,proxy_existing,0.0,0.12,False,False,False,0.138052728568393,0.4027777777777778,0.1414516524160511,0.0,0.138052728568393,0.125,0.0,0.0,0.0657631821420982 61 | LMM Planner Integration,Yes,No,Merges LMM-based strategic planning with 3D skill policies for generalizable manipulation.,skill library demos,DINO (2D semantics) + PointNext (3D),CLIP Language Encoder,3D Transformer head,late,additive,autoregressive,manipulation,8.0,7.0,0.4179610818764548,0.8514658040981538,6.7964056674593,-0.0368246936538394,0.2170940927615814,proxy_existing,0.0,0.32,False,False,False,0.1302564556569488,0.4166666666666667,0.3558795686616698,0.0,0.1302564556569488,0.25,0.3522513968181363,0.0,0.1831269631187712 62 | VLA-Cache,Yes,No,"Introduces token-caching to reuse computation across time steps, boosting inference efficiency.",LIBERO,CLIP ViT,LLaMA-2,Cached inference head,mid,additive,autoregressive,manipulation,7.0,8.0,0.6572118829346982,0.77285784951399,6.574836967372796,0.0654751969282156,0.2989465826815463,proxy_existing,0.0,0.12,False,False,False,0.1793679496089277,0.4166666666666667,0.507931362519951,2.0,0.1793679496089277,0.25,0.6020344367928915,1.0,0.5078505966004547 63 | HAMSTER,Yes,No,Hierarchical skill decomposition to sequence multi-step manipulation actions.,Self-collected Decomposed manipulation tasks,VILA-1.5-13B,VILA-1.5-13B,Robotic View Transformer Skill execution head,hierarch,additive,autoregressive,manipulation,7.5,7.5,0.5872725958857852,0.6122794789388659,5.155750814495258,0.274606334705625,0.5786990418678443,proxy_existing,0.0,0.32,False,False,False,0.3472194251207065,0.4166666666666667,0.3595749590040237,2.0,0.3472194251207065,0.25,0.3583219982243045,1.0,0.4888853558362528 64 | TempoRep VLA,Yes,No,Use successor representation temporal encoding for 
compositional action planning.,Self-collected Temporal demonstration sequences,ResNet-34 CNN,retrained transformer (CLIP-style),MLP (3x256) head on ResNet feature,late,additive,mlp,manipulation,7.0,7.0,0.3710916043128029,0.8319275862941342,5.733365924994738,-0.1718310310918341,0.0603372017239277,proxy_existing,0.0,0.32,False,False,False,0.0362023210343566,0.3888888888888889,0.308721342669968,0.0,0.0362023210343566,0.0,0.27478223483638,0.0,0.0777461389676841 65 | ConRFT,Yes,No,Applies consistency regularized fine-tuning with reinforcement for stable policy learning.,Self-collected data for fine-tuning,same as in octo,same as in octo,Reinforced policy head,late,additive,mlp,manipulation,7.5,7.5,0.4512695045083348,0.4605643162990613,5.266195097623432,-0.20518428655428,0.0192344601461592,proxy_existing,0.0,0.12,False,False,False,0.0115406760876955,0.4166666666666667,0.2078386308104974,0.0,0.0115406760876955,0.25,0.1090571893782367,0.0,0.092649466366483 66 | RoboBERT,Yes,No,"Unified multimodal Transformer for end-to-end vision-language-action manipulation, pre-trained on diverse robot and language data.",Self-collected Multi-domain robot demos,CLIP ViT,BERT-base,CNN-based Diffusion Policy Head,late,diffusion,diffusion,manipulation,7.0,7.5,0.73429447311975,0.5937061404903248,3.310947712946261,0.1093259225687766,0.3442639659330793,proxy_existing,0.0,0.12,False,False,False,0.2065583795598475,0.4027777777777778,0.4359551376193033,2.0,0.2065583795598475,0.125,0.4837955143481676,1.0,0.4538384734770038 67 | Diffusion Transformer Policy,Yes,No,"Adapts diffusion-based transformer architectures to VLA policy learning, enabling robust multimodal action sampling.",LIBERO + CALVIN,DINOv2,CLIP Text Encoder,Diffusion generator head,late,diffusion,diffusion,manipulation,8.0,7.0,0.586296483646221,0.5789505084739845,3.1318136039628754,0.4095749160585827,0.6545552669344674,proxy_existing,0.0,0.24,False,False,False,0.3927331601606804,0.4166666666666667,0.3394366473234888,1.598133189486742,0.3927331601606804,0.25,0.3252397925897463,0.799066594743371,0.4417598868734494 68 | GEVRM,Yes,No,Generative video modeling of goal-oriented tasks to enhance planning for visual manipulation.,CALVIN,ResNet-34,T5 Encoder,Diffusion Policy,late,diffusion,diffusion,manipulation,7.0,8.0,0.6167495034897343,0.8037992235722431,3.906151656571228,-0.0823315495698903,0.1642557342839666,proxy_existing,0.0,0.32,False,False,False,0.0985534405703799,0.4166666666666667,0.4957427720436149,0.0,0.0985534405703799,0.25,0.5820116333459017,0.0,0.2326412684790704 69 | SoFar,Yes,No,Introduces successor-feature orientation representations bridging spatial reasoning and robotic manipulation.,Self-collected Orientation task demonstrations,"Florence-2 (ViT-style), SAM",CLIP Text Encode,"VLM (e.g., LLaVA or GPT-4o) for 6D goal pose, then motion planner",hierarch,additive,planner,manipulation,8.0,7.0,0.3489170139299367,0.8896439013460229,5.111842673628743,0.0156524486050095,0.2780257040262446,proxy_existing,0.0,0.32,False,False,False,0.1668154224157467,0.4166666666666667,0.3104118935186334,2.0,0.1668154224157467,0.25,0.2775593867495308,1.0,0.4235937022913194 70 | ARM4R,Yes,No,Auto-regressive 4D transition model for predicting and planning manipulator trajectories.,76K videos from the Epic-Kitchens100 dataset,ViT-Base,CLIP text encoder,2-layer 
MLP,late,additive,mlp,manipulation,8.0,7.0,0.8529551794647776,0.6817319807119595,4.234840949051182,0.0130965168287242,0.2750579920909524,proxy_existing,0.0,0.32,False,False,False,0.1650347952545714,0.4166666666666667,0.5814868239550477,2.0,0.1650347952545714,0.25,0.7228676495935815,1.0,0.5344756112120382 71 | Magma,Yes,No,"Foundation multimodal agent model unifying vision, language, and action domains for end-to-end control.",Self-collected Multimodal interaction dataset,ConvNeXt-XXlarge,LLaMA-3-8B (decoder-only LLM),Decoder-Only LLM Head (LLaMA-3-8B),late,additive,autoregressive,manipulation,7.5,8.0,0.6613608069549721,0.6110504529653061,4.490546076315089,0.1325030261465944,0.3682162647868279,proxy_existing,0.0,0.12,False,False,False,0.2209297588720967,0.4305555555555556,0.4041248206633361,2.0,0.2209297588720967,0.375,0.4315062704604443,1.0,0.5068590073331353 72 | An Atomic Skill Library,Yes,No,"Constructs an atomic skill library for modular, data-efficient composition of robotic actions.",Self-collected Skill primitive demonstrations,"Prismatic VLM (scene description.), DINO-X (obj detection), SAM-2 (segmentation)","Prismatic, GPT-4 (for planning)",Skill executor module,late,additive,autoregressive,manipulation,8.0,7.5,0.368608015236915,0.8657175772923327,3.2336294870169024,0.1079079756737837,0.3427985935377072,proxy_existing,0.0,0.12,False,False,False,0.2056791561226243,0.4305555555555556,0.3191104379214373,2.0,0.2056791561226243,0.375,0.2918489180280164,1.0,0.4681320185376602 73 | RoboBrain,Yes,No,Knowledge-grounded policy brain that maps abstract high-level plans to concrete multimodal actions across diverse tasks.,Multi-domain robot and plan data,SigLIPr,Qwen2.5-7B-Instruct (decoder-only LLM),LoRA adapters for skill,mid,additive,autoregressive,manipulation,8.0,9.0,0.724751161058266,0.7818470241237657,5.372459221373023,-0.0810376702748838,0.1475333584455862,proxy_existing,0.0,0.24,False,False,False,0.0885200150673517,0.4722222222222222,0.5666445385036493,0.0,0.0885200150673517,0.75,0.6984854895009568,0.0,0.3842513761420771 74 | SafeVLA,Yes,No,"Safety-aware VLA integrating constraint feedback through safe RL to ensure collision-free, reliable manipulation.",Safety-scenario demonstrations,"Modular (DINOv2, SigLIP, CLIP)","LLM (model-agnostic, e.g., T5, LLaMA, Qwen)",Safety-constraint policy head,late,additive,mlp,manipulation,7.0,8.0,0.62882291172083,0.606375032211623,4.553138862416713,-0.2176644525965327,0.0063368750020631,proxy_existing,0.0,0.12,False,False,False,0.0038021250012378,0.4166666666666667,0.3813025133501249,0.0,0.0038021250012378,0.25,0.3940149317838008,0.0,0.1619542641962596 75 | Diffusion-VLA,Yes,Yes,"Multimodal VLA framework unifying vision-language reasoning with diffusion-based policy for robust, generalizable manipulation across diverse robot embodiments.",Multi-embodiment manipulation suites,SigLIP,Qwen2-VL (2B/7B/72B),Latent diffusion policy head + MLP,late,diffusion,diffusion,manipulation,8.0,9.0,0.3855532100040863,0.881294561592872,3.682033560676781,0.3702238675017123,0.6463880157586654,proxy_existing,0.2,0.52,False,True,False,0.5378328094551993,0.4722222222222222,0.3397859471812757,1.745938261950861,0.5378328094551993,0.75,0.3258136048291605,0.8729691309754305,0.6216538863149476 76 | -------------------------------------------------------------------------------- /Plot_script/plots/scale_analysis_adjusted_4panel.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 2025-11-18T00:03:19.487250 10 | 
[SVG markup omitted. Matplotlib v3.10.7 figure with three recoverable panels: "Vision Model Scale Impact" (x-axis: Vision Model Size; Small/Medium/Large), "Language Model Scale Impact" (x-axis: Language Model Size; Small/Medium/Large), and "Fusion Depth Impact" (x-axis: Fusion Depth; early/late/hierarch). Y-axis on all panels: Adjusted Success (0-1).]
--------------------------------------------------------------------------------
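Note on working with `Plot_script/top75.csv`: the sketch below is not a file in the repository; it is a minimal, hypothetical example of how the table can be loaded with the packages pinned in `Plot_script/requirements.txt` and summarized by the `FusionDepth` column, roughly the quantity behind the "Fusion Depth Impact" panel of `scale_analysis_adjusted_4panel.svg`. The column names come from the CSV header; the output path `fusion_depth_impact_example.png` is an assumption made only for this illustration.

```python
# Minimal sketch (assumption: run from the repository root, so the CSV sits at
# Plot_script/top75.csv; pandas and matplotlib come from Plot_script/requirements.txt).
import pandas as pd
import matplotlib.pyplot as plt

# Load the benchmark table; column names below are taken from the CSV header.
df = pd.read_csv("Plot_script/top75.csv")

# Summarize adjusted success by fusion depth (early / mid / late / hierarch),
# the quantity shown in the "Fusion Depth Impact" panel.
summary = (
    df.groupby("FusionDepth")["Adjusted_Success_0to1"]
      .agg(["mean", "std", "count"])
      .reindex(["early", "mid", "late", "hierarch"])
)
print(summary)

# Bar chart of per-depth means with standard-deviation error bars.
summary["mean"].plot(kind="bar", yerr=summary["std"], capsize=4)
plt.ylabel("Adjusted Success (0-1)")
plt.xlabel("Fusion Depth")
plt.title("Fusion Depth Impact")
plt.tight_layout()
# Hypothetical output path, chosen only for this example.
plt.savefig("Plot_script/plots/fusion_depth_impact_example.png", dpi=200)
```

The same pattern (group by a categorical column, aggregate `Adjusted_Success_0to1` or `VLA_FEB_Score`, then plot) applies to the other categorical fields in the CSV, such as `FusionType`, `DecoderFamily`, and `Domain`.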