330 |
331 |
332 |
--------------------------------------------------------------------------------
/news_template_1.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
135 |
Join our community to get the daily report and the PDF version
136 |
137 |
138 |
139 |
--------------------------------------------------------------------------------
/papers.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "Title": "AppWorld: A Controllable World of Apps and People for Benchmarking Interactive Coding Agents",
4 | "Link": "https://huggingface.co/papers/2407.18901",
5 | "Upvotes": "3",
6 | "Date": "2024-07-29"
7 | },
8 | {
9 | "Title": "Floating No More: Object-Ground Reconstruction from a Single Image",
10 | "Link": "https://huggingface.co/papers/2407.18914",
11 | "Upvotes": "2",
12 | "Date": "2024-07-29"
13 | },
14 | {
15 | "Title": "Wolf: Captioning Everything with a World Summarization Framework",
16 | "Link": "https://huggingface.co/papers/2407.18908",
17 | "Upvotes": "1",
18 | "Date": "2024-07-29"
19 | },
20 | {
21 | "Title": "SHIC: Shape-Image Correspondences with no Keypoint Supervision",
22 | "Link": "https://huggingface.co/papers/2407.18907",
23 | "Upvotes": "1",
24 | "Date": "2024-07-29"
25 | },
26 | {
27 | "Title": "Lessons from Learning to Spin \"Pens\"",
28 | "Link": "https://huggingface.co/papers/2407.18902",
29 | "Upvotes": "-",
30 | "Date": "2024-07-29"
31 | },
32 | {
33 | "Title": "VSSD: Vision Mamba with Non-Casual State Space Duality",
34 | "Link": "https://huggingface.co/papers/2407.18559",
35 | "Upvotes": "-",
36 | "Date": "2024-07-29"
37 | },
38 | {
39 | "Title": "Diffree: Text-Guided Shape Free Object Inpainting with Diffusion Model",
40 | "Link": "https://huggingface.co/papers/2407.16982",
41 | "Upvotes": "33",
42 | "Date": "2024-07-28"
43 | },
44 | {
45 | "Title": "LAMBDA: A Large Model Based Data Agent",
46 | "Link": "https://huggingface.co/papers/2407.17535",
47 | "Upvotes": "26",
48 | "Date": "2024-07-28"
49 | },
50 | {
51 | "Title": "AMEX: Android Multi-annotation Expo Dataset for Mobile GUI Agents",
52 | "Link": "https://huggingface.co/papers/2407.17490",
53 | "Upvotes": "24",
54 | "Date": "2024-07-28"
55 | },
56 | {
57 | "Title": "Very Large-Scale Multi-Agent Simulation in AgentScope",
58 | "Link": "https://huggingface.co/papers/2407.17789",
59 | "Upvotes": "18",
60 | "Date": "2024-07-28"
61 | },
62 | {
63 | "Title": "BetterDepth: Plug-and-Play Diffusion Refiner for Zero-Shot Monocular Depth Estimation",
64 | "Link": "https://huggingface.co/papers/2407.17952",
65 | "Upvotes": "17",
66 | "Date": "2024-07-28"
67 | },
68 | {
69 | "Title": "Course-Correction: Safety Alignment Using Synthetic Preferences",
70 | "Link": "https://huggingface.co/papers/2407.16637",
71 | "Upvotes": "16",
72 | "Date": "2024-07-28"
73 | },
74 | {
75 | "Title": "Data Mixture Inference: What do BPE Tokenizers Reveal about their Training Data?",
76 | "Link": "https://huggingface.co/papers/2407.16607",
77 | "Upvotes": "15",
78 | "Date": "2024-07-28"
79 | },
80 | {
81 | "Title": "Efficient Inference of Vision Instruction-Following Models with Elastic Cache",
82 | "Link": "https://huggingface.co/papers/2407.18121",
83 | "Upvotes": "13",
84 | "Date": "2024-07-28"
85 | },
86 | {
87 | "Title": "LKCell: Efficient Cell Nuclei Instance Segmentation with Large Convolution Kernels",
88 | "Link": "https://huggingface.co/papers/2407.18054",
89 | "Upvotes": "7",
90 | "Date": "2024-07-28"
91 | },
92 | {
93 | "Title": "Dallah: A Dialect-Aware Multimodal Large Language Model for Arabic",
94 | "Link": "https://huggingface.co/papers/2407.18129",
95 | "Upvotes": "7",
96 | "Date": "2024-07-28"
97 | },
98 | {
99 | "Title": "The FIGNEWS Shared Task on News Media Narratives",
100 | "Link": "https://huggingface.co/papers/2407.18147",
101 | "Upvotes": "6",
102 | "Date": "2024-07-28"
103 | },
104 | {
105 | "Title": "Text-Driven Neural Collaborative Filtering Model for Paper Source Tracing",
106 | "Link": "https://huggingface.co/papers/2407.17722",
107 | "Upvotes": "3",
108 | "Date": "2024-07-28"
109 | },
110 | {
111 | "Title": "Diffree: Text-Guided Shape Free Object Inpainting with Diffusion Model",
112 | "Link": "https://huggingface.co/papers/2407.16982",
113 | "Upvotes": "33",
114 | "Date": "2024-07-27"
115 | },
116 | {
117 | "Title": "LAMBDA: A Large Model Based Data Agent",
118 | "Link": "https://huggingface.co/papers/2407.17535",
119 | "Upvotes": "26",
120 | "Date": "2024-07-27"
121 | },
122 | {
123 | "Title": "AMEX: Android Multi-annotation Expo Dataset for Mobile GUI Agents",
124 | "Link": "https://huggingface.co/papers/2407.17490",
125 | "Upvotes": "24",
126 | "Date": "2024-07-27"
127 | },
128 | {
129 | "Title": "Very Large-Scale Multi-Agent Simulation in AgentScope",
130 | "Link": "https://huggingface.co/papers/2407.17789",
131 | "Upvotes": "18",
132 | "Date": "2024-07-27"
133 | },
134 | {
135 | "Title": "BetterDepth: Plug-and-Play Diffusion Refiner for Zero-Shot Monocular Depth Estimation",
136 | "Link": "https://huggingface.co/papers/2407.17952",
137 | "Upvotes": "17",
138 | "Date": "2024-07-27"
139 | },
140 | {
141 | "Title": "Course-Correction: Safety Alignment Using Synthetic Preferences",
142 | "Link": "https://huggingface.co/papers/2407.16637",
143 | "Upvotes": "16",
144 | "Date": "2024-07-27"
145 | },
146 | {
147 | "Title": "Data Mixture Inference: What do BPE Tokenizers Reveal about their Training Data?",
148 | "Link": "https://huggingface.co/papers/2407.16607",
149 | "Upvotes": "15",
150 | "Date": "2024-07-27"
151 | },
152 | {
153 | "Title": "Efficient Inference of Vision Instruction-Following Models with Elastic Cache",
154 | "Link": "https://huggingface.co/papers/2407.18121",
155 | "Upvotes": "13",
156 | "Date": "2024-07-27"
157 | },
158 | {
159 | "Title": "LKCell: Efficient Cell Nuclei Instance Segmentation with Large Convolution Kernels",
160 | "Link": "https://huggingface.co/papers/2407.18054",
161 | "Upvotes": "7",
162 | "Date": "2024-07-27"
163 | },
164 | {
165 | "Title": "Dallah: A Dialect-Aware Multimodal Large Language Model for Arabic",
166 | "Link": "https://huggingface.co/papers/2407.18129",
167 | "Upvotes": "7",
168 | "Date": "2024-07-27"
169 | },
170 | {
171 | "Title": "The FIGNEWS Shared Task on News Media Narratives",
172 | "Link": "https://huggingface.co/papers/2407.18147",
173 | "Upvotes": "6",
174 | "Date": "2024-07-27"
175 | },
176 | {
177 | "Title": "Text-Driven Neural Collaborative Filtering Model for Paper Source Tracing",
178 | "Link": "https://huggingface.co/papers/2407.17722",
179 | "Upvotes": "3",
180 | "Date": "2024-07-27"
181 | },
182 | {
183 | "Title": "Diffree: Text-Guided Shape Free Object Inpainting with Diffusion Model",
184 | "Link": "https://huggingface.co/papers/2407.16982",
185 | "Upvotes": "33",
186 | "Date": "2024-07-26"
187 | },
188 | {
189 | "Title": "LAMBDA: A Large Model Based Data Agent",
190 | "Link": "https://huggingface.co/papers/2407.17535",
191 | "Upvotes": "26",
192 | "Date": "2024-07-26"
193 | },
194 | {
195 | "Title": "AMEX: Android Multi-annotation Expo Dataset for Mobile GUI Agents",
196 | "Link": "https://huggingface.co/papers/2407.17490",
197 | "Upvotes": "24",
198 | "Date": "2024-07-26"
199 | },
200 | {
201 | "Title": "Very Large-Scale Multi-Agent Simulation in AgentScope",
202 | "Link": "https://huggingface.co/papers/2407.17789",
203 | "Upvotes": "18",
204 | "Date": "2024-07-26"
205 | },
206 | {
207 | "Title": "BetterDepth: Plug-and-Play Diffusion Refiner for Zero-Shot Monocular Depth Estimation",
208 | "Link": "https://huggingface.co/papers/2407.17952",
209 | "Upvotes": "17",
210 | "Date": "2024-07-26"
211 | },
212 | {
213 | "Title": "Course-Correction: Safety Alignment Using Synthetic Preferences",
214 | "Link": "https://huggingface.co/papers/2407.16637",
215 | "Upvotes": "16",
216 | "Date": "2024-07-26"
217 | },
218 | {
219 | "Title": "Data Mixture Inference: What do BPE Tokenizers Reveal about their Training Data?",
220 | "Link": "https://huggingface.co/papers/2407.16607",
221 | "Upvotes": "15",
222 | "Date": "2024-07-26"
223 | },
224 | {
225 | "Title": "Efficient Inference of Vision Instruction-Following Models with Elastic Cache",
226 | "Link": "https://huggingface.co/papers/2407.18121",
227 | "Upvotes": "13",
228 | "Date": "2024-07-26"
229 | },
230 | {
231 | "Title": "LKCell: Efficient Cell Nuclei Instance Segmentation with Large Convolution Kernels",
232 | "Link": "https://huggingface.co/papers/2407.18054",
233 | "Upvotes": "7",
234 | "Date": "2024-07-26"
235 | },
236 | {
237 | "Title": "Dallah: A Dialect-Aware Multimodal Large Language Model for Arabic",
238 | "Link": "https://huggingface.co/papers/2407.18129",
239 | "Upvotes": "7",
240 | "Date": "2024-07-26"
241 | },
242 | {
243 | "Title": "The FIGNEWS Shared Task on News Media Narratives",
244 | "Link": "https://huggingface.co/papers/2407.18147",
245 | "Upvotes": "6",
246 | "Date": "2024-07-26"
247 | },
248 | {
249 | "Title": "Text-Driven Neural Collaborative Filtering Model for Paper Source Tracing",
250 | "Link": "https://huggingface.co/papers/2407.17722",
251 | "Upvotes": "3",
252 | "Date": "2024-07-26"
253 | },
254 | {
255 | "Title": "OpenDevin: An Open Platform for AI Software Developers as Generalist Agents",
256 | "Link": "https://huggingface.co/papers/2407.16741",
257 | "Upvotes": "56",
258 | "Date": "2024-07-25"
259 | },
260 | {
261 | "Title": "$VILA^2$: VILA Augmented VILA",
262 | "Link": "https://huggingface.co/papers/2407.17453",
263 | "Upvotes": "33",
264 | "Date": "2024-07-25"
265 | },
266 | {
267 | "Title": "HumanVid: Demystifying Training Data for Camera-controllable Human Image Animation",
268 | "Link": "https://huggingface.co/papers/2407.17438",
269 | "Upvotes": "19",
270 | "Date": "2024-07-25"
271 | },
272 | {
273 | "Title": "DDK: Distilling Domain Knowledge for Efficient Large Language Models",
274 | "Link": "https://huggingface.co/papers/2407.16154",
275 | "Upvotes": "16",
276 | "Date": "2024-07-25"
277 | },
278 | {
279 | "Title": "PERSONA: A Reproducible Testbed for Pluralistic Alignment",
280 | "Link": "https://huggingface.co/papers/2407.17387",
281 | "Upvotes": "15",
282 | "Date": "2024-07-25"
283 | },
284 | {
285 | "Title": "Longhorn: State Space Models are Amortized Online Learners",
286 | "Link": "https://huggingface.co/papers/2407.14207",
287 | "Upvotes": "14",
288 | "Date": "2024-07-25"
289 | },
290 | {
291 | "Title": "SV4D: Dynamic 3D Content Generation with Multi-Frame and Multi-View Consistency",
292 | "Link": "https://huggingface.co/papers/2407.17470",
293 | "Upvotes": "12",
294 | "Date": "2024-07-25"
295 | },
296 | {
297 | "Title": "Learning to Manipulate Anywhere: A Visual Generalizable Framework For Reinforcement Learning",
298 | "Link": "https://huggingface.co/papers/2407.15815",
299 | "Upvotes": "10",
300 | "Date": "2024-07-25"
301 | },
302 | {
303 | "Title": "ViPer: Visual Personalization of Generative Models via Individual Preference Learning",
304 | "Link": "https://huggingface.co/papers/2407.17365",
305 | "Upvotes": "10",
306 | "Date": "2024-07-25"
307 | },
308 | {
309 | "Title": "MOMAland: A Set of Benchmarks for Multi-Objective Multi-Agent Reinforcement Learning",
310 | "Link": "https://huggingface.co/papers/2407.16312",
311 | "Upvotes": "9",
312 | "Date": "2024-07-25"
313 | },
314 | {
315 | "Title": "Scalify: scale propagation for efficient low-precision LLM training",
316 | "Link": "https://huggingface.co/papers/2407.17353",
317 | "Upvotes": "9",
318 | "Date": "2024-07-25"
319 | },
320 | {
321 | "Title": "DistilDIRE: A Small, Fast, Cheap and Lightweight Diffusion Synthesized Deepfake Detection",
322 | "Link": "https://huggingface.co/papers/2406.00856",
323 | "Upvotes": "8",
324 | "Date": "2024-07-25"
325 | },
326 | {
327 | "Title": "DreamCar: Leveraging Car-specific Prior for in-the-wild 3D Car Reconstruction",
328 | "Link": "https://huggingface.co/papers/2407.16988",
329 | "Upvotes": "6",
330 | "Date": "2024-07-25"
331 | },
332 | {
333 | "Title": "CoD, Towards an Interpretable Medical Agent using Chain of Diagnosis",
334 | "Link": "https://huggingface.co/papers/2407.13301",
335 | "Upvotes": "52",
336 | "Date": "2024-07-24"
337 | },
338 | {
339 | "Title": "KAN or MLP: A Fairer Comparison",
340 | "Link": "https://huggingface.co/papers/2407.16674",
341 | "Upvotes": "34",
342 | "Date": "2024-07-24"
343 | },
344 | {
345 | "Title": "MovieDreamer: Hierarchical Generation for Coherent Long Visual Sequence",
346 | "Link": "https://huggingface.co/papers/2407.16655",
347 | "Upvotes": "25",
348 | "Date": "2024-07-24"
349 | },
350 | {
351 | "Title": "T2V-CompBench: A Comprehensive Benchmark for Compositional Text-to-video Generation",
352 | "Link": "https://huggingface.co/papers/2407.14505",
353 | "Upvotes": "21",
354 | "Date": "2024-07-24"
355 | },
356 | {
357 | "Title": "OutfitAnyone: Ultra-high Quality Virtual Try-On for Any Clothing and Any Person",
358 | "Link": "https://huggingface.co/papers/2407.16224",
359 | "Upvotes": "20",
360 | "Date": "2024-07-24"
361 | },
362 | {
363 | "Title": "INF-LLaVA: Dual-perspective Perception for High-Resolution Multimodal Large Language Model",
364 | "Link": "https://huggingface.co/papers/2407.16198",
365 | "Upvotes": "12",
366 | "Date": "2024-07-24"
367 | },
368 | {
369 | "Title": "F-HOI: Toward Fine-grained Semantic-Aligned 3D Human-Object Interactions",
370 | "Link": "https://huggingface.co/papers/2407.12435",
371 | "Upvotes": "10",
372 | "Date": "2024-07-24"
373 | },
374 | {
375 | "Title": "A Simulation Benchmark for Autonomous Racing with Large-Scale Human Data",
376 | "Link": "https://huggingface.co/papers/2407.16680",
377 | "Upvotes": "9",
378 | "Date": "2024-07-24"
379 | },
380 | {
381 | "Title": "SIGMA: Sinkhorn-Guided Masked Video Modeling",
382 | "Link": "https://huggingface.co/papers/2407.15447",
383 | "Upvotes": "5",
384 | "Date": "2024-07-24"
385 | },
386 | {
387 | "Title": "PrimeGuard: Safe and Helpful LLMs through Tuning-Free Routing",
388 | "Link": "https://huggingface.co/papers/2407.16318",
389 | "Upvotes": "4",
390 | "Date": "2024-07-24"
391 | },
392 | {
393 | "Title": "Cross Anything: General Quadruped Robot Navigation through Complex Terrains",
394 | "Link": "https://huggingface.co/papers/2407.16412",
395 | "Upvotes": "3",
396 | "Date": "2024-07-24"
397 | },
398 | {
399 | "Title": "SlowFast-LLaVA: A Strong Training-Free Baseline for Video Large Language Models",
400 | "Link": "https://huggingface.co/papers/2407.15841",
401 | "Upvotes": "32",
402 | "Date": "2024-07-23"
403 | },
404 | {
405 | "Title": "NNsight and NDIF: Democratizing Access to Foundation Model Internals",
406 | "Link": "https://huggingface.co/papers/2407.14561",
407 | "Upvotes": "32",
408 | "Date": "2024-07-23"
409 | },
410 | {
411 | "Title": "Knowledge Mechanisms in Large Language Models: A Survey and Perspective",
412 | "Link": "https://huggingface.co/papers/2407.15017",
413 | "Upvotes": "31",
414 | "Date": "2024-07-23"
415 | },
416 | {
417 | "Title": "Compact Language Models via Pruning and Knowledge Distillation",
418 | "Link": "https://huggingface.co/papers/2407.14679",
419 | "Upvotes": "29",
420 | "Date": "2024-07-23"
421 | },
422 | {
423 | "Title": "POGEMA: A Benchmark Platform for Cooperative Multi-Agent Navigation",
424 | "Link": "https://huggingface.co/papers/2407.14931",
425 | "Upvotes": "18",
426 | "Date": "2024-07-23"
427 | },
428 | {
429 | "Title": "VideoGameBunny: Towards vision assistants for video games",
430 | "Link": "https://huggingface.co/papers/2407.15295",
431 | "Upvotes": "18",
432 | "Date": "2024-07-23"
433 | },
434 | {
435 | "Title": "LongVideoBench: A Benchmark for Long-context Interleaved Video-Language Understanding",
436 | "Link": "https://huggingface.co/papers/2407.15754",
437 | "Upvotes": "16",
438 | "Date": "2024-07-23"
439 | },
440 | {
441 | "Title": "BoostMVSNeRFs: Boosting MVS-based NeRFs to Generalizable View Synthesis in Large-scale Scenes",
442 | "Link": "https://huggingface.co/papers/2407.15848",
443 | "Upvotes": "15",
444 | "Date": "2024-07-23"
445 | },
446 | {
447 | "Title": "BOND: Aligning LLMs with Best-of-N Distillation",
448 | "Link": "https://huggingface.co/papers/2407.14622",
449 | "Upvotes": "11",
450 | "Date": "2024-07-23"
451 | },
452 | {
453 | "Title": "Consent in Crisis: The Rapid Decline of the AI Data Commons",
454 | "Link": "https://huggingface.co/papers/2407.14933",
455 | "Upvotes": "9",
456 | "Date": "2024-07-23"
457 | },
458 | {
459 | "Title": "Artist: Aesthetically Controllable Text-Driven Stylization without Training",
460 | "Link": "https://huggingface.co/papers/2407.15842",
461 | "Upvotes": "9",
462 | "Date": "2024-07-23"
463 | },
464 | {
465 | "Title": "Cinemo: Consistent and Controllable Image Animation with Motion Diffusion Models",
466 | "Link": "https://huggingface.co/papers/2407.15642",
467 | "Upvotes": "9",
468 | "Date": "2024-07-23"
469 | },
470 | {
471 | "Title": "HoloDreamer: Holistic 3D Panoramic World Generation from Text Descriptions",
472 | "Link": "https://huggingface.co/papers/2407.15187",
473 | "Upvotes": "9",
474 | "Date": "2024-07-23"
475 | },
476 | {
477 | "Title": "AssistantBench: Can Web Agents Solve Realistic and Time-Consuming Tasks?",
478 | "Link": "https://huggingface.co/papers/2407.15711",
479 | "Upvotes": "7",
480 | "Date": "2024-07-23"
481 | },
482 | {
483 | "Title": "MusiConGen: Rhythm and Chord Control for Transformer-Based Text-to-Music Generation",
484 | "Link": "https://huggingface.co/papers/2407.15060",
485 | "Upvotes": "7",
486 | "Date": "2024-07-23"
487 | },
488 | {
489 | "Title": "Conditioned Language Policy: A General Framework for Steerable Multi-Objective Finetuning",
490 | "Link": "https://huggingface.co/papers/2407.15762",
491 | "Upvotes": "7",
492 | "Date": "2024-07-23"
493 | },
494 | {
495 | "Title": "CGB-DM: Content and Graphic Balance Layout Generation with Transformer-based Diffusion Model",
496 | "Link": "https://huggingface.co/papers/2407.15233",
497 | "Upvotes": "6",
498 | "Date": "2024-07-23"
499 | },
500 | {
501 | "Title": "MIBench: Evaluating Multimodal Large Language Models over Multiple Images",
502 | "Link": "https://huggingface.co/papers/2407.15272",
503 | "Upvotes": "6",
504 | "Date": "2024-07-23"
505 | },
506 | {
507 | "Title": "Discrete Flow Matching",
508 | "Link": "https://huggingface.co/papers/2407.15595",
509 | "Upvotes": "5",
510 | "Date": "2024-07-23"
511 | },
512 | {
513 | "Title": "Local All-Pair Correspondence for Point Tracking",
514 | "Link": "https://huggingface.co/papers/2407.15420",
515 | "Upvotes": "5",
516 | "Date": "2024-07-23"
517 | },
518 | {
519 | "Title": "ThermalNeRF: Thermal Radiance Fields",
520 | "Link": "https://huggingface.co/papers/2407.15337",
521 | "Upvotes": "5",
522 | "Date": "2024-07-23"
523 | },
524 | {
525 | "Title": "Temporal Residual Jacobians For Rig-free Motion Transfer",
526 | "Link": "https://huggingface.co/papers/2407.14958",
527 | "Upvotes": "5",
528 | "Date": "2024-07-23"
529 | },
530 | {
531 | "Title": "GET-Zero: Graph Embodiment Transformer for Zero-shot Embodiment Generalization",
532 | "Link": "https://huggingface.co/papers/2407.15002",
533 | "Upvotes": "4",
534 | "Date": "2024-07-23"
535 | },
536 | {
537 | "Title": "Visual Haystacks: Answering Harder Questions About Sets of Images",
538 | "Link": "https://huggingface.co/papers/2407.13766",
539 | "Upvotes": "2",
540 | "Date": "2024-07-23"
541 | }
542 | ]
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4==4.12.3
2 | FlagEmbedding  # used by retrieval.py (FlagModel)
3 | Jinja2==3.1.3
4 | openai==1.38.0
5 | pdfminer.six==20231228
6 | Pillow==10.4.0
7 | PyMuPDF  # provides the `fitz` module imported in show_pdf.py
8 | Requests==2.32.3
9 |
--------------------------------------------------------------------------------
/retrieval.py:
--------------------------------------------------------------------------------
1 | import json
2 | import pickle
3 | from FlagEmbedding import FlagModel
4 |
5 | class TextVectorRetriever:
6 | def __init__(self, model_path):
7 | self.model = FlagModel(model_path, query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:", use_fp16=True)
8 | self.abstract = []
9 | self.title = []
10 | self.link = []
11 | self.embeddings = []
12 |
13 | def save_embeddings(self, filename='embeddings.pkl'):
14 | package = {"title": self.title,"link": self.link, "abstract": self.abstract, "embeddings": self.embeddings}
15 | with open(filename, 'wb') as f:
16 | pickle.dump(package, f)
17 |
18 | def load_embeddings(self, filename='embeddings.pkl'):
19 | with open(filename, 'rb') as f:
20 | package = pickle.load(f)
21 |         self.title, self.link = package['title'], package['link']
22 |         self.abstract, self.embeddings = package['abstract'], package['embeddings']
23 |
24 | def extract_sentences_from_json(self, json_data):
25 | for item in json_data:
26 | abstract = item.get('Abstract', '')
27 | title = item.get('Title', '')
28 | link = item.get('ArXiv Link', '')
29 |
30 |             if abstract:
31 |                 # Encode the abstract as the retrieval document (title and link are stored alongside it)
32 |                 embedding = self.model.encode([abstract])
33 | self.abstract.append(abstract)
34 | self.title.append(title)
35 | self.link.append(link)
36 | self.embeddings.append(embedding)
37 |
38 | def calculate_similarity(self, query):
39 | result = []
40 |
41 | query_embedding = self.model.encode_queries([query])
42 |
43 | for i, sentence_embedding in enumerate(self.embeddings):
44 |             similarity_score = query_embedding[0] @ sentence_embedding.T  # inner product of the embeddings as the similarity score
45 | result.append((similarity_score, self.title[i], self.link[i]))
46 |
47 | return result
48 |
49 | @staticmethod
50 | def top_n_sentences(results, n):
51 | sorted_results = sorted(results, key=lambda x: x[0], reverse=True)
52 | return sorted_results[:n]
53 |
54 | def main():
55 | retriever = TextVectorRetriever('BAAI/bge-small-zh-v1.5')
56 |
57 |     # Load the enriched paper data (titles, abstracts, links) from total_papers.json
58 | with open('total_papers.json', 'r', encoding='utf-8') as f:
59 | json_data = json.load(f)
60 |
61 | retriever.extract_sentences_from_json(json_data)
62 | retriever.save_embeddings()
63 |
64 | retriever.load_embeddings()
65 | query = """The emergence of advanced neural networks has opened up new ways in automated code generation from conceptual models, promising to enhance software development processes. This paper presents a preliminary evaluation of GPT-4-Vision, a state-of-the-art deep learning model, and its capabilities in transforming Unified Modeling Language (UML) class diagrams into fully operating Java class files. In our study, we used exported images of 18 class diagrams comprising 10 single-class and 8 multi-class diagrams. We used 3 different prompts for each input, and we manually evaluated the results. We created a scoring system in which we scored the occurrence of elements found in the diagram within the source code. On average, the model was able to generate source code for 88% of the elements shown in the diagrams. Our results indicate that GPT-4-Vision exhibits proficiency in handling single-class UML diagrams, successfully transforming them into syntactically correct class files. However, for multi-class UML diagrams, the model's performance is weaker compared to single-class diagrams. In summary, further investigations are necessary to exploit the model's potential completely."""
66 | results = retriever.calculate_similarity(query)
67 | top_sentences = TextVectorRetriever.top_n_sentences(results, 3)
68 |
69 | for score, title, link in top_sentences:
70 | print(f"分数: {score} 标题: {title} 链接:{link}")
71 |
72 | if __name__ == "__main__":
73 | main()
74 |
--------------------------------------------------------------------------------
/save_data.py:
--------------------------------------------------------------------------------
1 | import json
2 | from datetime import date
3 |
4 | # Load the existing papers data
5 | with open('total_papers.json', 'r') as file:
6 | papers = json.load(file)
7 |
8 | with open('extracted_data.json', 'r') as file:
9 | new_papers = json.load(file)
10 |
11 | # Check whether each new paper already exists in the stored papers
12 | for new_paper in new_papers:
13 | title_exists = False
14 | for paper in papers:
15 | if paper["Title"] == new_paper["Title"]:
16 | title_exists = True
17 | break
18 | if not title_exists:
19 | new_paper["Date"] = str(date.today())
20 | papers.append(new_paper)
21 |
22 | # Save the updated papers back to the file
23 | with open('total_papers.json', 'w') as file:
24 | json.dump(papers, file, indent=4)
25 |
--------------------------------------------------------------------------------
/show_pdf.py:
--------------------------------------------------------------------------------
1 | import concurrent.futures
2 | import json
3 | import requests
4 | from pdfminer.high_level import extract_text
5 | import os
6 | from PIL import Image
7 | import fitz
8 | from openai import OpenAI
9 | import time
10 | from config import API_KEY, BASE_URL
11 |
12 | client = OpenAI(
13 | api_key=API_KEY,
14 | base_url=BASE_URL,
15 | )
16 |
17 | def askLLM(message, retries=10, delay=8):
18 |     """
19 |     Send a message list to the LLM, retrying after a delay if the call fails.
20 |
21 |     :param message: list of chat messages to send to the LLM
22 |     :param retries: number of attempts, 10 by default
23 |     :param delay: delay between retries in seconds, 8 by default
24 |     :return: the LLM response content, or None if all retries fail
25 |     """
26 | for attempt in range(retries):
27 | try:
28 | response = client.chat.completions.create(
29 | model="moonshot-v1-32k",
30 | temperature=0.7,
31 | max_tokens=2000,
32 | messages=message,
33 | )
34 |             # Check that the response contains the expected content
35 | if response.choices and response.choices[0].message.content:
36 | return response.choices[0].message.content
37 | else:
38 | raise ValueError("Response from LLM is missing content.")
39 | except Exception as e:
40 | print(f"Attempt {attempt + 1} failed with error: {e}")
41 | if attempt < retries - 1:
42 | print(f"Waiting {delay} seconds before retrying...")
43 | time.sleep(delay)
44 | else:
45 | print("Max retries reached. No response received from LLM.")
46 | return None
47 |
48 |
49 | # Read the article data from a local JSON file
50 | def load_articles_from_json(json_file):
51 | with open(json_file, "r", encoding="utf-8") as f:
52 | articles = json.load(f)
53 | return articles
54 |
55 |
56 | # Check how colorful an image is (used to decide whether a page screenshot is worth keeping)
57 | def is_colorful(image, threshold=5000):
58 |     # Convert the image to RGB
59 | rgb_image = image.convert("RGB")
60 |
61 |     # Get all pixels of the image
62 | pixels = list(rgb_image.getdata())
63 |
64 |     # Count the number of distinct colors
65 | colors = set(pixels)
66 |
67 |     # The image counts as colorful if the distinct-color count exceeds the threshold
68 | return len(colors) > threshold
69 |
70 | # Create the folder for storing page screenshots
71 | output_folder = "top_half_images"
72 | if not os.path.exists(output_folder):
73 | os.makedirs(output_folder)
74 |
75 |
76 | # Create the folder for storing downloaded PDF files
77 | pdf_folder = "pdf_files"
78 | if not os.path.exists(pdf_folder):
79 | os.makedirs(pdf_folder)
80 |
81 | # Process a single article
82 | def process_article(article, index):
83 |     # Download the PDF file
84 | response = requests.get(article["PDF Link"])
85 | pdf_filename = f"output_{index}.pdf"
86 | pdf_path = os.path.join(pdf_folder, pdf_filename)
87 | with open(pdf_path, "wb") as f:
88 | f.write(response.content)
89 |
90 |     # Extract the text from the PDF
91 | text = extract_text(pdf_path)
92 |
93 |     text = text[:25000]  # keep only the first 25,000 characters to bound the prompt size
94 |
95 |     # Ask the LLM for a plain-language summary of the article
96 | summary_message_1 = [
97 | {"role": "system", "content": "通俗幽默地用一段连续的200字文字以内介绍这个文章讲了什么,尽量用通俗的语言代替专业词汇,但不要丧失准确性,使用中文。"
98 | "先用一句话在文案的开头给出这个文章的技术可以用来干什么,要有趣的使用。"},
99 | {"role": "user", "content": f"通俗幽默地用一段连续的200字文字以内介绍这个文章讲了什么,但不要丧失准确性,使用中文。一半内容通俗介绍这个文章技术可以用来干什么,另一半可以介绍一些技术细节。文章内容:{text}。"},
100 | ]
101 | summary1 = askLLM(summary_message_1)
102 |
103 |     # Ask the LLM for a detailed breakdown of the paper
104 | summary_message_2 = [
105 | {"role": "system", "content": """你是一名AI领域专家,根据发送给你的论文,直接输出你的论文解读笔记。不要打招呼,直接输出,你需要完成以下任务:
106 | 回答关键问题:
107 | 1. 主要解决了什么问题?
108 | 2. 提出了什么解决方案?
109 | 3. 解决方案中核心的方法/步骤/策略是什么?
110 | 4. 结论是什么?
111 | 5. 有什么限制条件?
112 | 请有条理地分点组织以上信息,确保涵盖每一个点。"""},
113 | {"role": "user", "content": f"""文章内容:{text}。保证准确性和专业性,但尽可能以容易听懂的方式进行输出。输出不需要特殊的markdown,直接分点输出即可。
114 | Output format:
115 | 1. 主要解决了什么问题?
116 | 2. 提出了什么解决方案?
117 | 3. 解决方案中核心的方法/步骤/策略是什么?
118 | 4. 结论是什么?
119 | 5. 有什么限制条件?"""},
120 | ]
121 | summary2 = askLLM(summary_message_2)
122 |
123 | summary = summary1 + "\n\n\nMore Details:\n\n" + summary2
124 |
125 |     # Ask the LLM to classify the article into one of five categories
126 | tag_message = [
127 | {"role": "system", "content": """根据用户发给你的文章摘要,进行文章分类,其中有五个分类:
128 | 机器学习(Machine Learning, ML):包括各种算法和模型,是AI的基础,涵盖了监督学习、无监督学习、强化学习等。
129 |
130 | 深度学习(Deep Learning, DL):作为ML的一个子集,专注于使用神经网络处理复杂的数据模式,特别是在图像、语音和序列数据上的应用。
131 |
132 | 自然语言处理(Natural Language Processing, NLP):专注于语言的理解和生成,是一个高度专业化的领域,通常需要特定的技术和模型。
133 |
134 | 计算机视觉(Computer Vision, CV):专注于图像和视频的分析和理解,使用DL技术在图像识别、物体检测等方面取得了显著进展。
135 |
136 | 智能系统和应用(Intelligent Systems and Applications, ISA):包括将AI技术应用于特定行业的解决方案,如医疗、金融、交通等,这个分类强调AI技术的实际应用和跨学科整合。
137 |
138 | 仅输出一个最适合的分类的缩写。
139 |
140 | 如:CV
141 |
142 | 直接输出分类的类型,不要输出原因或其它无关的内容。
143 |
144 | 输出的格式仅有五类,即:ML,DL,NLP,CV,ISA
145 | """},
146 | {"role": "user", "content": f"论文摘要内容:\n{article['Abstract']}\n\n"},
147 | ]
148 | tag = askLLM(tag_message)
149 |
150 |     # Ask the LLM for a catchy title
151 | title_message = [
152 | {"role": "system", "content": "用一个幽默且贴近实际使用的语言,为文章取一个有意思的标题.直接输出标题内容,不要输出任何其它无关内容。"},
153 | {"role": "user", "content": f"用一个幽默且贴近实际使用的语言,为文章取一个有意思的标题。文章摘要内容:\n{summary}\n\n文章专业摘要内容:\n{article['Abstract']}\n\n直接用中文输出标题内容。"},
154 | ]
155 | title = askLLM(title_message)
156 |
157 |     # Render the first page of the PDF and save it as an image
158 | pdf_document = fitz.open(pdf_path)
159 |     page = pdf_document.load_page(0)  # load the first page
160 | pix = page.get_pixmap()
161 |
162 |     # Convert to a PIL Image object
163 | image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
164 |
165 |     # Check whether the page image is colorful
166 | if is_colorful(image):
167 |         # If the image is colorful, save a screenshot of its top half
168 | half_height = image.height // 2
169 | top_half = image.crop((0, 0, image.width, half_height))
170 |
171 |         # Give each image a unique filename
172 | image_filename = f"top_half_image_{index}.png"
173 | image_path = os.path.join(output_folder, image_filename)
174 |
175 |         # Save the top-half screenshot
176 | top_half.save(image_path)
177 |
178 |         # Record the image path in the article dict
179 | article["top_half_image_path"] = image_path
180 |
181 | return {"title": title, "summary": summary, "original_title": article["Title"], "tag": tag}
182 |
183 | # Process each article and write the results to a JSON file
184 | def process_article_concurrent(article, index):
185 | print(f"reading {index+1}/{len(articles)} papers")
186 | result = process_article(article, index)
187 | return {
188 | "original_title": result['original_title'],
189 | "title": result['title'],
190 | "summary": result['summary'],
191 | "arxiv_link": article['ArXiv Link'],
192 | "top_half_image_path": article.get("top_half_image_path", None),
193 | "tag": result['tag']
194 | }
195 |
196 | def process_articles_to_json_concurrent(articles):
197 | output_data = []
198 | with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
199 | futures = {executor.submit(process_article_concurrent, article, i): i for i, article in enumerate(articles)}
200 | for future in concurrent.futures.as_completed(futures):
201 | try:
202 | output_data.append(future.result())
203 | except Exception as exc:
204 | print(f"Article {futures[future]} generated an exception: {exc}")
205 |
206 | with open("articles_summary.json", "w", encoding="utf-8") as f:
207 | json.dump(output_data, f, ensure_ascii=False, indent=4)
208 |
209 | # Run the pipeline and store the results in a JSON file
210 | articles = load_articles_from_json("unique_data.json")
211 | process_articles_to_json_concurrent(articles)
212 |
213 |
--------------------------------------------------------------------------------
/spider.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import json
4 | from datetime import datetime, timedelta
5 |
6 | # Get today's date
7 | today = datetime.now().date()
8 |
9 | # Create a list to hold all of the paper data
10 | all_papers_data = []
11 |
12 | # Loop over the last seven days, from today backwards
13 | for i in range(7):
14 |     # Compute the date
15 | date = (today - timedelta(days=i)).strftime('%Y-%m-%d')
16 |
17 |     # Build the target URL
18 | url = f'https://huggingface.co/papers?date={date}'
19 |
20 |     # Send an HTTP GET request
21 | response = requests.get(url)
22 |
23 |     # Check whether the request succeeded
24 | if response.status_code == 200:
25 |         # Parse the HTML with BeautifulSoup
26 | soup = BeautifulSoup(response.text, 'html.parser')
27 |
28 |         # Extract the information for each paper
29 | papers = soup.find_all('article', class_='relative flex flex-col overflow-hidden rounded-xl border')
30 |
31 | for paper in papers:
32 |             # Extract the link
33 | link = paper.find('a', class_='shadow-alternate-sm')['href']
34 |
35 |             # Extract the title
36 | title = paper.find('h3').find('a').get_text(strip=True)
37 |
38 |             # Extract the upvote count (the text of the last matching div is kept)
39 | leading_none_divs = paper.select('div.leading-none')
40 | for div in leading_none_divs:
41 | upvotes = div.get_text(strip=True)
42 | print(upvotes)
43 |
44 |             # Append the result to the list
45 | all_papers_data.append({
46 | 'Title': title,
47 | 'Link': f"https://huggingface.co{link}",
48 | 'Upvotes': upvotes,
49 | 'Date': date
50 | })
51 | else:
52 | print(f'Failed to retrieve the webpage for {date}')
53 |
54 | # Convert the list to a JSON-formatted string
55 | json_data = json.dumps(all_papers_data, indent=4)
56 |
57 | # Write the JSON file
58 | with open('papers.json', 'w') as json_file:
59 | json_file.write(json_data)
60 |
61 | print('Data for the past 7 days written to papers.json successfully.')
--------------------------------------------------------------------------------
/template_image/card.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/XiangJinyu/AI-Paper/0fc03ecb3f0c01d3ca21bff43f95bb71fb9cf262/template_image/card.jpg
--------------------------------------------------------------------------------
/template_image/logo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/XiangJinyu/AI-Paper/0fc03ecb3f0c01d3ca21bff43f95bb71fb9cf262/template_image/logo.jpg
--------------------------------------------------------------------------------
/unique_data.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "Title": "AppWorld: A Controllable World of Apps and People for Benchmarking\n Interactive Coding Agents",
4 | "Abstract": "Autonomous agents that address day-to-day digital tasks (e.g., ordering\ngroceries for a household), must not only operate multiple apps (e.g., notes,\nmessaging, shopping app) via APIs, but also generate rich code with complex\ncontrol flow in an iterative manner based on their interaction with the\nenvironment. However, existing benchmarks for tool use are inadequate, as they\nonly cover tasks that require a simple sequence of API calls.\n To remedy this gap, we built AppWorld Engine, a high-quality\nexecution environment (60K lines of code) of 9 day-to-day apps operable via 457\nAPIs and populated with realistic digital activities simulating the lives of\n~100 fictitious users. We then created AppWorld Benchmark (40K lines\nof code), a suite of 750 natural, diverse, and challenging autonomous agent\ntasks requiring rich and interactive code generation. It supports robust\nprogrammatic evaluation with state-based unit tests, allowing for different\nways of completing a task while also checking for unexpected changes, i.e.,\ncollateral damage. The state-of-the-art LLM, GPT-4o, solves only ~49% of our\n'normal' tasks and ~30% of 'challenge' tasks, while other models solve at least\n16% fewer. This highlights the benchmark's difficulty and AppWorld's potential\nto push the frontiers of interactive coding agents. The project website is\navailable at https://appworld.dev/.",
5 | "ArXiv Link": "https://arxiv.org/abs/2407.18901",
6 | "PDF Link": "https://arxiv.org/pdf/2407.18901",
7 | "Upvotes": "3"
8 | },
9 | {
10 | "Title": "Floating No More: Object-Ground Reconstruction from a Single Image",
11 | "Abstract": "Recent advancements in 3D object reconstruction from single images have\nprimarily focused on improving the accuracy of object shapes. Yet, these\ntechniques often fail to accurately capture the inter-relation between the\nobject, ground, and camera. As a result, the reconstructed objects often appear\nfloating or tilted when placed on flat surfaces. This limitation significantly\naffects 3D-aware image editing applications like shadow rendering and object\npose manipulation. To address this issue, we introduce ORG (Object\nReconstruction with Ground), a novel task aimed at reconstructing 3D object\ngeometry in conjunction with the ground surface. Our method uses two compact\npixel-level representations to depict the relationship between camera, object,\nand ground. Experiments show that the proposed ORG model can effectively\nreconstruct object-ground geometry on unseen data, significantly enhancing the\nquality of shadow generation and pose manipulation compared to conventional\nsingle-image 3D reconstruction techniques.",
12 | "ArXiv Link": "https://arxiv.org/abs/2407.18914",
13 | "PDF Link": "https://arxiv.org/pdf/2407.18914",
14 | "Upvotes": "2"
15 | },
16 | {
17 | "Title": "Wolf: Captioning Everything with a World Summarization Framework",
18 | "Abstract": "We propose Wolf, a WOrLd summarization Framework for accurate video\ncaptioning. Wolf is an automated captioning framework that adopts a\nmixture-of-experts approach, leveraging complementary strengths of Vision\nLanguage Models (VLMs). By utilizing both image and video models, our framework\ncaptures different levels of information and summarizes them efficiently. Our\napproach can be applied to enhance video understanding, auto-labeling, and\ncaptioning. To evaluate caption quality, we introduce CapScore, an LLM-based\nmetric to assess the similarity and quality of generated captions compared to\nthe ground truth captions. We further build four human-annotated datasets in\nthree domains: autonomous driving, general scenes, and robotics, to facilitate\ncomprehensive comparisons. We show that Wolf achieves superior captioning\nperformance compared to state-of-the-art approaches from the research community\n(VILA1.5, CogAgent) and commercial solutions (Gemini-Pro-1.5, GPT-4V). For\ninstance, in comparison with GPT-4V, Wolf improves CapScore both quality-wise\nby 55.6% and similarity-wise by 77.4% on challenging driving videos. Finally,\nwe establish a benchmark for video captioning and introduce a leaderboard,\naiming to accelerate advancements in video understanding, captioning, and data\nalignment. Leaderboard: https://wolfv0.github.io/leaderboard.html.",
19 | "ArXiv Link": "https://arxiv.org/abs/2407.18908",
20 | "PDF Link": "https://arxiv.org/pdf/2407.18908",
21 | "Upvotes": "1"
22 | },
23 | {
24 | "Title": "SHIC: Shape-Image Correspondences with no Keypoint Supervision",
25 | "Abstract": "Canonical surface mapping generalizes keypoint detection by assigning each\npixel of an object to a corresponding point in a 3D template. Popularised by\nDensePose for the analysis of humans, authors have since attempted to apply the\nconcept to more categories, but with limited success due to the high cost of\nmanual supervision. In this work, we introduce SHIC, a method to learn\ncanonical maps without manual supervision which achieves better results than\nsupervised methods for most categories. Our idea is to leverage foundation\ncomputer vision models such as DINO and Stable Diffusion that are open-ended\nand thus possess excellent priors over natural categories. SHIC reduces the\nproblem of estimating image-to-template correspondences to predicting\nimage-to-image correspondences using features from the foundation models. The\nreduction works by matching images of the object to non-photorealistic renders\nof the template, which emulates the process of collecting manual annotations\nfor this task. These correspondences are then used to supervise high-quality\ncanonical maps for any object of interest. We also show that image generators\ncan further improve the realism of the template views, which provide an\nadditional source of supervision for the model.",
26 | "ArXiv Link": "https://arxiv.org/abs/2407.18907",
27 | "PDF Link": "https://arxiv.org/pdf/2407.18907",
28 | "Upvotes": "1"
29 | },
30 | {
31 | "Title": "Lessons from Learning to Spin \"Pens\"",
32 | "Abstract": "In-hand manipulation of pen-like objects is an important skill in our daily\nlives, as many tools such as hammers and screwdrivers are similarly shaped.\nHowever, current learning-based methods struggle with this task due to a lack\nof high-quality demonstrations and the significant gap between simulation and\nthe real world. In this work, we push the boundaries of learning-based in-hand\nmanipulation systems by demonstrating the capability to spin pen-like objects.\nWe first use reinforcement learning to train an oracle policy with privileged\ninformation and generate a high-fidelity trajectory dataset in simulation. This\nserves two purposes: 1) pre-training a sensorimotor policy in simulation; 2)\nconducting open-loop trajectory replay in the real world. We then fine-tune the\nsensorimotor policy using these real-world trajectories to adapt it to the real\nworld dynamics. With less than 50 trajectories, our policy learns to rotate\nmore than ten pen-like objects with different physical properties for multiple\nrevolutions. We present a comprehensive analysis of our design choices and\nshare the lessons learned during development.",
33 | "ArXiv Link": "https://arxiv.org/abs/2407.18902",
34 | "PDF Link": "https://arxiv.org/pdf/2407.18902",
35 | "Upvotes": "-"
36 | },
37 | {
38 | "Title": "VSSD: Vision Mamba with Non-Casual State Space Duality",
39 | "Abstract": "Vision transformers have significantly advanced the field of computer vision,\noffering robust modeling capabilities and global receptive field. However,\ntheir high computational demands limit their applicability in processing long\nsequences. To tackle this issue, State Space Models (SSMs) have gained\nprominence in vision tasks as they offer linear computational complexity.\nRecently, State Space Duality (SSD), an improved variant of SSMs, was\nintroduced in Mamba2 to enhance model performance and efficiency. However, the\ninherent causal nature of SSD/SSMs restricts their applications in non-causal\nvision tasks. To address this limitation, we introduce Visual State Space\nDuality (VSSD) model, which has a non-causal format of SSD. Specifically, we\npropose to discard the magnitude of interactions between the hidden state and\ntokens while preserving their relative weights, which relieves the dependencies\nof token contribution on previous tokens. Together with the involvement of\nmulti-scan strategies, we show that the scanning results can be integrated to\nachieve non-causality, which not only improves the performance of SSD in vision\ntasks but also enhances its efficiency. We conduct extensive experiments on\nvarious benchmarks including image classification, detection, and segmentation,\nwhere VSSD surpasses existing state-of-the-art SSM-based models. Code and\nweights are available at https://github.com/YuHengsss/VSSD.",
40 | "ArXiv Link": "https://arxiv.org/abs/2407.18559",
41 | "PDF Link": "https://arxiv.org/pdf/2407.18559",
42 | "Upvotes": "-"
43 | },
44 | {
45 | "Title": "Diffree: Text-Guided Shape Free Object Inpainting with Diffusion Model",
46 | "Abstract": "This paper addresses an important problem of object addition for images with\nonly text guidance. It is challenging because the new object must be integrated\nseamlessly into the image with consistent visual context, such as lighting,\ntexture, and spatial location. While existing text-guided image inpainting\nmethods can add objects, they either fail to preserve the background\nconsistency or involve cumbersome human intervention in specifying bounding\nboxes or user-scribbled masks. To tackle this challenge, we introduce Diffree,\na Text-to-Image (T2I) model that facilitates text-guided object addition with\nonly text control. To this end, we curate OABench, an exquisite synthetic\ndataset by removing objects with advanced image inpainting techniques. OABench\ncomprises 74K real-world tuples of an original image, an inpainted image with\nthe object removed, an object mask, and object descriptions. Trained on OABench\nusing the Stable Diffusion model with an additional mask prediction module,\nDiffree uniquely predicts the position of the new object and achieves object\naddition with guidance from only text. Extensive experiments demonstrate that\nDiffree excels in adding new objects with a high success rate while maintaining\nbackground consistency, spatial appropriateness, and object relevance and\nquality.",
47 | "ArXiv Link": "https://arxiv.org/abs/2407.16982",
48 | "PDF Link": "https://arxiv.org/pdf/2407.16982",
49 | "Upvotes": "33"
50 | },
51 | {
52 | "Title": "LAMBDA: A Large Model Based Data Agent",
53 | "Abstract": "We introduce ``LAMBDA,\" a novel open-source, code-free multi-agent data\nanalysis system that that harnesses the power of large models. LAMBDA is\ndesigned to address data analysis challenges in complex data-driven\napplications through the use of innovatively designed data agents that operate\niteratively and generatively using natural language. At the core of LAMBDA are\ntwo key agent roles: the programmer and the inspector, which are engineered to\nwork together seamlessly. Specifically, the programmer generates code based on\nthe user's instructions and domain-specific knowledge, enhanced by advanced\nmodels. Meanwhile, the inspector debugs the code when necessary. To ensure\nrobustness and handle adverse scenarios, LAMBDA features a user interface that\nallows direct user intervention in the operational loop. Additionally, LAMBDA\ncan flexibly integrate external models and algorithms through our knowledge\nintegration mechanism, catering to the needs of customized data analysis.\nLAMBDA has demonstrated strong performance on various machine learning\ndatasets. It has the potential to enhance data science practice and analysis\nparadigm by seamlessly integrating human and artificial intelligence, making it\nmore accessible, effective, and efficient for individuals from diverse\nbackgrounds. The strong performance of LAMBDA in solving data science problems\nis demonstrated in several case studies, which are presented at\nhttps://www.polyu.edu.hk/ama/cmfai/lambda.html.",
54 | "ArXiv Link": "https://arxiv.org/abs/2407.17535",
55 | "PDF Link": "https://arxiv.org/pdf/2407.17535",
56 | "Upvotes": "26"
57 | },
58 | {
59 | "Title": "AMEX: Android Multi-annotation Expo Dataset for Mobile GUI Agents",
60 | "Abstract": "AI agents have drawn increasing attention mostly on their ability to perceive\nenvironments, understand tasks, and autonomously achieve goals. To advance\nresearch on AI agents in mobile scenarios, we introduce the Android\nMulti-annotation EXpo (AMEX), a comprehensive, large-scale dataset designed for\ngeneralist mobile GUI-control agents. Their capabilities of completing complex\ntasks by directly interacting with the graphical user interface (GUI) on mobile\ndevices are trained and evaluated with the proposed dataset. AMEX comprises\nover 104K high-resolution screenshots from 110 popular mobile applications,\nwhich are annotated at multiple levels. Unlike existing mobile device-control\ndatasets, e.g., MoTIF, AitW, etc., AMEX includes three levels of annotations:\nGUI interactive element grounding, GUI screen and element functionality\ndescriptions, and complex natural language instructions, each averaging 13\nsteps with stepwise GUI-action chains. We develop this dataset from a more\ninstructive and detailed perspective, complementing the general settings of\nexisting datasets. Additionally, we develop a baseline model SPHINX Agent and\ncompare its performance across state-of-the-art agents trained on other\ndatasets. To facilitate further research, we open-source our dataset, models,\nand relevant evaluation tools. The project is available at\nhttps://yuxiangchai.github.io/AMEX/",
61 | "ArXiv Link": "https://arxiv.org/abs/2407.17490",
62 | "PDF Link": "https://arxiv.org/pdf/2407.17490",
63 | "Upvotes": "24"
64 | },
65 | {
66 | "Title": "Very Large-Scale Multi-Agent Simulation in AgentScope",
67 | "Abstract": "Recent advances in large language models (LLMs) have opened new avenues for\napplying multi-agent systems in very large-scale simulations. However, there\nremain several challenges when conducting multi-agent simulations with existing\nplatforms, such as limited scalability and low efficiency, unsatisfied agent\ndiversity, and effort-intensive management processes. To address these\nchallenges, we develop several new features and components for AgentScope, a\nuser-friendly multi-agent platform, enhancing its convenience and flexibility\nfor supporting very large-scale multi-agent simulations. Specifically, we\npropose an actor-based distributed mechanism as the underlying technological\ninfrastructure towards great scalability and high efficiency, and provide\nflexible environment support for simulating various real-world scenarios, which\nenables parallel execution of multiple agents, centralized workflow\norchestration, and both inter-agent and agent-environment interactions among\nagents. Moreover, we integrate an easy-to-use configurable tool and an\nautomatic background generation pipeline in AgentScope, simplifying the process\nof creating agents with diverse yet detailed background settings. Last but not\nleast, we provide a web-based interface for conveniently monitoring and\nmanaging a large number of agents that might deploy across multiple devices. We\nconduct a comprehensive simulation to demonstrate the effectiveness of the\nproposed enhancements in AgentScope, and provide detailed observations and\ndiscussions to highlight the great potential of applying multi-agent systems in\nlarge-scale simulations. The source code is released on GitHub at\nhttps://github.com/modelscope/agentscope to inspire further research and\ndevelopment in large-scale multi-agent simulations.",
68 | "ArXiv Link": "https://arxiv.org/abs/2407.17789",
69 | "PDF Link": "https://arxiv.org/pdf/2407.17789",
70 | "Upvotes": "18"
71 | },
72 | {
73 | "Title": "BetterDepth: Plug-and-Play Diffusion Refiner for Zero-Shot Monocular\n Depth Estimation",
74 | "Abstract": "By training over large-scale datasets, zero-shot monocular depth estimation\n(MDE) methods show robust performance in the wild but often suffer from\ninsufficiently precise details. Although recent diffusion-based MDE approaches\nexhibit appealing detail extraction ability, they still struggle in\ngeometrically challenging scenes due to the difficulty of gaining robust\ngeometric priors from diverse datasets. To leverage the complementary merits of\nboth worlds, we propose BetterDepth to efficiently achieve geometrically\ncorrect affine-invariant MDE performance while capturing fine-grained details.\nSpecifically, BetterDepth is a conditional diffusion-based refiner that takes\nthe prediction from pre-trained MDE models as depth conditioning, in which the\nglobal depth context is well-captured, and iteratively refines details based on\nthe input image. For the training of such a refiner, we propose global\npre-alignment and local patch masking methods to ensure the faithfulness of\nBetterDepth to depth conditioning while learning to capture fine-grained scene\ndetails. By efficient training on small-scale synthetic datasets, BetterDepth\nachieves state-of-the-art zero-shot MDE performance on diverse public datasets\nand in-the-wild scenes. Moreover, BetterDepth can improve the performance of\nother MDE models in a plug-and-play manner without additional re-training.",
75 | "ArXiv Link": "https://arxiv.org/abs/2407.17952",
76 | "PDF Link": "https://arxiv.org/pdf/2407.17952",
77 | "Upvotes": "17"
78 | },
79 | {
80 | "Title": "Course-Correction: Safety Alignment Using Synthetic Preferences",
81 | "Abstract": "The risk of harmful content generated by large language models (LLMs) becomes\na critical concern. This paper presents a systematic study on assessing and\nimproving LLMs' capability to perform the task of course-correction,\n\\ie, the model can steer away from generating harmful content autonomously. To\nstart with, we introduce the C^2-Eval benchmark for quantitative\nassessment and analyze 10 popular LLMs, revealing varying proficiency of\ncurrent safety-tuned LLMs in course-correction. To improve, we propose\nfine-tuning LLMs with preference learning, emphasizing the preference for\ntimely course-correction. Using an automated pipeline, we create\nC^2-Syn, a synthetic dataset with 750K pairwise preferences, to\nteach models the concept of timely course-correction through data-driven\npreference learning. Experiments on 2 LLMs, Llama2-Chat 7B and\nQwen2 7B, show that our method effectively enhances course-correction\nskills without affecting general performance. Additionally, it effectively\nimproves LLMs' safety, particularly in resisting jailbreak attacks.",
82 | "ArXiv Link": "https://arxiv.org/abs/2407.16637",
83 | "PDF Link": "https://arxiv.org/pdf/2407.16637",
84 | "Upvotes": "16"
85 | },
86 | {
87 | "Title": "Data Mixture Inference: What do BPE Tokenizers Reveal about their\n Training Data?",
88 | "Abstract": "The pretraining data of today's strongest language models is opaque. In\nparticular, little is known about the proportions of various domains or\nlanguages represented. In this work, we tackle a task which we call data\nmixture inference, which aims to uncover the distributional make-up of training\ndata. We introduce a novel attack based on a previously overlooked source of\ninformation -- byte-pair encoding (BPE) tokenizers, used by the vast majority\nof modern language models. Our key insight is that the ordered list of merge\nrules learned by a BPE tokenizer naturally reveals information about the token\nfrequencies in its training data: the first merge is the most common byte pair,\nthe second is the most common pair after merging the first token, and so on.\nGiven a tokenizer's merge list along with data samples for each category of\ninterest, we formulate a linear program that solves for the proportion of each\ncategory in the tokenizer's training set. Importantly, to the extent to which\ntokenizer training data is representative of the pretraining data, we\nindirectly learn about the pretraining data. In controlled experiments, we show\nthat our attack recovers mixture ratios with high precision for tokenizers\ntrained on known mixtures of natural languages, programming languages, and data\nsources. We then apply our approach to off-the-shelf tokenizers released with\nrecent LMs. We confirm much publicly disclosed information about these models,\nand also make several new inferences: GPT-4o's tokenizer is much more\nmultilingual than its predecessors, training on 39% non-English data; Llama3\nextends GPT-3.5's tokenizer primarily for multilingual (48%) use; GPT-3.5's and\nClaude's tokenizers are trained on predominantly code (~60%). We hope our work\nsheds light on current design practices for pretraining data, and inspires\ncontinued research into data mixture inference for LMs.",
89 | "ArXiv Link": "https://arxiv.org/abs/2407.16607",
90 | "PDF Link": "https://arxiv.org/pdf/2407.16607",
91 | "Upvotes": "15"
92 | },
93 | {
94 | "Title": "Efficient Inference of Vision Instruction-Following Models with Elastic\n Cache",
95 | "Abstract": "In the field of instruction-following large vision-language models (LVLMs),\nthe efficient deployment of these models faces challenges, notably due to the\nhigh memory demands of their key-value (KV) caches. Conventional cache\nmanagement strategies for LLMs focus on cache eviction, which often fails to\naddress the specific needs of multimodal instruction-following models.\nRecognizing this gap, in this paper, we introduce Elastic Cache, a novel\napproach that benefits from applying distinct acceleration methods for\ninstruction encoding and output generation stages. We investigate the metrics\nof importance in different stages and propose an importance-driven cache\nmerging strategy to prune redundancy caches. Instead of discarding less\nimportant caches, our strategy identifies important key/value vectors as anchor\npoints. Surrounding less important caches are then merged with these anchors,\nenhancing the preservation of contextual information in the KV caches while\nyielding an arbitrary acceleration ratio. For instruction encoding, we utilize\nthe frequency to evaluate the importance of caches. Regarding output\ngeneration, we prioritize tokens based on their distance with an offset, by\nwhich both the initial and most recent tokens are retained. Results on a range\nof LVLMs demonstrate that Elastic Cache not only boosts efficiency but also\nnotably outperforms existing pruning methods in language generation across\nvarious tasks. Code is available at https://github.com/liuzuyan/ElasticCache",
96 | "ArXiv Link": "https://arxiv.org/abs/2407.18121",
97 | "PDF Link": "https://arxiv.org/pdf/2407.18121",
98 | "Upvotes": "13"
99 | },
100 | {
101 | "Title": "LKCell: Efficient Cell Nuclei Instance Segmentation with Large\n Convolution Kernels",
102 | "Abstract": "The segmentation of cell nuclei in tissue images stained with the blood dye\nhematoxylin and eosin (H&E) is essential for various clinical applications\nand analyses. Due to the complex characteristics of cellular morphology, a\nlarge receptive field is considered crucial for generating high-quality\nsegmentation. However, previous methods face challenges in achieving a balance\nbetween the receptive field and computational burden. To address this issue, we\npropose LKCell, a high-accuracy and efficient cell segmentation method. Its\ncore insight lies in unleashing the potential of large convolution kernels to\nachieve computationally efficient large receptive fields. Specifically, (1) We\ntransfer pre-trained large convolution kernel models to the medical domain for\nthe first time, demonstrating their effectiveness in cell segmentation. (2) We\nanalyze the redundancy of previous methods and design a new segmentation\ndecoder based on large convolution kernels. It achieves higher performance\nwhile significantly reducing the number of parameters. We evaluate our method\non the most challenging benchmark and achieve state-of-the-art results (0.5080\nmPQ) in cell nuclei instance segmentation with only 21.6% FLOPs compared with\nthe previous leading method. Our source code and models are available at\nhttps://github.com/hustvl/LKCell.",
103 | "ArXiv Link": "https://arxiv.org/abs/2407.18054",
104 | "PDF Link": "https://arxiv.org/pdf/2407.18054",
105 | "Upvotes": "7"
106 | },
107 | {
108 | "Title": "Dallah: A Dialect-Aware Multimodal Large Language Model for Arabic",
109 | "Abstract": "Recent advancements have significantly enhanced the capabilities of\nMultimodal Large Language Models (MLLMs) in generating and understanding\nimage-to-text content. Despite these successes, progress is predominantly\nlimited to English due to the scarcity of high quality multimodal resources in\nother languages. This limitation impedes the development of competitive models\nin languages such as Arabic. To alleviate this situation, we introduce an\nefficient Arabic multimodal assistant, dubbed Dallah, that utilizes an advanced\nlanguage model based on LLaMA-2 to facilitate multimodal interactions. Dallah\ndemonstrates state-of-the-art performance in Arabic MLLMs. Through fine-tuning\nsix Arabic dialects, Dallah showcases its capability to handle complex\ndialectal interactions incorporating both textual and visual elements. The\nmodel excels in two benchmark tests: one evaluating its performance on Modern\nStandard Arabic (MSA) and another specifically designed to assess dialectal\nresponses. Beyond its robust performance in multimodal interaction tasks,\nDallah has the potential to pave the way for further development of\ndialect-aware Arabic MLLMs.",
110 | "ArXiv Link": "https://arxiv.org/abs/2407.18129",
111 | "PDF Link": "https://arxiv.org/pdf/2407.18129",
112 | "Upvotes": "7"
113 | },
114 | {
115 | "Title": "The FIGNEWS Shared Task on News Media Narratives",
116 | "Abstract": "We present an overview of the FIGNEWS shared task, organized as part of the\nArabicNLP 2024 conference co-located with ACL 2024. The shared task addresses\nbias and propaganda annotation in multilingual news posts. We focus on the\nearly days of the Israel War on Gaza as a case study. The task aims to foster\ncollaboration in developing annotation guidelines for subjective tasks by\ncreating frameworks for analyzing diverse narratives highlighting potential\nbias and propaganda. In a spirit of fostering and encouraging diversity, we\naddress the problem from a multilingual perspective, namely within five\nlanguages: English, French, Arabic, Hebrew, and Hindi. A total of 17 teams\nparticipated in two annotation subtasks: bias (16 teams) and propaganda (6\nteams). The teams competed in four evaluation tracks: guidelines development,\nannotation quality, annotation quantity, and consistency. Collectively, the\nteams produced 129,800 data points. Key findings and implications for the field\nare discussed.",
117 | "ArXiv Link": "https://arxiv.org/abs/2407.18147",
118 | "PDF Link": "https://arxiv.org/pdf/2407.18147",
119 | "Upvotes": "6"
120 | },
121 | {
122 | "Title": "Text-Driven Neural Collaborative Filtering Model for Paper Source\n Tracing",
123 | "Abstract": "Identifying significant references within the complex interrelations of a\ncitation knowledge graph is challenging, which encompasses connections through\ncitations, authorship, keywords, and other relational attributes. The Paper\nSource Tracing (PST) task seeks to automate the identification of pivotal\nreferences for given scholarly articles utilizing advanced data mining\ntechniques. In the KDD CUP 2024, we design a recommendation-based framework\ntailored for the PST task. This framework employs the Neural Collaborative\nFiltering (NCF) model to generate final predictions. To process the textual\nattributes of the papers and extract input features for the model, we utilize\nSciBERT, a pre-trained language model. According to the experimental results,\nour method achieved a score of 0.37814 on the Mean Average Precision (MAP)\nmetric, outperforming baseline models and ranking 11th among all participating\nteams. The source code is publicly available at\nhttps://github.com/MyLove-XAB/KDDCupFinal.",
124 | "ArXiv Link": "https://arxiv.org/abs/2407.17722",
125 | "PDF Link": "https://arxiv.org/pdf/2407.17722",
126 | "Upvotes": "3"
127 | },
128 | {
129 | "Title": "OpenDevin: An Open Platform for AI Software Developers as Generalist\n Agents",
130 | "Abstract": "Software is one of the most powerful tools that we humans have at our\ndisposal; it allows a skilled programmer to interact with the world in complex\nand profound ways. At the same time, thanks to improvements in large language\nmodels (LLMs), there has also been a rapid development in AI agents that\ninteract with and affect change in their surrounding environments. In this\npaper, we introduce OpenDevin, a platform for the development of powerful and\nflexible AI agents that interact with the world in similar ways to those of a\nhuman developer: by writing code, interacting with a command line, and browsing\nthe web. We describe how the platform allows for the implementation of new\nagents, safe interaction with sandboxed environments for code execution,\ncoordination between multiple agents, and incorporation of evaluation\nbenchmarks. Based on our currently incorporated benchmarks, we perform an\nevaluation of agents over 15 challenging tasks, including software engineering\n(e.g., SWE-Bench) and web browsing (e.g., WebArena), among others. Released\nunder the permissive MIT license, OpenDevin is a community project spanning\nacademia and industry with more than 1.3K contributions from over 160\ncontributors and will improve going forward.",
131 | "ArXiv Link": "https://arxiv.org/abs/2407.16741",
132 | "PDF Link": "https://arxiv.org/pdf/2407.16741",
133 | "Upvotes": "56"
134 | },
135 | {
136 | "Title": "VILA^2: VILA Augmented VILA",
137 | "Abstract": "Visual language models (VLMs) have rapidly progressed, driven by the success\nof large language models (LLMs). While model architectures and training\ninfrastructures advance rapidly, data curation remains under-explored. When\ndata quantity and quality become a bottleneck, existing work either directly\ncrawls more raw data from the Internet that does not have a guarantee of data\nquality or distills from black-box commercial models (e.g., GPT-4V / Gemini)\ncausing the performance upper bounded by that model. In this work, we introduce\na novel approach that includes a self-augment step and a specialist-augment\nstep to iteratively improve data quality and model performance. In the\nself-augment step, a VLM recaptions its own pretraining data to enhance data\nquality, and then retrains from scratch using this refined dataset to improve\nmodel performance. This process can iterate for several rounds. Once\nself-augmentation saturates, we employ several specialist VLMs finetuned from\nthe self-augmented VLM with domain-specific expertise, to further infuse\nspecialist knowledge into the generalist VLM through task-oriented recaptioning\nand retraining. With the combined self-augmented and specialist-augmented\ntraining, we introduce VILA^2 (VILA-augmented-VILA), a VLM family that\nconsistently improves the accuracy on a wide range of tasks over prior art, and\nachieves new state-of-the-art results on MMMU leaderboard among open-sourced\nmodels.",
138 | "ArXiv Link": "https://arxiv.org/abs/2407.17453",
139 | "PDF Link": "https://arxiv.org/pdf/2407.17453",
140 | "Upvotes": "33"
141 | },
142 | {
143 | "Title": "HumanVid: Demystifying Training Data for Camera-controllable Human Image\n Animation",
144 | "Abstract": "Human image animation involves generating videos from a character photo,\nallowing user control and unlocking potential for video and movie production.\nWhile recent approaches yield impressive results using high-quality training\ndata, the inaccessibility of these datasets hampers fair and transparent\nbenchmarking. Moreover, these approaches prioritize 2D human motion and\noverlook the significance of camera motions in videos, leading to limited\ncontrol and unstable video generation.To demystify the training data, we\npresent HumanVid, the first large-scale high-quality dataset tailored for human\nimage animation, which combines crafted real-world and synthetic data. For the\nreal-world data, we compile a vast collection of copyright-free real-world\nvideos from the internet. Through a carefully designed rule-based filtering\nstrategy, we ensure the inclusion of high-quality videos, resulting in a\ncollection of 20K human-centric videos in 1080P resolution. Human and camera\nmotion annotation is accomplished using a 2D pose estimator and a SLAM-based\nmethod. For the synthetic data, we gather 2,300 copyright-free 3D avatar assets\nto augment existing available 3D assets. Notably, we introduce a rule-based\ncamera trajectory generation method, enabling the synthetic pipeline to\nincorporate diverse and precise camera motion annotation, which can rarely be\nfound in real-world data. To verify the effectiveness of HumanVid, we establish\na baseline model named CamAnimate, short for Camera-controllable Human\nAnimation, that considers both human and camera motions as conditions. Through\nextensive experimentation, we demonstrate that such simple baseline training on\nour HumanVid achieves state-of-the-art performance in controlling both human\npose and camera motions, setting a new benchmark. Code and data will be\npublicly available at https://github.com/zhenzhiwang/HumanVid/.",
145 | "ArXiv Link": "https://arxiv.org/abs/2407.17438",
146 | "PDF Link": "https://arxiv.org/pdf/2407.17438",
147 | "Upvotes": "19"
148 | },
149 | {
150 | "Title": "DDK: Distilling Domain Knowledge for Efficient Large Language Models",
151 | "Abstract": "Despite the advanced intelligence abilities of large language models (LLMs)\nin various applications, they still face significant computational and storage\ndemands. Knowledge Distillation (KD) has emerged as an effective strategy to\nimprove the performance of a smaller LLM (i.e., the student model) by\ntransferring knowledge from a high-performing LLM (i.e., the teacher model).\nPrevailing techniques in LLM distillation typically use a black-box model API\nto generate high-quality pretrained and aligned datasets, or utilize white-box\ndistillation by altering the loss function to better transfer knowledge from\nthe teacher LLM. However, these methods ignore the knowledge differences\nbetween the student and teacher LLMs across domains. This results in excessive\nfocus on domains with minimal performance gaps and insufficient attention to\ndomains with large gaps, reducing overall performance. In this paper, we\nintroduce a new LLM distillation framework called DDK, which dynamically\nadjusts the composition of the distillation dataset in a smooth manner\naccording to the domain performance differences between the teacher and student\nmodels, making the distillation process more stable and effective. Extensive\nevaluations show that DDK significantly improves the performance of student\nmodels, outperforming both continuously pretrained baselines and existing\nknowledge distillation methods by a large margin.",
152 | "ArXiv Link": "https://arxiv.org/abs/2407.16154",
153 | "PDF Link": "https://arxiv.org/pdf/2407.16154",
154 | "Upvotes": "16"
155 | },
156 | {
157 | "Title": "PERSONA: A Reproducible Testbed for Pluralistic Alignment",
158 | "Abstract": "The rapid advancement of language models (LMs) necessitates robust alignment\nwith diverse user values. However, current preference optimization approaches\noften fail to capture the plurality of user opinions, instead reinforcing\nmajority viewpoints and marginalizing minority perspectives. We introduce\nPERSONA, a reproducible test bed designed to evaluate and improve pluralistic\nalignment of LMs. We procedurally generate diverse user profiles from US census\ndata, resulting in 1,586 synthetic personas with varied demographic and\nidiosyncratic attributes. We then generate a large-scale evaluation dataset\ncontaining 3,868 prompts and 317,200 feedback pairs obtained from our synthetic\npersonas. Leveraging this dataset, we systematically evaluate LM capabilities\nin role-playing diverse users, verified through human judges, and the\nestablishment of both a benchmark, PERSONA Bench, for pluralistic alignment\napproaches as well as an extensive dataset to create new and future benchmarks.\nThe full dataset and benchmarks are available here:\nhttps://www.synthlabs.ai/research/persona.",
159 | "ArXiv Link": "https://arxiv.org/abs/2407.17387",
160 | "PDF Link": "https://arxiv.org/pdf/2407.17387",
161 | "Upvotes": "15"
162 | },
163 | {
164 | "Title": "Longhorn: State Space Models are Amortized Online Learners",
165 | "Abstract": "The most fundamental capability of modern AI methods such as Large Language\nModels (LLMs) is the ability to predict the next token in a long sequence of\ntokens, known as ``sequence modeling.\" Although the Transformers model is the\ncurrent dominant approach to sequence modeling, its quadratic computational\ncost with respect to sequence length is a significant drawback. State-space\nmodels (SSMs) offer a promising alternative due to their linear decoding\nefficiency and high parallelizability during training. However, existing SSMs\noften rely on seemingly ad hoc linear recurrence designs. In this work, we\nexplore SSM design through the lens of online learning, conceptualizing SSMs as\nmeta-modules for specific online learning problems. This approach links SSM\ndesign to formulating precise online learning objectives, with state transition\nrules derived from optimizing these objectives. Based on this insight, we\nintroduce a novel deep SSM architecture based on the implicit update for\noptimizing an online regression objective. Our experimental results show that\nour models outperform state-of-the-art SSMs, including the Mamba model, on\nstandard sequence modeling benchmarks and language modeling tasks.",
166 | "ArXiv Link": "https://arxiv.org/abs/2407.14207",
167 | "PDF Link": "https://arxiv.org/pdf/2407.14207",
168 | "Upvotes": "14"
169 | },
170 | {
171 | "Title": "SV4D: Dynamic 3D Content Generation with Multi-Frame and Multi-View\n Consistency",
172 | "Abstract": "We present Stable Video 4D (SV4D), a latent video diffusion model for\nmulti-frame and multi-view consistent dynamic 3D content generation. Unlike\nprevious methods that rely on separately trained generative models for video\ngeneration and novel view synthesis, we design a unified diffusion model to\ngenerate novel view videos of dynamic 3D objects. Specifically, given a\nmonocular reference video, SV4D generates novel views for each video frame that\nare temporally consistent. We then use the generated novel view videos to\noptimize an implicit 4D representation (dynamic NeRF) efficiently, without the\nneed for cumbersome SDS-based optimization used in most prior works. To train\nour unified novel view video generation model, we curated a dynamic 3D object\ndataset from the existing Objaverse dataset. Extensive experimental results on\nmultiple datasets and user studies demonstrate SV4D's state-of-the-art\nperformance on novel-view video synthesis as well as 4D generation compared to\nprior works.",
173 | "ArXiv Link": "https://arxiv.org/abs/2407.17470",
174 | "PDF Link": "https://arxiv.org/pdf/2407.17470",
175 | "Upvotes": "12"
176 | },
177 | {
178 | "Title": "Learning to Manipulate Anywhere: A Visual Generalizable Framework For\n Reinforcement Learning",
179 | "Abstract": "Can we endow visuomotor robots with generalization capabilities to operate in\ndiverse open-world scenarios? In this paper, we propose Maniwhere, a\ngeneralizable framework tailored for visual reinforcement learning, enabling\nthe trained robot policies to generalize across a combination of multiple\nvisual disturbance types. Specifically, we introduce a multi-view\nrepresentation learning approach fused with Spatial Transformer Network (STN)\nmodule to capture shared semantic information and correspondences among\ndifferent viewpoints. In addition, we employ a curriculum-based randomization\nand augmentation approach to stabilize the RL training process and strengthen\nthe visual generalization ability. To exhibit the effectiveness of Maniwhere,\nwe meticulously design 8 tasks encompassing articulate objects, bi-manual, and\ndexterous hand manipulation tasks, demonstrating Maniwhere's strong visual\ngeneralization and sim2real transfer abilities across 3 hardware platforms. Our\nexperiments show that Maniwhere significantly outperforms existing\nstate-of-the-art methods. Videos are provided at\nhttps://gemcollector.github.io/maniwhere/.",
180 | "ArXiv Link": "https://arxiv.org/abs/2407.15815",
181 | "PDF Link": "https://arxiv.org/pdf/2407.15815",
182 | "Upvotes": "10"
183 | },
184 | {
185 | "Title": "ViPer: Visual Personalization of Generative Models via Individual\n Preference Learning",
186 | "Abstract": "Different users find different images generated for the same prompt\ndesirable. This gives rise to personalized image generation which involves\ncreating images aligned with an individual's visual preference. Current\ngenerative models are, however, unpersonalized, as they are tuned to produce\noutputs that appeal to a broad audience. Using them to generate images aligned\nwith individual users relies on iterative manual prompt engineering by the user\nwhich is inefficient and undesirable. We propose to personalize the image\ngeneration process by first capturing the generic preferences of the user in a\none-time process by inviting them to comment on a small selection of images,\nexplaining why they like or dislike each. Based on these comments, we infer a\nuser's structured liked and disliked visual attributes, i.e., their visual\npreference, using a large language model. These attributes are used to guide a\ntext-to-image model toward producing images that are tuned towards the\nindividual user's visual preference. Through a series of user studies and large\nlanguage model guided evaluations, we demonstrate that the proposed method\nresults in generations that are well aligned with individual users' visual\npreferences.",
187 | "ArXiv Link": "https://arxiv.org/abs/2407.17365",
188 | "PDF Link": "https://arxiv.org/pdf/2407.17365",
189 | "Upvotes": "10"
190 | },
191 | {
192 | "Title": "MOMAland: A Set of Benchmarks for Multi-Objective Multi-Agent\n Reinforcement Learning",
193 | "Abstract": "Many challenging tasks such as managing traffic systems, electricity grids,\nor supply chains involve complex decision-making processes that must balance\nmultiple conflicting objectives and coordinate the actions of various\nindependent decision-makers (DMs). One perspective for formalising and\naddressing such tasks is multi-objective multi-agent reinforcement learning\n(MOMARL). MOMARL broadens reinforcement learning (RL) to problems with multiple\nagents each needing to consider multiple objectives in their learning process.\nIn reinforcement learning research, benchmarks are crucial in facilitating\nprogress, evaluation, and reproducibility. The significance of benchmarks is\nunderscored by the existence of numerous benchmark frameworks developed for\nvarious RL paradigms, including single-agent RL (e.g., Gymnasium), multi-agent\nRL (e.g., PettingZoo), and single-agent multi-objective RL (e.g.,\nMO-Gymnasium). To support the advancement of the MOMARL field, we introduce\nMOMAland, the first collection of standardised environments for multi-objective\nmulti-agent reinforcement learning. MOMAland addresses the need for\ncomprehensive benchmarking in this emerging field, offering over 10 diverse\nenvironments that vary in the number of agents, state representations, reward\nstructures, and utility considerations. To provide strong baselines for future\nresearch, MOMAland also includes algorithms capable of learning policies in\nsuch settings.",
194 | "ArXiv Link": "https://arxiv.org/abs/2407.16312",
195 | "PDF Link": "https://arxiv.org/pdf/2407.16312",
196 | "Upvotes": "9"
197 | },
198 | {
199 | "Title": "Scalify: scale propagation for efficient low-precision LLM training",
200 | "Abstract": "Low-precision formats such as float8 have been introduced in machine learning\naccelerated hardware to improve computational efficiency for large language\nmodels training and inference. Nevertheless, adoption by the ML community has\nbeen slowed down by the complex, and sometimes brittle, techniques required to\nmatch higher precision training accuracy. In this work, we present Scalify, a\nend-to-end scale propagation paradigm for computational graphs, generalizing\nand formalizing existing tensor scaling methods. Experiment results show that\nScalify supports out-of-the-box float8 matrix multiplication and gradients\nrepresentation, as well as float16 optimizer state storage. Our JAX\nimplementation of Scalify is open-sourced at\nhttps://github.com/graphcore-research/jax-scalify",
201 | "ArXiv Link": "https://arxiv.org/abs/2407.17353",
202 | "PDF Link": "https://arxiv.org/pdf/2407.17353",
203 | "Upvotes": "9"
204 | },
205 | {
206 | "Title": "DistilDIRE: A Small, Fast, Cheap and Lightweight Diffusion Synthesized\n Deepfake Detection",
207 | "Abstract": "A dramatic influx of diffusion-generated images has marked recent years,\nposing unique challenges to current detection technologies. While the task of\nidentifying these images falls under binary classification, a seemingly\nstraightforward category, the computational load is significant when employing\nthe \"reconstruction then compare\" technique. This approach, known as DIRE\n(Diffusion Reconstruction Error), not only identifies diffusion-generated\nimages but also detects those produced by GANs, highlighting the technique's\nbroad applicability. To address the computational challenges and improve\nefficiency, we propose distilling the knowledge embedded in diffusion models to\ndevelop rapid deepfake detection models. Our approach, aimed at creating a\nsmall, fast, cheap, and lightweight diffusion synthesized deepfake detector,\nmaintains robust performance while significantly reducing operational demands.\nMaintaining performance, our experimental results indicate an inference speed\n3.2 times faster than the existing DIRE framework. This advance not only\nenhances the practicality of deploying these systems in real-world settings but\nalso paves the way for future research endeavors that seek to leverage\ndiffusion model knowledge.",
208 | "ArXiv Link": "https://arxiv.org/abs/2406.00856",
209 | "PDF Link": "https://arxiv.org/pdf/2406.00856",
210 | "Upvotes": "8"
211 | },
212 | {
213 | "Title": "DreamCar: Leveraging Car-specific Prior for in-the-wild 3D Car\n Reconstruction",
214 | "Abstract": "Self-driving industries usually employ professional artists to build\nexquisite 3D cars. However, it is expensive to craft large-scale digital\nassets. Since there are already numerous datasets available that contain a vast\nnumber of images of cars, we focus on reconstructing high-quality 3D car models\nfrom these datasets. However, these datasets only contain one side of cars in\nthe forward-moving scene. We try to use the existing generative models to\nprovide more supervision information, but they struggle to generalize well in\ncars since they are trained on synthetic datasets not car-specific. In\naddition, The reconstructed 3D car texture misaligns due to a large error in\ncamera pose estimation when dealing with in-the-wild images. These restrictions\nmake it challenging for previous methods to reconstruct complete 3D cars. To\naddress these problems, we propose a novel method, named DreamCar, which can\nreconstruct high-quality 3D cars given a few images even a single image. To\ngeneralize the generative model, we collect a car dataset, named Car360, with\nover 5,600 vehicles. With this dataset, we make the generative model more\nrobust to cars. We use this generative prior specific to the car to guide its\nreconstruction via Score Distillation Sampling. To further complement the\nsupervision information, we utilize the geometric and appearance symmetry of\ncars. Finally, we propose a pose optimization method that rectifies poses to\ntackle texture misalignment. Extensive experiments demonstrate that our method\nsignificantly outperforms existing methods in reconstructing high-quality 3D\ncars. https://xiaobiaodu.github.io/dreamcar-project/{Our code is\navailable.}",
215 | "ArXiv Link": "https://arxiv.org/abs/2407.16988",
216 | "PDF Link": "https://arxiv.org/pdf/2407.16988",
217 | "Upvotes": "6"
218 | },
219 | {
220 | "Title": "CoD, Towards an Interpretable Medical Agent using Chain of Diagnosis",
221 | "Abstract": "The field of medical diagnosis has undergone a significant transformation\nwith the advent of large language models (LLMs), yet the challenges of\ninterpretability within these models remain largely unaddressed. This study\nintroduces Chain-of-Diagnosis (CoD) to enhance the interpretability of\nLLM-based medical diagnostics. CoD transforms the diagnostic process into a\ndiagnostic chain that mirrors a physician's thought process, providing a\ntransparent reasoning pathway. Additionally, CoD outputs the disease confidence\ndistribution to ensure transparency in decision-making. This interpretability\nmakes model diagnostics controllable and aids in identifying critical symptoms\nfor inquiry through the entropy reduction of confidences. With CoD, we\ndeveloped DiagnosisGPT, capable of diagnosing 9604 diseases. Experimental\nresults demonstrate that DiagnosisGPT outperforms other LLMs on diagnostic\nbenchmarks. Moreover, DiagnosisGPT provides interpretability while ensuring\ncontrollability in diagnostic rigor.",
222 | "ArXiv Link": "https://arxiv.org/abs/2407.13301",
223 | "PDF Link": "https://arxiv.org/pdf/2407.13301",
224 | "Upvotes": "52"
225 | },
226 | {
227 | "Title": "KAN or MLP: A Fairer Comparison",
228 | "Abstract": "This paper does not introduce a novel method. Instead, it offers a fairer and\nmore comprehensive comparison of KAN and MLP models across various tasks,\nincluding machine learning, computer vision, audio processing, natural language\nprocessing, and symbolic formula representation. Specifically, we control the\nnumber of parameters and FLOPs to compare the performance of KAN and MLP. Our\nmain observation is that, except for symbolic formula representation tasks, MLP\ngenerally outperforms KAN. We also conduct ablation studies on KAN and find\nthat its advantage in symbolic formula representation mainly stems from its\nB-spline activation function. When B-spline is applied to MLP, performance in\nsymbolic formula representation significantly improves, surpassing or matching\nthat of KAN. However, in other tasks where MLP already excels over KAN,\nB-spline does not substantially enhance MLP's performance. Furthermore, we find\nthat KAN's forgetting issue is more severe than that of MLP in a standard\nclass-incremental continual learning setting, which differs from the findings\nreported in the KAN paper. We hope these results provide insights for future\nresearch on KAN and other MLP alternatives. Project link:\nhttps://github.com/yu-rp/KANbeFair",
229 | "ArXiv Link": "https://arxiv.org/abs/2407.16674",
230 | "PDF Link": "https://arxiv.org/pdf/2407.16674",
231 | "Upvotes": "34"
232 | },
233 | {
234 | "Title": "MovieDreamer: Hierarchical Generation for Coherent Long Visual Sequence",
235 | "Abstract": "Recent advancements in video generation have primarily leveraged diffusion\nmodels for short-duration content. However, these approaches often fall short\nin modeling complex narratives and maintaining character consistency over\nextended periods, which is essential for long-form video production like\nmovies. We propose MovieDreamer, a novel hierarchical framework that integrates\nthe strengths of autoregressive models with diffusion-based rendering to\npioneer long-duration video generation with intricate plot progressions and\nhigh visual fidelity. Our approach utilizes autoregressive models for global\nnarrative coherence, predicting sequences of visual tokens that are\nsubsequently transformed into high-quality video frames through diffusion\nrendering. This method is akin to traditional movie production processes, where\ncomplex stories are factorized down into manageable scene capturing. Further,\nwe employ a multimodal script that enriches scene descriptions with detailed\ncharacter information and visual style, enhancing continuity and character\nidentity across scenes. We present extensive experiments across various movie\ngenres, demonstrating that our approach not only achieves superior visual and\nnarrative quality but also effectively extends the duration of generated\ncontent significantly beyond current capabilities. Homepage:\nhttps://aim-uofa.github.io/MovieDreamer/.",
236 | "ArXiv Link": "https://arxiv.org/abs/2407.16655",
237 | "PDF Link": "https://arxiv.org/pdf/2407.16655",
238 | "Upvotes": "25"
239 | },
240 | {
241 | "Title": "T2V-CompBench: A Comprehensive Benchmark for Compositional Text-to-video\n Generation",
242 | "Abstract": "Text-to-video (T2V) generation models have advanced significantly, yet their\nability to compose different objects, attributes, actions, and motions into a\nvideo remains unexplored. Previous text-to-video benchmarks also neglect this\nimportant ability for evaluation. In this work, we conduct the first systematic\nstudy on compositional text-to-video generation. We propose T2V-CompBench, the\nfirst benchmark tailored for compositional text-to-video generation.\nT2V-CompBench encompasses diverse aspects of compositionality, including\nconsistent attribute binding, dynamic attribute binding, spatial relationships,\nmotion binding, action binding, object interactions, and generative numeracy.\nWe further carefully design evaluation metrics of MLLM-based metrics,\ndetection-based metrics, and tracking-based metrics, which can better reflect\nthe compositional text-to-video generation quality of seven proposed categories\nwith 700 text prompts. The effectiveness of the proposed metrics is verified by\ncorrelation with human evaluations. We also benchmark various text-to-video\ngenerative models and conduct in-depth analysis across different models and\ndifferent compositional categories. We find that compositional text-to-video\ngeneration is highly challenging for current models, and we hope that our\nattempt will shed light on future research in this direction.",
243 | "ArXiv Link": "https://arxiv.org/abs/2407.14505",
244 | "PDF Link": "https://arxiv.org/pdf/2407.14505",
245 | "Upvotes": "21"
246 | },
247 | {
248 | "Title": "OutfitAnyone: Ultra-high Quality Virtual Try-On for Any Clothing and Any\n Person",
249 | "Abstract": "Virtual Try-On (VTON) has become a transformative technology, empowering\nusers to experiment with fashion without ever having to physically try on\nclothing. However, existing methods often struggle with generating\nhigh-fidelity and detail-consistent results. While diffusion models, such as\nStable Diffusion series, have shown their capability in creating high-quality\nand photorealistic images, they encounter formidable challenges in conditional\ngeneration scenarios like VTON. Specifically, these models struggle to maintain\na balance between control and consistency when generating images for virtual\nclothing trials. OutfitAnyone addresses these limitations by leveraging a\ntwo-stream conditional diffusion model, enabling it to adeptly handle garment\ndeformation for more lifelike results. It distinguishes itself with\nscalability-modulating factors such as pose, body shape and broad\napplicability, extending from anime to in-the-wild images. OutfitAnyone's\nperformance in diverse scenarios underscores its utility and readiness for\nreal-world deployment. For more details and animated results, please see\nhttps://humanaigc.github.io/outfit-anyone/.",
250 | "ArXiv Link": "https://arxiv.org/abs/2407.16224",
251 | "PDF Link": "https://arxiv.org/pdf/2407.16224",
252 | "Upvotes": "20"
253 | },
254 | {
255 | "Title": "INF-LLaVA: Dual-perspective Perception for High-Resolution Multimodal\n Large Language Model",
256 | "Abstract": "With advancements in data availability and computing resources, Multimodal\nLarge Language Models (MLLMs) have showcased capabilities across various\nfields. However, the quadratic complexity of the vision encoder in MLLMs\nconstrains the resolution of input images. Most current approaches mitigate\nthis issue by cropping high-resolution images into smaller sub-images, which\nare then processed independently by the vision encoder. Despite capturing\nsufficient local details, these sub-images lack global context and fail to\ninteract with one another. To address this limitation, we propose a novel MLLM,\nINF-LLaVA, designed for effective high-resolution image perception. INF-LLaVA\nincorporates two innovative components. First, we introduce a Dual-perspective\nCropping Module (DCM), which ensures that each sub-image contains continuous\ndetails from a local perspective and comprehensive information from a global\nperspective. Second, we introduce Dual-perspective Enhancement Module (DEM) to\nenable the mutual enhancement of global and local features, allowing INF-LLaVA\nto effectively process high-resolution images by simultaneously capturing\ndetailed local information and comprehensive global context. Extensive ablation\nstudies validate the effectiveness of these components, and experiments on a\ndiverse set of benchmarks demonstrate that INF-LLaVA outperforms existing\nMLLMs. Code and pretrained model are available at\nhttps://github.com/WeihuangLin/INF-LLaVA.",
257 | "ArXiv Link": "https://arxiv.org/abs/2407.16198",
258 | "PDF Link": "https://arxiv.org/pdf/2407.16198",
259 | "Upvotes": "12"
260 | },
261 | {
262 | "Title": "F-HOI: Toward Fine-grained Semantic-Aligned 3D Human-Object Interactions",
263 | "Abstract": "Existing 3D human object interaction (HOI) datasets and models simply align\nglobal descriptions with the long HOI sequence, while lacking a detailed\nunderstanding of intermediate states and the transitions between states. In\nthis paper, we argue that fine-grained semantic alignment, which utilizes\nstate-level descriptions, offers a promising paradigm for learning semantically\nrich HOI representations. To achieve this, we introduce Semantic-HOI, a new\ndataset comprising over 20K paired HOI states with fine-grained descriptions\nfor each HOI state and the body movements that happen between two consecutive\nstates. Leveraging the proposed dataset, we design three state-level HOI tasks\nto accomplish fine-grained semantic alignment within the HOI sequence.\nAdditionally, we propose a unified model called F-HOI, designed to leverage\nmultimodal instructions and empower the Multi-modal Large Language Model to\nefficiently handle diverse HOI tasks. F-HOI offers multiple advantages: (1) It\nemploys a unified task formulation that supports the use of versatile\nmultimodal inputs. (2) It maintains consistency in HOI across 2D, 3D, and\nlinguistic spaces. (3) It utilizes fine-grained textual supervision for direct\noptimization, avoiding intricate modeling of HOI states. Extensive experiments\nreveal that F-HOI effectively aligns HOI states with fine-grained semantic\ndescriptions, adeptly tackling understanding, reasoning, generation, and\nreconstruction tasks.",
264 | "ArXiv Link": "https://arxiv.org/abs/2407.12435",
265 | "PDF Link": "https://arxiv.org/pdf/2407.12435",
266 | "Upvotes": "10"
267 | },
268 | {
269 | "Title": "A Simulation Benchmark for Autonomous Racing with Large-Scale Human Data",
270 | "Abstract": "Despite the availability of international prize-money competitions, scaled\nvehicles, and simulation environments, research on autonomous racing and the\ncontrol of sports cars operating close to the limit of handling has been\nlimited by the high costs of vehicle acquisition and management, as well as the\nlimited physics accuracy of open-source simulators. In this paper, we propose a\nracing simulation platform based on the simulator Assetto Corsa to test,\nvalidate, and benchmark autonomous driving algorithms, including reinforcement\nlearning (RL) and classical Model Predictive Control (MPC), in realistic and\nchallenging scenarios. Our contributions include the development of this\nsimulation platform, several state-of-the-art algorithms tailored to the racing\nenvironment, and a comprehensive dataset collected from human drivers.\nAdditionally, we evaluate algorithms in the offline RL setting. All the\nnecessary code (including environment and benchmarks), working examples,\ndatasets, and videos are publicly released and can be found at:\nhttps://assetto-corsa-gym.github.io.",
271 | "ArXiv Link": "https://arxiv.org/abs/2407.16680",
272 | "PDF Link": "https://arxiv.org/pdf/2407.16680",
273 | "Upvotes": "9"
274 | },
275 | {
276 | "Title": "SIGMA: Sinkhorn-Guided Masked Video Modeling",
277 | "Abstract": "Video-based pretraining offers immense potential for learning strong visual\nrepresentations on an unprecedented scale. Recently, masked video modeling\nmethods have shown promising scalability, yet fall short in capturing\nhigher-level semantics due to reconstructing predefined low-level targets such\nas pixels. To tackle this, we present Sinkhorn-guided Masked Video Modelling\n(SIGMA), a novel video pretraining method that jointly learns the video model\nin addition to a target feature space using a projection network. However, this\nsimple modification means that the regular L2 reconstruction loss will lead to\ntrivial solutions as both networks are jointly optimized. As a solution, we\ndistribute features of space-time tubes evenly across a limited number of\nlearnable clusters. By posing this as an optimal transport problem, we enforce\nhigh entropy in the generated features across the batch, infusing semantic and\ntemporal meaning into the feature space. The resulting cluster assignments are\nused as targets for a symmetric prediction task where the video model predicts\ncluster assignment of the projection network and vice versa. Experimental\nresults on ten datasets across three benchmarks validate the effectiveness of\nSIGMA in learning more performant, temporally-aware, and robust video\nrepresentations improving upon state-of-the-art methods. Our project website\nwith code is available at: https://quva-lab.github.io/SIGMA.",
278 | "ArXiv Link": "https://arxiv.org/abs/2407.15447",
279 | "PDF Link": "https://arxiv.org/pdf/2407.15447",
280 | "Upvotes": "5"
281 | },
282 | {
283 | "Title": "PrimeGuard: Safe and Helpful LLMs through Tuning-Free Routing",
284 | "Abstract": "Deploying language models (LMs) necessitates outputs to be both high-quality\nand compliant with safety guidelines. Although Inference-Time Guardrails (ITG)\noffer solutions that shift model output distributions towards compliance, we\nfind that current methods struggle in balancing safety with helpfulness. ITG\nMethods that safely address non-compliant queries exhibit lower helpfulness\nwhile those that prioritize helpfulness compromise on safety. We refer to this\ntrade-off as the guardrail tax, analogous to the alignment tax. To address\nthis, we propose PrimeGuard, a novel ITG method that utilizes structured\ncontrol flow.\n PrimeGuard routes requests to different self-instantiations of the LM with\nvarying instructions, leveraging its inherent instruction-following\ncapabilities and in-context learning. Our tuning-free approach dynamically\ncompiles system-designer guidelines for each query. We construct and release\nsafe-eval, a diverse red-team safety benchmark. Extensive evaluations\ndemonstrate that PrimeGuard, without fine-tuning, overcomes the guardrail tax\nby (1) significantly increasing resistance to iterative jailbreak attacks and\n(2) achieving state-of-the-art results in safety guardrailing while (3)\nmatching helpfulness scores of alignment-tuned models. Extensive evaluations\ndemonstrate that PrimeGuard, without fine-tuning, outperforms all competing\nbaselines and overcomes the guardrail tax by improving the fraction of safe\nresponses from 61% to 97% and increasing average helpfulness scores from 4.17\nto 4.29 on the largest models, while reducing attack success rate from 100% to\n8%.\n PrimeGuard implementation is available at\nhttps://github.com/dynamofl/PrimeGuard and safe-eval dataset is available at\nhttps://huggingface.co/datasets/dynamoai/safe_eval.",
285 | "ArXiv Link": "https://arxiv.org/abs/2407.16318",
286 | "PDF Link": "https://arxiv.org/pdf/2407.16318",
287 | "Upvotes": "4"
288 | },
289 | {
290 | "Title": "Cross Anything: General Quadruped Robot Navigation through Complex\n Terrains",
291 | "Abstract": "The application of vision-language models (VLMs) has achieved impressive\nsuccess in various robotics tasks, but there are few explorations for\nfoundation models used in quadruped robot navigation. We introduce Cross\nAnything System (CAS), an innovative system composed of a high-level reasoning\nmodule and a low-level control policy, enabling the robot to navigate across\ncomplex 3D terrains and reach the goal position. For high-level reasoning and\nmotion planning, we propose a novel algorithmic system taking advantage of a\nVLM, with a design of task decomposition and a closed-loop sub-task execution\nmechanism. For low-level locomotion control, we utilize the Probability\nAnnealing Selection (PAS) method to train a control policy by reinforcement\nlearning. Numerous experiments show that our whole system can accurately and\nrobustly navigate across complex 3D terrains, and its strong generalization\nability ensures the applications in diverse indoor and outdoor scenarios and\nterrains. Project page: https://cross-anything.github.io/",
292 | "ArXiv Link": "https://arxiv.org/abs/2407.16412",
293 | "PDF Link": "https://arxiv.org/pdf/2407.16412",
294 | "Upvotes": "3"
295 | },
296 | {
297 | "Title": "SlowFast-LLaVA: A Strong Training-Free Baseline for Video Large Language\n Models",
298 | "Abstract": "We propose SlowFast-LLaVA (or SF-LLaVA for short), a training-free video\nlarge language model (LLM) that can jointly capture the detailed spatial\nsemantics and long-range temporal context without exceeding the token budget of\ncommonly used LLMs. This is realized by using a two-stream SlowFast design of\ninputs for Video LLMs to aggregate features from sampled video frames in an\neffective way. Specifically, the Slow pathway extracts features at a low frame\nrate while keeping as many spatial details as possible (e.g., with 24x24\ntokens), and the Fast pathway operates on a high frame rate but uses a larger\nspatial pooling stride (e.g., downsampling 6x) to focus on the motion cues. As\na result, this design allows us to adequately capture both spatial and temporal\nfeatures that are beneficial for understanding details along the video.\nExperimental results show that SF-LLaVA outperforms existing training-free\nmethods on a wide range of video tasks. On some benchmarks, it achieves\ncomparable or even better performance compared to state-of-the-art Video LLMs\nthat are fine-tuned on video datasets.",
299 | "ArXiv Link": "https://arxiv.org/abs/2407.15841",
300 | "PDF Link": "https://arxiv.org/pdf/2407.15841",
301 | "Upvotes": "32"
302 | },
303 | {
304 | "Title": "NNsight and NDIF: Democratizing Access to Foundation Model Internals",
305 | "Abstract": "The enormous scale of state-of-the-art foundation models has limited their\naccessibility to scientists, because customized experiments at large model\nsizes require costly hardware and complex engineering that is impractical for\nmost researchers. To alleviate these problems, we introduce NNsight, an\nopen-source Python package with a simple, flexible API that can express\ninterventions on any PyTorch model by building computation graphs. We also\nintroduce NDIF, a collaborative research platform providing researchers access\nto foundation-scale LLMs via the NNsight API. Code, documentation, and\ntutorials are available at https://www.nnsight.net.",
306 | "ArXiv Link": "https://arxiv.org/abs/2407.14561",
307 | "PDF Link": "https://arxiv.org/pdf/2407.14561",
308 | "Upvotes": "32"
309 | },
310 | {
311 | "Title": "Knowledge Mechanisms in Large Language Models: A Survey and Perspective",
312 | "Abstract": "Understanding knowledge mechanisms in Large Language Models (LLMs) is crucial\nfor advancing towards trustworthy AGI. This paper reviews knowledge mechanism\nanalysis from a novel taxonomy including knowledge utilization and evolution.\nKnowledge utilization delves into the mechanism of memorization, comprehension\nand application, and creation. Knowledge evolution focuses on the dynamic\nprogression of knowledge within individual and group LLMs. Moreover, we discuss\nwhat knowledge LLMs have learned, the reasons for the fragility of parametric\nknowledge, and the potential dark knowledge (hypothesis) that will be\nchallenging to address. We hope this work can help understand knowledge in LLMs\nand provide insights for future research.",
313 | "ArXiv Link": "https://arxiv.org/abs/2407.15017",
314 | "PDF Link": "https://arxiv.org/pdf/2407.15017",
315 | "Upvotes": "31"
316 | },
317 | {
318 | "Title": "Compact Language Models via Pruning and Knowledge Distillation",
319 | "Abstract": "Large language models (LLMs) targeting different deployment scales and sizes\nare currently produced by training each variant from scratch; this is extremely\ncompute-intensive. In this paper, we investigate if pruning an existing LLM and\nthen re-training it with a fraction (<3%) of the original training data can be\na suitable alternative to repeated, full retraining. To this end, we develop a\nset of practical and effective compression best practices for LLMs that combine\ndepth, width, attention and MLP pruning with knowledge distillation-based\nretraining; we arrive at these best practices through a detailed empirical\nexploration of pruning strategies for each axis, methods to combine axes,\ndistillation strategies, and search techniques for arriving at optimal\ncompressed architectures. We use this guide to compress the Nemotron-4 family\nof LLMs by a factor of 2-4x, and compare their performance to similarly-sized\nmodels on a variety of language modeling tasks. Deriving 8B and 4B models from\nan already pretrained 15B model using our approach requires up to 40x fewer\ntraining tokens per model compared to training from scratch; this results in\ncompute cost savings of 1.8x for training the full model family (15B, 8B, and\n4B). Minitron models exhibit up to a 16% improvement in MMLU scores compared to\ntraining from scratch, perform comparably to other community models such as\nMistral 7B, Gemma 7B and Llama-3 8B, and outperform state-of-the-art\ncompression techniques from the literature. We have open-sourced Minitron model\nweights on Huggingface, with corresponding supplementary material including\nexample code available on GitHub.",
320 | "ArXiv Link": "https://arxiv.org/abs/2407.14679",
321 | "PDF Link": "https://arxiv.org/pdf/2407.14679",
322 | "Upvotes": "29"
323 | },
324 | {
325 | "Title": "POGEMA: A Benchmark Platform for Cooperative Multi-Agent Navigation",
326 | "Abstract": "Multi-agent reinforcement learning (MARL) has recently excelled in solving\nchallenging cooperative and competitive multi-agent problems in various\nenvironments with, mostly, few agents and full observability. Moreover, a range\nof crucial robotics-related tasks, such as multi-robot navigation and obstacle\navoidance, that have been conventionally approached with the classical\nnon-learnable methods (e.g., heuristic search) is currently suggested to be\nsolved by the learning-based or hybrid methods. Still, in this domain, it is\nhard, not to say impossible, to conduct a fair comparison between classical,\nlearning-based, and hybrid approaches due to the lack of a unified framework\nthat supports both learning and evaluation. To this end, we introduce POGEMA, a\nset of comprehensive tools that includes a fast environment for learning, a\ngenerator of problem instances, the collection of pre-defined ones, a\nvisualization toolkit, and a benchmarking tool that allows automated\nevaluation. We introduce and specify an evaluation protocol defining a range of\ndomain-related metrics computed on the basics of the primary evaluation\nindicators (such as success rate and path length), allowing a fair multi-fold\ncomparison. The results of such a comparison, which involves a variety of\nstate-of-the-art MARL, search-based, and hybrid methods, are presented.",
327 | "ArXiv Link": "https://arxiv.org/abs/2407.14931",
328 | "PDF Link": "https://arxiv.org/pdf/2407.14931",
329 | "Upvotes": "18"
330 | },
331 | {
332 | "Title": "VideoGameBunny: Towards vision assistants for video games",
333 | "Abstract": "Large multimodal models (LMMs) hold substantial promise across various\ndomains, from personal assistance in daily tasks to sophisticated applications\nlike medical diagnostics. However, their capabilities have limitations in the\nvideo game domain, such as challenges with scene understanding, hallucinations,\nand inaccurate descriptions of video game content, especially in open-source\nmodels. This paper describes the development of VideoGameBunny, a LLaVA-style\nmodel based on Bunny, specifically tailored for understanding images from video\ngames. We release intermediate checkpoints, training logs, and an extensive\ndataset comprising 185,259 video game images from 413 titles, along with\n389,565 image-instruction pairs that include image captions, question-answer\npairs, and a JSON representation of 16 elements of 136,974 images. Our\nexperiments show that our high quality game-related data has the potential to\nmake a relatively small model outperform the much larger state-of-the-art model\nLLaVa-1.6-34b (which has more than 4x the number of parameters). Our study\npaves the way for future research in video game understanding on tasks such as\nplaying, commentary, and debugging. Code and data are available at\nhttps://videogamebunny.github.io/",
334 | "ArXiv Link": "https://arxiv.org/abs/2407.15295",
335 | "PDF Link": "https://arxiv.org/pdf/2407.15295",
336 | "Upvotes": "18"
337 | },
338 | {
339 | "Title": "LongVideoBench: A Benchmark for Long-context Interleaved Video-Language\n Understanding",
340 | "Abstract": "Large multimodal models (LMMs) are processing increasingly longer and richer\ninputs. Albeit the progress, few public benchmark is available to measure such\ndevelopment. To mitigate this gap, we introduce LongVideoBench, a\nquestion-answering benchmark that features video-language interleaved inputs up\nto an hour long. Our benchmark includes 3,763 varying-length web-collected\nvideos with their subtitles across diverse themes, designed to comprehensively\nevaluate LMMs on long-term multimodal understanding. To achieve this, we\ninterpret the primary challenge as to accurately retrieve and reason over\ndetailed multimodal information from long inputs. As such, we formulate a novel\nvideo question-answering task termed referring reasoning. Specifically, as part\nof the question, it contains a referring query that references related video\ncontexts, called referred context. The model is then required to reason over\nrelevant video details from the referred context. Following the paradigm of\nreferring reasoning, we curate 6,678 human-annotated multiple-choice questions\nin 17 fine-grained categories, establishing one of the most comprehensive\nbenchmarks for long-form video understanding. Evaluations suggest that the\nLongVideoBench presents significant challenges even for the most advanced\nproprietary models (e.g. GPT-4o, Gemini-1.5-Pro, GPT-4-Turbo), while their\nopen-source counterparts show an even larger performance gap. In addition, our\nresults indicate that model performance on the benchmark improves only when\nthey are capable of processing more frames, positioning LongVideoBench as a\nvaluable benchmark for evaluating future-generation long-context LMMs.",
341 | "ArXiv Link": "https://arxiv.org/abs/2407.15754",
342 | "PDF Link": "https://arxiv.org/pdf/2407.15754",
343 | "Upvotes": "16"
344 | },
345 | {
346 | "Title": "BoostMVSNeRFs: Boosting MVS-based NeRFs to Generalizable View Synthesis\n in Large-scale Scenes",
347 | "Abstract": "While Neural Radiance Fields (NeRFs) have demonstrated exceptional quality,\ntheir protracted training duration remains a limitation. Generalizable and\nMVS-based NeRFs, although capable of mitigating training time, often incur\ntradeoffs in quality. This paper presents a novel approach called BoostMVSNeRFs\nto enhance the rendering quality of MVS-based NeRFs in large-scale scenes. We\nfirst identify limitations in MVS-based NeRF methods, such as restricted\nviewport coverage and artifacts due to limited input views. Then, we address\nthese limitations by proposing a new method that selects and combines multiple\ncost volumes during volume rendering. Our method does not require training and\ncan adapt to any MVS-based NeRF methods in a feed-forward fashion to improve\nrendering quality. Furthermore, our approach is also end-to-end trainable,\nallowing fine-tuning on specific scenes. We demonstrate the effectiveness of\nour method through experiments on large-scale datasets, showing significant\nrendering quality improvements in large-scale scenes and unbounded outdoor\nscenarios. We release the source code of BoostMVSNeRFs at\nhttps://su-terry.github.io/BoostMVSNeRFs/.",
348 | "ArXiv Link": "https://arxiv.org/abs/2407.15848",
349 | "PDF Link": "https://arxiv.org/pdf/2407.15848",
350 | "Upvotes": "15"
351 | },
352 | {
353 | "Title": "BOND: Aligning LLMs with Best-of-N Distillation",
354 | "Abstract": "Reinforcement learning from human feedback (RLHF) is a key driver of quality\nand safety in state-of-the-art large language models. Yet, a surprisingly\nsimple and strong inference-time strategy is Best-of-N sampling that selects\nthe best generation among N candidates. In this paper, we propose Best-of-N\nDistillation (BOND), a novel RLHF algorithm that seeks to emulate Best-of-N but\nwithout its significant computational overhead at inference time. Specifically,\nBOND is a distribution matching algorithm that forces the distribution of\ngenerations from the policy to get closer to the Best-of-N distribution. We use\nthe Jeffreys divergence (a linear combination of forward and backward KL) to\nbalance between mode-covering and mode-seeking behavior, and derive an\niterative formulation that utilizes a moving anchor for efficiency. We\ndemonstrate the effectiveness of our approach and several design choices\nthrough experiments on abstractive summarization and Gemma models. Aligning\nGemma policies with BOND outperforms other RLHF algorithms by improving results\non several benchmarks.",
355 | "ArXiv Link": "https://arxiv.org/abs/2407.14622",
356 | "PDF Link": "https://arxiv.org/pdf/2407.14622",
357 | "Upvotes": "11"
358 | },
359 | {
360 | "Title": "Consent in Crisis: The Rapid Decline of the AI Data Commons",
361 | "Abstract": "General-purpose artificial intelligence (AI) systems are built on massive\nswathes of public web data, assembled into corpora such as C4, RefinedWeb, and\nDolma. To our knowledge, we conduct the first, large-scale, longitudinal audit\nof the consent protocols for the web domains underlying AI training corpora.\nOur audit of 14,000 web domains provides an expansive view of crawlable web\ndata and how consent preferences to use it are changing over time. We observe a\nproliferation of AI-specific clauses to limit use, acute differences in\nrestrictions on AI developers, as well as general inconsistencies between\nwebsites' expressed intentions in their Terms of Service and their robots.txt.\nWe diagnose these as symptoms of ineffective web protocols, not designed to\ncope with the widespread re-purposing of the internet for AI. Our longitudinal\nanalyses show that in a single year (2023-2024) there has been a rapid\ncrescendo of data restrictions from web sources, rendering ~5%+ of all tokens\nin C4, or 28%+ of the most actively maintained, critical sources in C4, fully\nrestricted from use. For Terms of Service crawling restrictions, a full 45% of\nC4 is now restricted. If respected or enforced, these restrictions are rapidly\nbiasing the diversity, freshness, and scaling laws for general-purpose AI\nsystems. We hope to illustrate the emerging crisis in data consent, foreclosing\nmuch of the open web, not only for commercial AI, but non-commercial AI and\nacademic purposes.",
362 | "ArXiv Link": "https://arxiv.org/abs/2407.14933",
363 | "PDF Link": "https://arxiv.org/pdf/2407.14933",
364 | "Upvotes": "9"
365 | },
366 | {
367 | "Title": "Artist: Aesthetically Controllable Text-Driven Stylization without Training",
368 | "Abstract": "Diffusion models entangle content and style generation during the denoising\nprocess, leading to undesired content modification when directly applied to\nstylization tasks. Existing methods struggle to effectively control the\ndiffusion model to meet the aesthetic-level requirements for stylization. In\nthis paper, we introduce Artist, a training-free approach that\naesthetically controls the content and style generation of a pretrained\ndiffusion model for text-driven stylization. Our key insight is to disentangle\nthe denoising of content and style into separate diffusion processes while\nsharing information between them. We propose simple yet effective content and\nstyle control methods that suppress style-irrelevant content generation,\nresulting in harmonious stylization results. Extensive experiments demonstrate\nthat our method excels at achieving aesthetic-level stylization requirements,\npreserving intricate details in the content image and aligning well with the\nstyle prompt. Furthermore, we showcase the highly controllability of the\nstylization strength from various perspectives. Code will be released, project\nhome page: https://DiffusionArtist.github.io",
369 | "ArXiv Link": "https://arxiv.org/abs/2407.15842",
370 | "PDF Link": "https://arxiv.org/pdf/2407.15842",
371 | "Upvotes": "9"
372 | },
373 | {
374 | "Title": "Cinemo: Consistent and Controllable Image Animation with Motion Diffusion Models",
375 | "Abstract": "Diffusion models have achieved great progress in image animation due to\npowerful generative capabilities. However, maintaining spatio-temporal\nconsistency with detailed information from the input static image over time\n(e.g., style, background, and object of the input static image) and ensuring\nsmoothness in animated video narratives guided by textual prompts still remains\nchallenging. In this paper, we introduce Cinemo, a novel image animation\napproach towards achieving better motion controllability, as well as stronger\ntemporal consistency and smoothness. In general, we propose three effective\nstrategies at the training and inference stages of Cinemo to accomplish our\ngoal. At the training stage, Cinemo focuses on learning the distribution of\nmotion residuals, rather than directly predicting subsequent via a motion\ndiffusion model. Additionally, a structural similarity index-based strategy is\nproposed to enable Cinemo to have better controllability of motion intensity.\nAt the inference stage, a noise refinement technique based on discrete cosine\ntransformation is introduced to mitigate sudden motion changes. Such three\nstrategies enable Cinemo to produce highly consistent, smooth, and\nmotion-controllable results. Compared to previous methods, Cinemo offers\nsimpler and more precise user controllability. Extensive experiments against\nseveral state-of-the-art methods, including both commercial tools and research\napproaches, across multiple metrics, demonstrate the effectiveness and\nsuperiority of our proposed approach.",
376 | "ArXiv Link": "https://arxiv.org/abs/2407.15642",
377 | "PDF Link": "https://arxiv.org/pdf/2407.15642",
378 | "Upvotes": "9"
379 | },
380 | {
381 | "Title": "HoloDreamer: Holistic 3D Panoramic World Generation from Text Descriptions",
382 | "Abstract": "3D scene generation is in high demand across various domains, including\nvirtual reality, gaming, and the film industry. Owing to the powerful\ngenerative capabilities of text-to-image diffusion models that provide reliable\npriors, the creation of 3D scenes using only text prompts has become viable,\nthereby significantly advancing researches in text-driven 3D scene generation.\nIn order to obtain multiple-view supervision from 2D diffusion models,\nprevailing methods typically employ the diffusion model to generate an initial\nlocal image, followed by iteratively outpainting the local image using\ndiffusion models to gradually generate scenes. Nevertheless, these\noutpainting-based approaches prone to produce global inconsistent scene\ngeneration results without high degree of completeness, restricting their\nbroader applications. To tackle these problems, we introduce HoloDreamer, a\nframework that first generates high-definition panorama as a holistic\ninitialization of the full 3D scene, then leverage 3D Gaussian Splatting\n(3D-GS) to quickly reconstruct the 3D scene, thereby facilitating the creation\nof view-consistent and fully enclosed 3D scenes. Specifically, we propose\nStylized Equirectangular Panorama Generation, a pipeline that combines multiple\ndiffusion models to enable stylized and detailed equirectangular panorama\ngeneration from complex text prompts. Subsequently, Enhanced Two-Stage Panorama\nReconstruction is introduced, conducting a two-stage optimization of 3D-GS to\ninpaint the missing region and enhance the integrity of the scene.\nComprehensive experiments demonstrated that our method outperforms prior works\nin terms of overall visual consistency and harmony as well as reconstruction\nquality and rendering robustness when generating fully enclosed scenes.",
383 | "ArXiv Link": "https://arxiv.org/abs/2407.15187",
384 | "PDF Link": "https://arxiv.org/pdf/2407.15187",
385 | "Upvotes": "9"
386 | },
387 | {
388 | "Title": "AssistantBench: Can Web Agents Solve Realistic and Time-Consuming Tasks?",
389 | "Abstract": "Language agents, built on top of language models (LMs), are systems that can\ninteract with complex environments, such as the open web. In this work, we\nexamine whether such agents can perform realistic and time-consuming tasks on\nthe web, e.g., monitoring real-estate markets or locating relevant nearby\nbusinesses. We introduce AssistantBench, a challenging new benchmark consisting\nof 214 realistic tasks that can be automatically evaluated, covering different\nscenarios and domains. We find that AssistantBench exposes the limitations of\ncurrent systems, including language models and retrieval-augmented language\nmodels, as no model reaches an accuracy of more than 25 points. While\nclosed-book LMs perform well, they exhibit low precision since they tend to\nhallucinate facts. State-of-the-art web agents reach a score of near zero.\nAdditionally, we introduce SeePlanAct (SPA), a new web agent that significantly\noutperforms previous agents, and an ensemble of SPA and closed-book models\nreaches the best overall performance. Moreover, we analyze failures of current\nsystems and highlight that web navigation remains a major challenge.",
390 | "ArXiv Link": "https://arxiv.org/abs/2407.15711",
391 | "PDF Link": "https://arxiv.org/pdf/2407.15711",
392 | "Upvotes": "7"
393 | },
394 | {
395 | "Title": "MusiConGen: Rhythm and Chord Control for Transformer-Based Text-to-Music Generation",
396 | "Abstract": "Existing text-to-music models can produce high-quality audio with great\ndiversity. However, textual prompts alone cannot precisely control temporal\nmusical features such as chords and rhythm of the generated music. To address\nthis challenge, we introduce MusiConGen, a temporally-conditioned\nTransformer-based text-to-music model that builds upon the pretrained MusicGen\nframework. Our innovation lies in an efficient finetuning mechanism, tailored\nfor consumer-grade GPUs, that integrates automatically-extracted rhythm and\nchords as the condition signal. During inference, the condition can either be\nmusical features extracted from a reference audio signal, or be user-defined\nsymbolic chord sequence, BPM, and textual prompts. Our performance evaluation\non two datasets -- one derived from extracted features and the other from\nuser-created inputs -- demonstrates that MusiConGen can generate realistic\nbacking track music that aligns well with the specified conditions. We\nopen-source the code and model checkpoints, and provide audio examples online,\nhttps://musicongen.github.io/musicongen_demo/.",
397 | "ArXiv Link": "https://arxiv.org/abs/2407.15060",
398 | "PDF Link": "https://arxiv.org/pdf/2407.15060",
399 | "Upvotes": "7"
400 | },
401 | {
402 | "Title": "Conditioned Language Policy: A General Framework for Steerable Multi-Objective Finetuning",
403 | "Abstract": "Reward-based finetuning is crucial for aligning language policies with\nintended behaviors (e.g., creativity and safety). A key challenge here is to\ndevelop steerable language models that trade-off multiple (conflicting)\nobjectives in a flexible and efficient manner. This paper presents Conditioned\nLanguage Policy (CLP), a general framework for finetuning language models on\nmultiple objectives. Building on techniques from multi-task training and\nparameter-efficient finetuning, CLP can learn steerable models that effectively\ntrade-off conflicting objectives at inference time. Notably, this does not\nrequire training or maintaining multiple models to achieve different trade-offs\nbetween the objectives. Through an extensive set of experiments and ablations,\nwe show that the CLP framework learns steerable models that outperform and\nPareto-dominate the current state-of-the-art approaches for multi-objective\nfinetuning.",
404 | "ArXiv Link": "https://arxiv.org/abs/2407.15762",
405 | "PDF Link": "https://arxiv.org/pdf/2407.15762",
406 | "Upvotes": "7"
407 | },
408 | {
409 | "Title": "CGB-DM: Content and Graphic Balance Layout Generation with Transformer-based Diffusion Model",
410 | "Abstract": "Layout generation is the foundation task of intelligent design, which\nrequires the integration of visual aesthetics and harmonious expression of\ncontent delivery. However, existing methods still face challenges in generating\nprecise and visually appealing layouts, including blocking, overlap, or spatial\nmisalignment between layouts, which are closely related to the spatial\nstructure of graphic layouts. We find that these methods overly focus on\ncontent information and lack constraints on layout spatial structure, resulting\nin an imbalance of learning content-aware and graphic-aware features. To tackle\nthis issue, we propose Content and Graphic Balance Layout Generation with\nTransformer-based Diffusion Model (CGB-DM). Specifically, we first design a\nregulator that balances the predicted content and graphic weight, overcoming\nthe tendency of paying more attention to the content on canvas. Secondly, we\nintroduce a graphic constraint of saliency bounding box to further enhance the\nalignment of geometric features between layout representations and images. In\naddition, we adapt a transformer-based diffusion model as the backbone, whose\npowerful generation capability ensures the quality in layout generation.\nExtensive experimental results indicate that our method has achieved\nstate-of-the-art performance in both quantitative and qualitative evaluations.\nOur model framework can also be expanded to other graphic design fields.",
411 | "ArXiv Link": "https://arxiv.org/abs/2407.15233",
412 | "PDF Link": "https://arxiv.org/pdf/2407.15233",
413 | "Upvotes": "6"
414 | },
415 | {
416 | "Title": "MIBench: Evaluating Multimodal Large Language Models over Multiple Images",
417 | "Abstract": "Built on the power of LLMs, numerous multimodal large language models (MLLMs)\nhave recently achieved remarkable performance on various vision-language tasks\nacross multiple benchmarks. However, most existing MLLMs and benchmarks\nprimarily focus on single-image input scenarios, leaving the performance of\nMLLMs when handling realistic multiple images remain underexplored. Although a\nfew benchmarks consider multiple images, their evaluation dimensions and\nsamples are very limited. Therefore, in this paper, we propose a new benchmark\nMIBench, to comprehensively evaluate fine-grained abilities of MLLMs in\nmulti-image scenarios. Specifically, MIBench categorizes the multi-image\nabilities into three scenarios: multi-image instruction (MII), multimodal\nknowledge-seeking (MKS) and multimodal in-context learning (MIC), and\nconstructs 13 tasks with a total of 13K annotated samples. During data\nconstruction, for MII and MKS, we extract correct options from manual\nannotations and create challenging distractors to obtain multiple-choice\nquestions. For MIC, to enable an in-depth evaluation, we set four sub-tasks and\ntransform the original datasets into in-context learning formats. We evaluate\nseveral open-source MLLMs and close-source MLLMs on the proposed MIBench. The\nresults reveal that although current models excel in single-image tasks, they\nexhibit significant shortcomings when faced with multi-image inputs, such as\nconfused fine-grained perception, limited multi-image reasoning, and unstable\nin-context learning. The annotated data in MIBench is available at\nhttps://huggingface.co/datasets/StarBottle/MIBench.",
418 | "ArXiv Link": "https://arxiv.org/abs/2407.15272",
419 | "PDF Link": "https://arxiv.org/pdf/2407.15272",
420 | "Upvotes": "6"
421 | },
422 | {
423 | "Title": "Discrete Flow Matching",
424 | "Abstract": "Despite Flow Matching and diffusion models having emerged as powerful\ngenerative paradigms for continuous variables such as images and videos, their\napplication to high-dimensional discrete data, such as language, is still\nlimited. In this work, we present Discrete Flow Matching, a novel discrete flow\nparadigm designed specifically for generating discrete data. Discrete Flow\nMatching offers several key contributions: (i) it works with a general family\nof probability paths interpolating between source and target distributions;\n(ii) it allows for a generic formula for sampling from these probability paths\nusing learned posteriors such as the probability denoiser (x-prediction) and\nnoise-prediction (epsilon-prediction); (iii) practically, focusing on\nspecific probability paths defined with different schedulers considerably\nimproves generative perplexity compared to previous discrete diffusion and flow\nmodels; and (iv) by scaling Discrete Flow Matching models up to 1.7B\nparameters, we reach 6.7% Pass@1 and 13.4% Pass@10 on HumanEval and 6.7% Pass@1\nand 20.6% Pass@10 on 1-shot MBPP coding benchmarks. Our approach is capable of\ngenerating high-quality discrete data in a non-autoregressive fashion,\nsignificantly closing the gap between autoregressive models and discrete flow\nmodels.",
425 | "ArXiv Link": "https://arxiv.org/abs/2407.15595",
426 | "PDF Link": "https://arxiv.org/pdf/2407.15595",
427 | "Upvotes": "5"
428 | },
429 | {
430 | "Title": "Local All-Pair Correspondence for Point Tracking",
431 | "Abstract": "We introduce LocoTrack, a highly accurate and efficient model designed for\nthe task of tracking any point (TAP) across video sequences. Previous\napproaches in this task often rely on local 2D correlation maps to establish\ncorrespondences from a point in the query image to a local region in the target\nimage, which often struggle with homogeneous regions or repetitive features,\nleading to matching ambiguities. LocoTrack overcomes this challenge with a\nnovel approach that utilizes all-pair correspondences across regions, i.e.,\nlocal 4D correlation, to establish precise correspondences, with bidirectional\ncorrespondence and matching smoothness significantly enhancing robustness\nagainst ambiguities. We also incorporate a lightweight correlation encoder to\nenhance computational efficiency, and a compact Transformer architecture to\nintegrate long-term temporal information. LocoTrack achieves unmatched accuracy\non all TAP-Vid benchmarks and operates at a speed almost 6 times faster than\nthe current state-of-the-art.",
432 | "ArXiv Link": "https://arxiv.org/abs/2407.15420",
433 | "PDF Link": "https://arxiv.org/pdf/2407.15420",
434 | "Upvotes": "5"
435 | },
436 | {
437 | "Title": "ThermalNeRF: Thermal Radiance Fields",
438 | "Abstract": "Thermal imaging has a variety of applications, from agricultural monitoring\nto building inspection to imaging under poor visibility, such as in low light,\nfog, and rain. However, reconstructing thermal scenes in 3D presents several\nchallenges due to the comparatively lower resolution and limited features\npresent in long-wave infrared (LWIR) images. To overcome these challenges, we\npropose a unified framework for scene reconstruction from a set of LWIR and RGB\nimages, using a multispectral radiance field to represent a scene viewed by\nboth visible and infrared cameras, thus leveraging information across both\nspectra. We calibrate the RGB and infrared cameras with respect to each other,\nas a preprocessing step using a simple calibration target. We demonstrate our\nmethod on real-world sets of RGB and LWIR photographs captured from a handheld\nthermal camera, showing the effectiveness of our method at scene representation\nacross the visible and infrared spectra. We show that our method is capable of\nthermal super-resolution, as well as visually removing obstacles to reveal\nobjects that are occluded in either the RGB or thermal channels. Please see\nhttps://yvette256.github.io/thermalnerf for video results as well as our code\nand dataset release.",
439 | "ArXiv Link": "https://arxiv.org/abs/2407.15337",
440 | "PDF Link": "https://arxiv.org/pdf/2407.15337",
441 | "Upvotes": "5"
442 | },
443 | {
444 | "Title": "Temporal Residual Jacobians For Rig-free Motion Transfer",
445 | "Abstract": "We introduce Temporal Residual Jacobians as a novel representation to enable\ndata-driven motion transfer. Our approach does not assume access to any rigging\nor intermediate shape keyframes, produces geometrically and temporally\nconsistent motions, and can be used to transfer long motion sequences. Central\nto our approach are two coupled neural networks that individually predict local\ngeometric and temporal changes that are subsequently integrated, spatially and\ntemporally, to produce the final animated meshes. The two networks are jointly\ntrained, complement each other in producing spatial and temporal signals, and\nare supervised directly with 3D positional information. During inference, in\nthe absence of keyframes, our method essentially solves a motion extrapolation\nproblem. We test our setup on diverse meshes (synthetic and scanned shapes) to\ndemonstrate its superiority in generating realistic and natural-looking\nanimations on unseen body shapes against SoTA alternatives. Supplemental video\nand code are available at https://temporaljacobians.github.io/ .",
446 | "ArXiv Link": "https://arxiv.org/abs/2407.14958",
447 | "PDF Link": "https://arxiv.org/pdf/2407.14958",
448 | "Upvotes": "5"
449 | },
450 | {
451 | "Title": "GET-Zero: Graph Embodiment Transformer for Zero-shot Embodiment Generalization",
452 | "Abstract": "This paper introduces GET-Zero, a model architecture and training procedure\nfor learning an embodiment-aware control policy that can immediately adapt to\nnew hardware changes without retraining. To do so, we present Graph Embodiment\nTransformer (GET), a transformer model that leverages the embodiment graph\nconnectivity as a learned structural bias in the attention mechanism. We use\nbehavior cloning to distill demonstration data from embodiment-specific expert\npolicies into an embodiment-aware GET model that conditions on the hardware\nconfiguration of the robot to make control decisions. We conduct a case study\non a dexterous in-hand object rotation task using different configurations of a\nfour-fingered robot hand with joints removed and with link length extensions.\nUsing the GET model along with a self-modeling loss enables GET-Zero to\nzero-shot generalize to unseen variation in graph structure and link length,\nyielding a 20% improvement over baseline methods. All code and qualitative\nvideo results are on https://get-zero-paper.github.io",
453 | "ArXiv Link": "https://arxiv.org/abs/2407.15002",
454 | "PDF Link": "https://arxiv.org/pdf/2407.15002",
455 | "Upvotes": "4"
456 | },
457 | {
458 | "Title": "Visual Haystacks: Answering Harder Questions About Sets of Images",
459 | "Abstract": "Recent advancements in Large Multimodal Models (LMMs) have made significant\nprogress in the field of single-image visual question answering. However, these\nmodels face substantial challenges when tasked with queries that span extensive\ncollections of images, similar to real-world scenarios like searching through\nlarge photo albums, finding specific information across the internet, or\nmonitoring environmental changes through satellite imagery. This paper explores\nthe task of Multi-Image Visual Question Answering (MIQA): given a large set of\nimages and a natural language query, the task is to generate a relevant and\ngrounded response. We propose a new public benchmark, dubbed \"Visual Haystacks\n(VHs),\" specifically designed to evaluate LMMs' capabilities in visual\nretrieval and reasoning over sets of unrelated images, where we perform\ncomprehensive evaluations demonstrating that even robust closed-source models\nstruggle significantly. Towards addressing these shortcomings, we introduce\nMIRAGE (Multi-Image Retrieval Augmented Generation), a novel retrieval/QA\nframework tailored for LMMs that confronts the challenges of MIQA with marked\nefficiency and accuracy improvements over baseline methods. Our evaluation\nshows that MIRAGE surpasses closed-source GPT-4o models by up to 11% on the VHs\nbenchmark and offers up to 3.4x improvements in efficiency over text-focused\nmulti-stage approaches.",
460 | "ArXiv Link": "https://arxiv.org/abs/2407.13766",
461 | "PDF Link": "https://arxiv.org/pdf/2407.13766",
462 | "Upvotes": "2"
463 | }
464 | ]
--------------------------------------------------------------------------------
/unique_data.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | # Read the scraped JSON data from disk
4 | with open('extracted_data.json', 'r', encoding='utf-8') as file:
5 |     data = json.load(file)
6 |
7 | # Use a dict keyed by ArXiv link to drop duplicate entries (first occurrence wins)
8 | unique_data = {}
9 | for entry in data:
10 |     arxiv_link = entry['ArXiv Link']
11 |     if arxiv_link not in unique_data:
12 |         unique_data[arxiv_link] = entry
13 |
14 | # Convert back to a list
15 | unique_data_list = list(unique_data.values())
16 |
17 | # Write the deduplicated data to a new JSON file
18 | with open('unique_data.json', 'w', encoding='utf-8') as file:
19 |     json.dump(unique_data_list, file, indent=4)
20 |
21 | print("Deduplicated data saved to 'unique_data.json'")
22 |
--------------------------------------------------------------------------------
/write_to_html.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | from jinja2 import Environment, FileSystemLoader
3 | import json
4 | from openai import OpenAI
5 | import time
6 | from config import API_KEY, BASE_URL
7 |
8 | client = OpenAI(
9 | api_key=API_KEY,
10 | base_url=BASE_URL,
11 | )
12 |
13 | def askLLM(message, retries=10, delay=8):
14 |     """
15 |     Send a message list to the LLM, retrying after a short wait on failure.
16 |
17 |     :param message: list of chat messages to send to the LLM
18 |     :param retries: number of attempts, default 10
19 |     :param delay: seconds to wait between attempts, default 8
20 |     :return: the LLM's response content, or None if every attempt fails
21 |     """
22 |     for attempt in range(retries):
23 |         try:
24 |             response = client.chat.completions.create(
25 |                 model="moonshot-v1-128k",
26 |                 temperature=0.7,
27 |                 max_tokens=2000,
28 |                 messages=message,
29 |             )
30 |             # Make sure the response actually contains content
31 |             if response.choices and response.choices[0].message.content:
32 |                 return response.choices[0].message.content
33 |             else:
34 |                 raise ValueError("Response from LLM is missing content.")
35 |         except Exception as e:
36 |             print(f"Attempt {attempt + 1} failed with error: {e}")
37 |             if attempt < retries - 1:
38 |                 print(f"Waiting {delay} seconds before retrying...")
39 |                 time.sleep(delay)
40 |             else:
41 |                 print("Max retries reached. No response received from LLM.")
42 |                 return None
43 |
44 | # Read the summarized article data
45 | with open('articles_summary.json', 'r', encoding='utf-8') as f:
46 |     data = json.load(f)
47 |
48 | content = str(data)  # serialize the whole JSON payload for the prompt
49 | content = content[:50000]  # truncate to stay within the model's context window
50 |
51 | message_1 =[
52 | {"role": "system", "content": "根据Json中的本周AI论文信息,通俗幽默地用连续100字以内分点介绍本周论文看点摘要,不需要每篇论文都介绍,只节选部分有趣的内容作为看点,使用中文。"
53 | "尽可能避免使用专业词汇,用通俗易懂的语言进行代替。充分使用不同的emoji表情。例如(🔬,🥇,🎉,🎮️等emoji)"},
54 | {"role": "user", "content": f"根据Json中的本周AI论文信息,通俗幽默地用一段100字文字分点总结出本周论文看点,不需要每篇论文都介绍,精选部分有趣的作为看点,使用中文,充分使用不同的emoji表情。Json内容:{content}。"
55 | f"Output format(100字以内!精选3~5篇论文即可):"
56 | f"本周AI论文看点如下:"
57 | f"[emoji1] ..."
58 | f"[emoji2] ..."
59 | f"[emoji3] ..."
60 | f"......"
61 | f"更多关于本周论文的详细信息,让我们接着看下去吧~"},
62 | ]
63 |
64 |
65 | # Set up the Jinja2 template environment (templates are loaded from the current directory)
66 | env = Environment(loader=FileSystemLoader('.'), autoescape=True)
67 |
68 | # Generate the weekly highlights summary for the rendering context
69 | summary = askLLM(message_1)
70 |
71 | # Get the current date
72 | current_date = datetime.now().date()
73 |
74 | # Format the date as a string (avoid reusing the name `time`, which would shadow the module used in askLLM)
75 | date_str = current_date.strftime('%Y-%m-%d')
76 |
77 | # Load the HTML template
78 | template = env.get_template('news_template.html')
79 |
80 | # Render the HTML template, passing the extra fields alongside the articles
81 | output = template.render(articles=data, summary=summary, time=date_str)
82 |
83 | # Write the rendered result to an HTML file
84 | with open('output.html', 'w', encoding='utf-8') as f:
85 |     f.write(output)
86 |
87 | # Prompt for the group announcement copy
88 | message_2 =[
89 | {"role": "system", "content": "根据本周AI论文信息,通俗幽默地用分点的方式重新排版输出,并按照规定格式,使用丰富的emoji表情。"},
90 | {"role": "user", "content": f"""根据提供的周报摘要以及输出模板,整理出本周周报信息。使用丰富的emoji表情
91 | 周报摘要:{summary}
92 | 输出格式:
93 | 本周AI论文关键词:xxx xxx xxx
94 | 特别看点
95 | :
96 | 1
97 | 2
98 | 3"""},
99 | ]
100 |
101 | notice = askLLM(message_2) or ""  # fall back to an empty string if every retry failed
102 |
103 | notice = f"🔥 {date_str} 特工宇宙AI论文周报\n\n" + notice
104 |
105 | print(notice)
106 |
--------------------------------------------------------------------------------
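Note: write_to_html.py imports API_KEY and BASE_URL from a config module that does not appear in the listing above (credentials files are typically kept out of version control). Below is a minimal sketch, with placeholder values only, of what such a config.py is expected to provide; the Moonshot base URL is an assumption inferred from the "moonshot-v1-128k" model name used in the script.

# config.py -- hypothetical sketch, not part of the repository listing above
API_KEY = "sk-your-key-here"              # placeholder; substitute your own API key
BASE_URL = "https://api.moonshot.cn/v1"   # assumed OpenAI-compatible Moonshot endpoint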