   },
   "thumbnail": "/sam2act.png"
 },
+{
+  "title": "Manipulate-Anything: Automating Real-World Robots using Vision-Language Models",
+  "authors": [
+    "Jiafei Duan",
+    "Wentao Yuan",
+    "Wilbert Pumacay",
+    "Yi Ru Wang",
+    "Kiana Ehsani",
+    "Dieter Fox",
+    "Ranjay Krishna"
+  ],
+  "year": "2024",
+  "venue": "CoRL 2024",
+  "links": {
+    "pdf": "https://arxiv.org/abs/2406.18915"
+  },
+  "thumbnail": "/manipulateanything.png"
+},
+{
+  "title": "EVE: Enabling Anyone to Train Robots using Augmented Reality",
+  "authors": [
+    "Jun Wang",
+    "Chun-Cheng Chang",
+    "Jiafei Duan",
+    "Dieter Fox",
+    "Ranjay Krishna"
+  ],
+  "year": "2024",
+  "venue": "UIST 2024",
+  "links": {
+    "pdf": "https://arxiv.org/abs/2404.06089"
+  },
+  "thumbnail": "/eve.png"
+},
+{
+  "title": "Videoshop: Localized Semantic Video Editing with Noise-Extrapolated Diffusion Inversion",
+  "authors": [
+    "Xiang Fan",
+    "Anand Bhattad",
+    "Ranjay Krishna"
+  ],
+  "year": "2024",
+  "venue": "ECCV 2024",
+  "links": {
+    "pdf": "https://arxiv.org/abs/2403.14617"
+  },
+  "thumbnail": "/videoshop.png"
+},
+{
+  "title": "Found in the middle: Calibrating Positional Attention Bias Improves Long Context Utilization",
+  "authors": [
+    "Cheng-Yu Hsieh",
+    "Yung-Sung Chuang",
+    "Chun-Liang Li",
+    "Zifeng Wang",
+    "Long Le",
+    "Abhishek Kumar",
+    "James R. Glass",
+    "Alexander Ratner",
+    "Chen-Yu Lee",
+    "Ranjay Krishna",
+    "Tomas Pfister"
+  ],
+  "year": "2024",
+  "venue": "ACL Findings 2024",
+  "links": {
+    "pdf": "https://arxiv.org/abs/2406.16008"
+  },
+  "thumbnail": "/foundinthemiddle.png"
+},
+{
+  "title": "Iterated Learning Improves Compositionality in Large Vision-Language Models",
+  "authors": [
+    "Chenhao Zheng",
+    "Jieyu Zhang",
+    "Aniruddha Kembhavi",
+    "Ranjay Krishna"
+  ],
+  "year": "2024",
+  "venue": "CVPR 2024",
+  "links": {
+    "pdf": "https://arxiv.org/abs/2404.02145"
+  },
+  "thumbnail": "/iteratedlearning.png"
+},
+{
+  "title": "The Hard Positive Truth about Vision-Language Compositionality",
+  "authors": [
+    "Amita Kamath",
+    "Cheng-Yu Hsieh",
+    "Kai-Wei Chang",
+    "Ranjay Krishna"
+  ],
+  "year": "2024",
+  "venue": "ECCV 2024",
+  "links": {
+    "pdf": "https://arxiv.org/abs/2409.17958"
+  },
+  "thumbnail": "/hardpositivetruth.png"
+},
+{
+  "title": "m&m's: A Benchmark to Evaluate Tool-Use for multi-step multi-modal Tasks",
+  "authors": [
+    "Zixian Ma",
+    "Weikai Huang",
+    "Jieyu Zhang",
+    "Tanmay Gupta",
+    "Ranjay Krishna"
+  ],
+  "year": "2024",
+  "venue": "ECCV 2024",
+  "links": {
+    "pdf": "https://arxiv.org/abs/2403.11085"
+  },
+  "thumbnail": "/mnms.png"
+},
 {
   "title": "Unsettling the Hegemony of Intention: Agonistic Image Generation",
   "authors": [
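For reference, every entry added in this diff follows the same record shape. Below is a minimal TypeScript sketch of that shape; the `Publication` interface name and the assumption that the site reads this JSON from TypeScript are mine, not from the diff, and the fields simply mirror the JSON keys above.

// Sketch of the shape each entry in the publications JSON array takes.
// Field names mirror the keys in the diff; everything else is an assumption.
interface Publication {
  title: string;
  authors: string[];
  year: string;              // stored as a string in the JSON, e.g. "2024"
  venue: string;             // e.g. "CoRL 2024", "ECCV 2024"
  links: { pdf?: string };   // only a "pdf" link appears in these entries
  thumbnail: string;         // site-relative image path, e.g. "/eve.png"
}

// Example: the EVE entry from this diff, typed against the sketch above.
const eve: Publication = {
  title: "EVE: Enabling Anyone to Train Robots using Augmented Reality",
  authors: ["Jun Wang", "Chun-Cheng Chang", "Jiafei Duan", "Dieter Fox", "Ranjay Krishna"],
  year: "2024",
  venue: "UIST 2024",
  links: { pdf: "https://arxiv.org/abs/2404.06089" },
  thumbnail: "/eve.png",
};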