Splat-MOVER: Multi-Stage, Open-Vocabulary Robotic Manipulation via Editable Gaussian Splatting

% arXiv preprint entry for the Splat-MOVER paper.
% The eprint identifier below is the one embedded in the url field.
% Only the system name and the proper adjective "Gaussian" are brace-protected
% in the title, so bibliography styles remain free to apply their own casing.
% NOTE(review): month = nov is later than both urldate (2024-07-17) and the
% arXiv identifier prefix 2405 -- confirm which version date is intended.
@misc{shorinwa_splat-mover_2024,
  author = {Shorinwa, Ola and Tucker, Johnathan and Smith, Aliyah and Swann, Aiden and Chen, Timothy and Firoozi, Roya and Kennedy, III, Monroe and Schwager, Mac},
  title = {{Splat-MOVER}: Multi-Stage, Open-Vocabulary Robotic Manipulation via Editable {Gaussian} Splatting},
  shorttitle = {{Splat-MOVER}},
  publisher = {arXiv},
  month = nov,
  year = {2024},
  eprint = {2405.04378},
  archiveprefix = {arXiv},
  url = {https://arxiv.org/abs/2405.04378},
  urldate = {2024-07-17},
  abstract = {We present Splat-MOVER, a modular robotics stack for open-vocabulary robotic manipulation, which leverages the editability of Gaussian Splatting (GSplat) scene representations to enable multi-stage manipulation tasks. Splat-MOVER consists of: (i) ASK-Splat, a GSplat representation that distills semantic and grasp affordance features into the 3D scene. ASK-Splat enables geometric, semantic, and affordance understanding of 3D scenes, which is critical in many robotics tasks; (ii) SEE-Splat, a real-time scene-editing module using 3D semantic masking and infilling to visualize the motions of objects that result from robot interactions in the real-world. SEE-Splat creates a ``digital twin'' of the evolving environment throughout the manipulation task; and (iii) Grasp-Splat, a grasp generation module that uses ASK-Splat and SEE-Splat to propose affordance-aligned candidate grasps for open-world objects. ASK-Splat is trained in real-time from RGB images in a brief scanning phase prior to operation, while SEE-Splat and Grasp-Splat run in real-time during operation. We demonstrate the superior performance of Splat-MOVER in hardware experiments on a Kinova robot compared to two recent baselines in four single-stage, open-vocabulary manipulation tasks and in four multi-stage manipulation tasks, using the edited scene to reflect changes due to prior manipulation stages, which is not possible with existing baselines. The project page is available at https://splatmover.github.io, and the code for the project will be made available after review.},
  keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Robotics, semantic\_splats},
  month_numeric = {11},
}