<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<!-- Meta tags for social media banners; these should be filled in appropriately, as they are your "business card" -->
<meta name="description" content="Co-MTP: a cooperative trajectory prediction framework with multi-temporal fusion for autonomous driving (ICRA 2025).">
<meta property="og:title" content="Co-MTP: A Cooperative Trajectory Prediction Framework with Multi-Temporal Fusion for Autonomous Driving"/>
<meta property="og:description" content="Co-MTP leverages V2X cooperation to fuse history and future information for trajectory prediction in autonomous driving."/>
<meta property="og:url" content="URL OF THE WEBSITE"/>
<!-- Path to banner image; should be at the path listed below. Optimal dimensions are 1200x630 -->
<meta property="og:image" content="static/images/your_banner_image.png" />
<meta property="og:image:width" content="1200"/>
<meta property="og:image:height" content="630"/>
<meta name="twitter:title" content="Co-MTP: Cooperative Trajectory Prediction with Multi-Temporal Fusion">
<meta name="twitter:description" content="Co-MTP leverages V2X cooperation to fuse history and future information for trajectory prediction in autonomous driving.">
<!-- Path to banner image; should be at the path listed below. Optimal dimensions are 1200x600 -->
<meta name="twitter:image" content="static/images/your_twitter_banner_image.png">
<meta name="twitter:card" content="summary_large_image">
<!-- Keywords for your paper to be indexed by -->
<meta name="keywords" content="cooperative trajectory prediction, V2X, multi-temporal fusion, autonomous driving">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Co-MTP: A Cooperative Trajectory Prediction Framework with Multi-Temporal Fusion for Autonomous Driving</title>
<link rel="icon" type="image/x-icon" href="static/images/favicon.ico">
<link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
rel="stylesheet">
<link rel="stylesheet" href="static/css/bulma.min.css">
<link rel="stylesheet" href="static/css/bulma-carousel.min.css">
<link rel="stylesheet" href="static/css/bulma-slider.min.css">
<link rel="stylesheet" href="static/css/fontawesome.all.min.css">
<link rel="stylesheet"
href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
<link rel="stylesheet" href="static/css/index.css">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
<script src="https://documentcloud.adobe.com/view-sdk/main.js"></script>
<script defer src="static/js/fontawesome.all.min.js"></script>
<script src="static/js/bulma-carousel.min.js"></script>
<script src="static/js/bulma-slider.min.js"></script>
<script src="static/js/index.js"></script>
</head>
<body>
<style>
  .custom-container {
    max-width: 80%; /* cap width at 80% of the page */
  }
  .full-container {
    max-width: 100%; /* allow the full page width */
  }
</style>
<section class="hero">
<div class="hero-body">
<div class="container full-container">
<div class="columns is-centered">
<div class="column has-text-centered">
<h1 class="title is-1 publication-title">Co-MTP: A Cooperative Trajectory Prediction Framework with Multi-Temporal Fusion for Autonomous Driving</h1>
<div class="is-size-5 publication-authors">
<!-- Paper authors -->
<span class="author-block">
<a href="FIRST AUTHOR PERSONAL LINK" target="_blank" rel="noopener noreferrer">Xinyu Zhang</a><sup>*</sup>,</span>
<span class="author-block">
<a href="https://zewei-zhou.github.io/" target="_blank" rel="noopener noreferrer">Zewei Zhou</a><sup>*</sup>,</span>
<span class="author-block">
<a href="https://wi11ione.github.io/" target="_blank" rel="noopener noreferrer">Zhaoyi Wang</a>,</span>
<span class="author-block">
<a href="FOURTH AUTHOR PERSONAL LINK" target="_blank" rel="noopener noreferrer">Yangjie Ji</a>,</span>
<span class="author-block">
<a href="https://www.linkedin.com/in/yanjun-huang-46099b82/" target="_blank" rel="noopener noreferrer">Yanjun Huang</a>,</span>
<span class="author-block">
<a href="https://scholar.google.com/citations?hl=zh-CN&user=n_eA148AAAAJ" target="_blank" rel="noopener noreferrer">Hong Chen</a>
</span>
</div>
<div class="is-size-5 publication-authors">
<span class="author-block">Tongji University</span>
<span class="eql-cntrb"><small><br><sup>*</sup>Indicates Equal Contribution</small></span>
</div>
<div class="has-text-centered" style="margin-top: 10px;">
<strong class="is-size-4" style="color: red;">ICRA 2025</strong>
</div>
<div class="column has-text-centered">
<div class="publication-links">
<!-- Arxiv PDF link -->
<span class="link-block">
<a href="static/pdfs/ICRA_cooperative_prediction.pdf" target="_blank"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fas fa-file-pdf"></i>
</span>
<span>Paper</span>
</a>
</span>
<!-- Supplementary PDF link -->
<!-- <span class="link-block">
<a href="static/pdfs/supplementary_material.pdf" target="_blank"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fas fa-file-pdf"></i>
</span>
<span>Supplementary</span>
</a>
</span> -->
<!-- Github link -->
<span class="link-block">
<a href="https://github.com/xiaomiaozhang/Co-MTP" target="_blank"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fab fa-github"></i>
</span>
<span>Code</span>
</a>
</span>
<!-- ArXiv abstract Link -->
<span class="link-block">
<a href="https://arxiv.org/abs/2502.16589" target="_blank" rel="noopener noreferrer" class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<span class="ai ai-arxiv"></span>
</span>
<span>arXiv</span>
</a>
</span>
</div>
</div>
</div>
</div>
</div>
</div>
</section>
<!-- Paper abstract -->
<section class="section hero is-light">
<div class="container custom-container">
<div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<h2 class="title is-3">Abstract</h2>
<div class="content has-text-justified">
<p>
Vehicle-to-everything (V2X) technologies have become an ideal paradigm for extending the perception range and seeing through occlusions. Existing efforts focus on single-frame cooperative perception; however, how to capture the temporal cues between frames with V2X, so as to facilitate the prediction task and even the planning task, remains underexplored. In this paper, we introduce Co-MTP, a general cooperative trajectory prediction framework with multi-temporal fusion for autonomous driving, which leverages the V2X system to fully capture the interactions among agents in both the history and future domains to benefit planning. In the history domain, V2X can complement the incomplete history trajectories of single-vehicle perception, and we design a heterogeneous graph transformer to learn the fusion of history features from multiple agents and capture history interactions. Moreover, the goal of prediction is to support future planning. Thus, in the future domain, V2X can provide the prediction results of surrounding objects, and we further extend the graph transformer to capture the future interactions between the ego plan and the other vehicles' intentions and to obtain the final future scenario state under a certain planning action. We evaluate the Co-MTP framework on the real-world dataset V2X-Seq, and the results show that Co-MTP achieves state-of-the-art performance and that both history and future fusion greatly benefit prediction.
</p>
</div>
</div>
</div>
</div>
</section>
<!-- End paper abstract -->
<style>
  /* Center the whole module */
  .full-container {
    display: flex;
    justify-content: center; /* horizontal centering */
    align-items: center;     /* vertical centering */
  }
  /* Constrain the content column's width */
  .column.is-four-fifths {
    width: 80%; /* adjust the column width as needed */
  }
  /* Left-align section titles */
  .title.is-4 {
    text-align: left;
  }
  /* Center images */
  .content img {
    display: block;
    margin: 0 auto;
  }
  /* Justify body text */
  .content.has-text-justified {
    text-align: justify;
  }
</style>
<!-- Method -->
<section class="hero teaser">
<div class="container full-container">
<div class="column is-four-fifths">
<h2 class="title is-4">Co-MTP Framework</h2>
<div class="content has-text-justified">
<img src="static/images/overview.jpg" alt="Overview of the Co-MTP architecture"/>
<p>
<strong>The overall architecture of Co-MTP.</strong> In this framework, the infrastructure shares its history observations and prediction results with the ego CAV. We then construct a heterogeneous scene graph from the processed trajectory data and map information, categorizing nodes according to the types of objects and map elements. Next, we initialize the features of nodes and edges in the relative coordinate system of each object. The CTCA Fusion module updates the features of the nodes and edges selected by the STSA module over K Transformer layers. Finally, we take the nodes' hidden features from the last layer and feed them into the Multimodal Decoder to obtain the multimodal trajectory prediction results.
</p>
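<p>
For intuition, the minimal sketch below mirrors the fuse-then-decode flow described above: agent tokens, K Transformer layers, then a multimodal decoder. It is an illustrative PyTorch placeholder, not the released Co-MTP implementation: the heterogeneous edges, map elements, STSA selection, and relative-coordinate initialization are omitted, and all names and dimensions are assumptions.
</p>
<pre><code>
# Hypothetical sketch of the fuse-then-decode flow; not the authors' code.
import torch
import torch.nn as nn

class CoMTPSketch(nn.Module):
    def __init__(self, d_model=128, n_heads=8, n_layers=4, n_modes=6, horizon=50):
        super().__init__()
        # Embed per-agent state features into node tokens.
        self.node_embed = nn.Linear(4, d_model)  # e.g. (x, y, heading, speed)
        # K Transformer layers stand in for the CTCA fusion stage.
        layer = nn.TransformerEncoderLayer(d_model, n_heads, batch_first=True)
        self.fusion = nn.TransformerEncoder(layer, n_layers)
        # Multimodal decoder: one trajectory and one score per mode.
        self.traj_head = nn.Linear(d_model, n_modes * horizon * 2)
        self.score_head = nn.Linear(d_model, n_modes)
        self.n_modes, self.horizon = n_modes, horizon

    def forward(self, agent_states):
        # agent_states: (batch, n_agents, 4) fused ego + infrastructure states
        tokens = self.node_embed(agent_states)
        fused = self.fusion(tokens)  # interaction-aware node features
        trajs = self.traj_head(fused).view(
            *fused.shape[:2], self.n_modes, self.horizon, 2)
        scores = self.score_head(fused).softmax(-1)
        return trajs, scores  # multimodal trajectories and mode probabilities

model = CoMTPSketch()
trajs, scores = model(torch.randn(1, 8, 4))
print(trajs.shape, scores.shape)  # (1, 8, 6, 50, 2) and (1, 8, 6)
</code></pre>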
</div>
</div>
</div>
</section>
<!-- End Method -->
<!-- Experiment -->
<section class="hero teaser">
<div class="container full-container">
<div class="column is-four-fifths">
<h2 class="title is-4">Experiment</h2>
<div class="content has-text-justified">
<div class="has-text-centered">
<img src="static/images/main_result.png" alt="Performance comparison table on the V2X-Seq benchmark" class="new-image_1"/>
<p style="text-align: justify;">
<strong>Performance comparison on the V2X-Seq dataset.</strong> TNT, HiVT, and V2X-Graph are existing methods on the V2X-Seq dataset. Co-HTTP is the baseline model, a simplified version of our Co-MTP model. Co-MTP ranks first in minADE, minFDE, and MR on the dataset's benchmark.
</p>
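<p>
For reference, minADE, minFDE, and MR are standard displacement metrics; the sketch below gives their usual Argoverse-style definitions, where the mode with the smallest endpoint error is selected and a miss is counted when that endpoint error exceeds a threshold. The 2 m threshold and the mode-selection rule are assumptions of common practice, not taken from the paper.
</p>
<pre><code>
# Sketch of the standard metric definitions with NumPy; illustrative only.
import numpy as np

def min_ade_fde_mr(pred, gt, miss_threshold=2.0):
    """pred: (n_modes, horizon, 2) predicted positions; gt: (horizon, 2)."""
    dists = np.linalg.norm(pred - gt[None], axis=-1)  # (n_modes, horizon)
    ade = dists.mean(axis=1)   # average displacement error per mode
    fde = dists[:, -1]         # final displacement error per mode
    best = fde.argmin()        # mode with the closest endpoint
    miss = float(fde[best] > miss_threshold)  # averaged over a dataset, this is MR
    return ade[best], fde[best], miss

pred = np.random.randn(6, 50, 2)  # 6 modes, 5 s at 10 Hz (assumed)
gt = np.random.randn(50, 2)
print(min_ade_fde_mr(pred, gt))
</code></pre>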
</div>
<!-- spacer -->
<br>
<div class="has-text-centered">
<img src="static/images/ablation_study.png" alt="Ablation study results table" class="new-image_2"/>
<p style="text-align: justify;">
<strong>Results of the model ablation study.</strong> We examine the effectiveness of the multi-view data processing strategies and the decoder, assessing Co-MTP variants separately along the history and future time dimensions.
</p>
</div>
<!-- spacer -->
<br>
<div class="has-text-centered">
<!-- image container -->
<div style="display: flex; justify-content: center; gap: 20px;">
<img src="static/images/noise.png" alt="Robustness to positional noise" style="max-width: 47%;"/>
<img src="static/images/time_delay.png" alt="Robustness to communication delay" style="max-width: 47%;"/>
</div>
<p style="text-align: justify; margin-top: 0px;">
<strong>Robustness assessment.</strong> We assess robustness by introducing positional noise and communication delays, assuming a positional deviation of 0.2 meters and a time delay of 0.5 seconds. We run the experiments on the same Co-MTP model base alongside two variants: Co-MTP-no fusion, which excludes future fusion, and Co-HTTP-nofut, which simply stitches the trajectories together without future information.
</p>
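<p>
As a rough illustration of how such perturbations can be injected, the sketch below adds Gaussian positional noise (0.2 m standard deviation) and shifts the shared observations by 0.5 s to mimic communication latency. The 10 Hz sampling rate and the exact injection scheme are assumptions; the paper's protocol may differ.
</p>
<pre><code>
# Hypothetical perturbation of a shared trajectory; illustrative only.
import numpy as np

def perturb(traj, sigma=0.2, delay_s=0.5, dt=0.1):
    """traj: (timesteps, 2) cooperative trajectory sampled every dt seconds."""
    noisy = traj + np.random.normal(0.0, sigma, traj.shape)  # positional noise
    k = int(delay_s / dt)  # number of delayed frames
    if k == 0:
        return noisy
    # Mimic latency: shift observations back by k frames, repeating the
    # oldest frame so the sequence length stays fixed.
    return np.concatenate([np.repeat(noisy[:1], k, axis=0), noisy[:-k]])

traj = np.cumsum(np.random.randn(30, 2), axis=0)  # a random walk as dummy input
print(perturb(traj).shape)  # (30, 2)
</code></pre>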
</div>
<div class="gif-container">
<!-- container for each GIF and its caption -->
<div class="gif-item">
<img src="static/images/visualize_pra_1001.gif" alt="Qualitative prediction example (a)" class="gif-image"/>
<p class="gif-caption">(a)</p>
</div>
<div class="gif-item">
<img src="static/images/visualize_pra_10025.gif" alt="Qualitative prediction example (b)" class="gif-image"/>
<p class="gif-caption">(b)</p>
</div>
<div class="gif-item">
<img src="static/images/visualize_pra_10058.gif" alt="Qualitative prediction example (c)" class="gif-image"/>
<p class="gif-caption">(c)</p>
</div>
<div class="gif-item">
<img src="static/images/visualize_pra_10078.gif" alt="Qualitative prediction example (d)" class="gif-image"/>
<p class="gif-caption">(d)</p>
</div>
<p>
<strong>Qualitative examples of Co-MTP on the V2X-Seq dataset.</strong> The red boxes are the AV, while the orange ones are the predicted targets. The history ground truth is shown in blue, the predicted trajectories in green, and the future ground truth in brown.
</p>
</div>
</div>
</div>
</div>
</section>
<!-- End Experiment -->
<style>
  .gif-container {
    display: flex;
    flex-wrap: wrap;
    gap: 0px; /* spacing between GIFs */
  }
  .new-image_1 {
    max-width: 53%; /* maximum image width */
    margin: 20px 0; /* vertical margins */
  }
  .new-image_2 {
    max-width: 100%; /* maximum image width */
    margin: 20px 0;  /* vertical margins */
  }
  .gif-item {
    flex: 1 1 calc(50% - 0px); /* two GIFs per row, minus the gap */
    max-width: calc(50% - 0px); /* ensure only two per row */
    text-align: center; /* center the captions */
  }
  .gif-image {
    width: 100%;       /* fill the container width */
    height: auto;      /* preserve the aspect ratio */
    max-height: 600px; /* cap the height; adjust as needed */
  }
  .gif-caption {
    margin-top: 0px; /* spacing between caption and GIF */
    font-size: 14px; /* caption font size */
    color: #333;     /* caption color */
  }
</style>
<!--BibTex citation -->
<section class="section" id="BibTeX">
<div class="container is-max-desktop content">
<h2 class="title">BibTeX</h2>
<pre><code>@misc{zhang2025comtpcooperativetrajectoryprediction,
title={Co-MTP: A Cooperative Trajectory Prediction Framework with Multi-Temporal Fusion for Autonomous Driving},
author={Xinyu Zhang and Zewei Zhou and Zhaoyi Wang and Yangjie Ji and Yanjun Huang and Hong Chen},
year={2025},
eprint={2502.16589},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2502.16589},
}</code></pre>
</div>
</section>
<!--End BibTex citation -->
<!-- <footer class="footer">
<div class="container">
<div class="columns is-centered">
<div class="column is-8">
<div class="content">
<p>
This page was built using the <a href="https://github.com/eliahuhorwitz/Academic-project-page-template" target="_blank">Academic Project Page Template</a> which was adopted from the <a href="https://nerfies.github.io" target="_blank">Nerfies</a> project page.
You are free to borrow the source code of this website, we just ask that you link back to this page in the footer. <br> This website is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/" target="_blank">Creative
Commons Attribution-ShareAlike 4.0 International License</a>.
</p>
</div>
</div>
</div>
</div>
</footer> -->
</body>
</html>