dSB_GPU/MultiRowSharedFused.py at main · yaochengchen/dSB_GPU · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
"""
支持N>1024的共享内存版本：每个线程处理多个row
"""

import torch
import time
from torch.utils.cpp_extension import load_inline

# ⭐ 支持大N的CUDA源代码：每个线程处理多个row
cuda_source_multi_row_shared = '''
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

// ⭐ 支持大N：每个线程处理多个row的共享内存版本
__global__ void __launch_bounds__(1024, 2)
fused_dynamics_xy_shared_multi_row_kernel(
    float* y,                   // 输入输出: y 数组 (N, batch_size)
    float* x,                   // 输入输出: x 数组 (N, batch_size)
    const float delta,          // 标量参数
    const float* p_array,       // p 数组 (n_iterations,)
    const float xi,             // 标量参数
    const float dt,             // 时间步长
    const int* J_indices,       // CSR格式稀疏矩阵的列索引
    const float* J_values,      // CSR格式稀疏矩阵的值
    const int* J_crow_ptr,      // CSR格式稀疏矩阵的行指针
    const int N,                // 矩阵维度
    const int batch_size,       // batch大小
    const int n_iterations      // 迭代次数
) {
    // 每个block处理一个batch
    int batch = blockIdx.y;
    int thread_id = threadIdx.x;

    if (batch >= batch_size) return;

    // ⭐ 共享内存：存储完整的x和y向量
    extern __shared__ float shared_data[];
    float* shared_x = shared_data;           // N个float
    float* shared_y = shared_data + N;       // N个float

    // ⭐ 每个线程负责的row数量
    int rows_per_thread = (N + blockDim.x - 1) / blockDim.x;

    // ⭐ 加载数据到共享内存：每个线程处理多个row
    for (int i = 0; i < rows_per_thread; i++) {
        int row = thread_id + i * blockDim.x;
        if (row < N) {
            int global_idx = row * batch_size + batch;
            shared_x[row] = x[global_idx];
            shared_y[row] = y[global_idx];
        }
    }
    __syncthreads();

    // ⭐ 迭代计算
    for (int iter = 0; iter < n_iterations; iter++) {
        float p_i = p_array[iter];

        // ⭐ 每个线程处理它负责的所有row
        for (int i = 0; i < rows_per_thread; i++) {
            int row = thread_id + i * blockDim.x;
            if (row < N) {
                float x_val = shared_x[row];
                float y_val = shared_y[row];

                // 稀疏矩阵乘法：从共享内存读取x数据
                float sparse_result = 0.0f;
                int start = J_crow_ptr[row];
                int end = J_crow_ptr[row + 1];

                for (int j = start; j < end; j++) {
                    int col_idx = J_indices[j];
                    float j_value = J_values[j];

                    if (col_idx < N) {
                        float col_x = shared_x[col_idx];  // 从共享内存读取
                        float col_sign = (col_x > 0.0f) ? 1.0f : ((col_x < 0.0f) ? -1.0f : 0.0f);
                        sparse_result += j_value * col_sign;
                    }
                }

                // 更新计算
                float term1 = -(delta - p_i) * x_val;
                float term2 = xi * sparse_result;
                y_val += (term1 + term2) * dt;
                x_val += dt * y_val * delta;

                // 条件截断
                if (fabsf(x_val) > 1.0f) {
                    x_val = (x_val > 0.0f) ? 1.0f : -1.0f;
                    y_val = 0.0f;
                }

                // 更新共享内存中的值
                shared_x[row] = x_val;
                shared_y[row] = y_val;
            }
        }

        __syncthreads();  // 确保所有线程都完成了这一轮迭代
    }

    // ⭐ 写回全局内存：每个线程处理多个row
    for (int i = 0; i < rows_per_thread; i++) {
        int row = thread_id + i * blockDim.x;
        if (row < N) {
            int global_idx = row * batch_size + batch;
            x[global_idx] = shared_x[row];
            y[global_idx] = shared_y[row];
        }
    }
}

// 🔄 保留原来的fallback kernel
__global__ void fused_dynamics_fallback_iterations_kernel(
    float* y, float* x, const float delta, const float* p_array,
    float xi, float dt, const int* J_indices,
    const float* J_values, const int* J_crow_ptr,
    const int N, const int batch_size, const int n_iterations
) {
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    int batch = blockIdx.y * blockDim.y + threadIdx.y;

    if (row >= N || batch >= batch_size) return;

    int idx = row * batch_size + batch;
    float x_val = x[idx];
    float y_val = y[idx];

    for (int iter = 0; iter < n_iterations; iter++) {
        float p_i = p_array[iter];

        float sparse_result = 0.0f;
        int start = J_crow_ptr[row];
        int end = J_crow_ptr[row + 1];

        for (int j = start; j < end; j++) {
            int col_idx = J_indices[j];
            float col_x = x[col_idx * batch_size + batch];
            float col_sign = (col_x > 0.0f) ? 1.0f : ((col_x < 0.0f) ? -1.0f : 0.0f);
            sparse_result += J_values[j] * col_sign;
        }

        float term1 = -(delta - p_i) * x_val;
        float term2 = xi * sparse_result;
        y_val += (term1 + term2) * dt;
        x_val += dt * y_val * delta;

        if (fabsf(x_val) > 1.0f) {
            x_val = (x_val > 0.0f) ? 1.0f : -1.0f;
            y_val = 0.0f;
        }

        x[idx] = x_val;
        y[idx] = y_val;

        __syncthreads();
    }
}

// ⭐ 检查多行共享内存支持（支持任意大的N）
bool check_multi_row_shared_memory_support(int N, int* max_threads_per_block) {
    int device;
    cudaGetDevice(&device);
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, device);

    size_t shared_mem_needed = 2 * N * sizeof(float);  // x和y
    *max_threads_per_block = prop.maxThreadsPerBlock;  // 通常是1024

    return shared_mem_needed <= prop.sharedMemPerBlock;
}

// ⭐ 启动多行共享内存版本
bool launch_multi_row_shared_kernel_wrapper(
    float* y, float* x, float delta, float* p_array,
    float xi, float dt, int* J_indices, float* J_values, int* J_crow_ptr,
    int N, int batch_size, int n_iterations
) {
    // 检查共享内存需求
    int max_threads_per_block;
    if (!check_multi_row_shared_memory_support(N, &max_threads_per_block)) {
        return false;
    }

    size_t shared_mem_needed = 2 * N * sizeof(float);

    printf("✅ 多行共享内存使用: %zu字节, N=%d\\n", shared_mem_needed, N);
    printf("   每个线程处理 %.1f 个row\\n", (float)N / max_threads_per_block);

    // ⭐ 启动配置：每个batch一个block，使用最大线程数
    dim3 blocks(1, batch_size);
    dim3 threads(max_threads_per_block, 1);  // 使用最大线程数(通常1024)

    fused_dynamics_xy_shared_multi_row_kernel<<<blocks, threads, shared_mem_needed>>>(
        y, x, delta, p_array, xi, dt,
        J_indices, J_values, J_crow_ptr,
        N, batch_size, n_iterations
    );

    cudaError_t result = cudaGetLastError();
    return result == cudaSuccess;
}

// 🔄 修改wrapper函数，优先使用多行共享内存版本
void fused_dynamics_batch_iterations_cuda_wrapper(
    torch::Tensor y,
    torch::Tensor x,
    float delta,
    torch::Tensor p_array,
    float xi,
    float dt,
    torch::Tensor J_indices,
    torch::Tensor J_values,
    torch::Tensor J_crow_ptr,
    int n_iterations
) {
    const int N = x.size(0);
    const int batch_size = x.size(1);

    // ⭐ 检查多行共享内存支持
    int max_threads_per_block;
    static bool multi_row_shared_supported = check_multi_row_shared_memory_support(N, &max_threads_per_block);
    static bool support_check_done = false;

    if (!support_check_done) {
        if (multi_row_shared_supported) {
            printf("✅ 使用多行共享内存优化版本 (支持N>1024)\\n");
        } else {
            printf("⚠️ 不支持多行共享内存优化，使用传统版本\\n");
        }
        support_check_done = true;
    }

    if (multi_row_shared_supported) {
        // ⭐ 使用多行共享内存优化版本
        bool success = launch_multi_row_shared_kernel_wrapper(
            y.data_ptr<float>(), x.data_ptr<float>(),
            delta, p_array.data_ptr<float>(), xi, dt,
            J_indices.data_ptr<int>(), J_values.data_ptr<float>(),
            J_crow_ptr.data_ptr<int>(),
            N, batch_size, n_iterations
        );

        if (success) {
            cudaDeviceSynchronize();
            return;
        } else {
            printf("❌ 多行共享内存版本启动失败，使用fallback...\\n");
        }
    }

    // 🔄 Fallback：使用传统的2D grid配置
    const int threads_x = 16;
    const int threads_y = 16;

    dim3 threads(threads_x, threads_y);
    dim3 blocks(
        (N + threads_x - 1) / threads_x,
        (batch_size + threads_y - 1) / threads_y
    );

    fused_dynamics_fallback_iterations_kernel<<<blocks, threads>>>(
        y.data_ptr<float>(),
        x.data_ptr<float>(),
        delta,
        p_array.data_ptr<float>(),
        xi, dt,
        J_indices.data_ptr<int>(),
        J_values.data_ptr<float>(),
        J_crow_ptr.data_ptr<int>(),
        N, batch_size, n_iterations
    );

    cudaDeviceSynchronize();
}
'''

# 🔄 C++绑定代码
cpp_source_multi_row = '''
#include <torch/extension.h>

bool check_multi_row_shared_memory_support(int N, int* max_threads_per_block);
bool launch_multi_row_shared_kernel_wrapper(
    float* y, float* x, float delta, float* p_array,
    float xi, float dt, int* J_indices, float* J_values, int* J_crow_ptr,
    int N, int batch_size, int n_iterations
);

void fused_dynamics_batch_iterations_cuda_wrapper(
    torch::Tensor y,
    torch::Tensor x,
    float delta,
    torch::Tensor p_array,
    float xi,
    float dt,
    torch::Tensor J_indices,
    torch::Tensor J_values,
    torch::Tensor J_crow_ptr,
    int n_iterations
);

void fused_dynamics_batch_iterations_cpu(
    torch::Tensor y,
    torch::Tensor x,
    float delta,
    torch::Tensor p_array,
    float xi,
    float dt,
    torch::Tensor J_indices,
    torch::Tensor J_values,
    torch::Tensor J_crow_ptr,
    int n_iterations
) {
    auto y_acc = y.accessor<float, 2>();
    auto x_acc = x.accessor<float, 2>();
    auto p_acc = p_array.accessor<float, 1>();
    auto J_indices_acc = J_indices.accessor<int, 1>();
    auto J_values_acc = J_values.accessor<float, 1>();
    auto J_crow_ptr_acc = J_crow_ptr.accessor<int, 1>();

    int N = x.size(0);
    int batch_size = x.size(1);

    for (int iter = 0; iter < n_iterations; iter++) {
        float p_i = p_acc[iter];

        auto y_temp = torch::zeros_like(y);
        auto x_temp = torch::zeros_like(x);
        auto y_temp_acc = y_temp.accessor<float, 2>();
        auto x_temp_acc = x_temp.accessor<float, 2>();

        #pragma omp parallel for collapse(2)
        for (int row = 0; row < N; row++) {
            for (int batch = 0; batch < batch_size; batch++) {
                float x_val = x_acc[row][batch];
                float y_val = y_acc[row][batch];

                float sparse_result = 0.0f;
                int start = J_crow_ptr_acc[row];
                int end = J_crow_ptr_acc[row + 1];

                for (int j = start; j < end; j++) {
                    int col_idx = J_indices_acc[j];
                    float col_x = x_acc[col_idx][batch];
                    float col_sign = (col_x > 0.0f) ? 1.0f : ((col_x < 0.0f) ? -1.0f : 0.0f);
                    sparse_result += J_values_acc[j] * col_sign;
                }

                float term1 = -(delta - p_i) * x_val;
                float term2 = xi * sparse_result;
                y_val += (term1 + term2) * dt;
                x_val += dt * y_val * delta;

                if (std::abs(x_val) > 1.0f) {
                    x_val = (x_val > 0.0f) ? 1.0f : -1.0f;
                    y_val = 0.0f;
                }

                y_temp_acc[row][batch] = y_val;
                x_temp_acc[row][batch] = x_val;
            }
        }

        y.copy_(y_temp);
        x.copy_(x_temp);
    }
}

bool check_device_multi_row_shared_memory_support(int N) {
    int max_threads_per_block;
    return check_multi_row_shared_memory_support(N, &max_threads_per_block);
}

void fused_dynamics_batch_iterations(
    torch::Tensor y,
    torch::Tensor x,
    float delta,
    torch::Tensor p_array,
    float xi,
    float dt,
    torch::Tensor J_indices,
    torch::Tensor J_values,
    torch::Tensor J_crow_ptr,
    int n_iterations
) {
    if (y.is_cuda()) {
        fused_dynamics_batch_iterations_cuda_wrapper(
            y, x, delta, p_array, xi, dt,
            J_indices, J_values, J_crow_ptr, n_iterations
        );
    } else {
        fused_dynamics_batch_iterations_cpu(
            y, x, delta, p_array, xi, dt,
            J_indices, J_values, J_crow_ptr, n_iterations
        );
    }
}
'''

class MultiRowSharedMemoryFusedJIT:
    """⭐ 支持大N的共享内存版本：每个线程处理多个row"""

    _module = None
    _compilation_attempted = False

    def __init__(self, J, use_cuda=True, verbose=False):
        self.use_cuda = use_cuda and torch.cuda.is_available()
        self.verbose = verbose

        # 预处理稀疏矩阵
        if hasattr(J, 'to_sparse_csr'):
            self.J_csr = J.to_sparse_csr()
        else:
            self.J_csr = J.coalesce()

        # 编译CUDA扩展
        if self.use_cuda:
            self._compile_cuda_extension()

        # PyTorch回退
        self.torch_update = self._single_step_update

    def _compile_cuda_extension(self):
        if MultiRowSharedMemoryFusedJIT._compilation_attempted:
            return

        MultiRowSharedMemoryFusedJIT._compilation_attempted = True

        try:
            if self.verbose:
                print("正在编译多行共享内存CUDA扩展...")

            MultiRowSharedMemoryFusedJIT._module = load_inline(
                name='multi_row_shared_fused_dynamics_jit',
                cpp_sources=[cpp_source_multi_row],
                cuda_sources=[cuda_source_multi_row_shared] if self.use_cuda else [],
                functions=['fused_dynamics_batch_iterations', 'check_device_multi_row_shared_memory_support'],
                extra_cflags=['-O3', '-std=c++17', '-fopenmp'],
                extra_cuda_cflags=[
                    '-O3', '--std=c++17',
                    '-gencode=arch=compute_70,code=sm_70',
                    '-gencode=arch=compute_75,code=sm_75',
                    '-gencode=arch=compute_80,code=sm_80',
                    '-gencode=arch=compute_86,code=sm_86',
                    '--use_fast_math',
                    '--extended-lambda'
                ] if self.use_cuda else [],
                verbose=self.verbose
            )

            if self.verbose:
                print("✅ 多行共享内存CUDA扩展编译完成！")

        except Exception as e:
            if self.verbose:
                print(f"❌ CUDA编译失败，使用PyTorch回退: {e}")
            MultiRowSharedMemoryFusedJIT._module = None

    def is_multi_row_shared_memory_supported(self, N):
        """检查多行共享内存优化是否支持给定的N"""
        if MultiRowSharedMemoryFusedJIT._module is None:
            return False
        try:
            return MultiRowSharedMemoryFusedJIT._module.check_device_multi_row_shared_memory_support(N)
        except:
            return False

    def run_iterations(self, y, x, delta, p_array, xi, dt, n_iterations):
        """运行多个时间步"""

        if MultiRowSharedMemoryFusedJIT._module is not None and y.is_cuda:
            return self._cuda_iterations(y, x, delta, p_array, xi, dt, n_iterations)
        else:
            return self._torch_iterations(y, x, delta, p_array, xi, dt)

    def _cuda_iterations(self, y, x, delta, p_array, xi, dt, n_iterations):
        """⭐ CUDA版本：多行共享内存优化"""
        # 确保数据类型和连续性
        if y.dtype != torch.float32:
            y = y.float()
        if x.dtype != torch.float32:
            x = x.float()
        if p_array.dtype != torch.float32:
            p_array = p_array.float()

        if not y.is_contiguous():
            y = y.contiguous()
        if not x.is_contiguous():
            x = x.contiguous()
        if not p_array.is_contiguous():
            p_array = p_array.contiguous()

        # 获取CSR格式数据
        if hasattr(self.J_csr, 'crow_indices'):
            crow_indices = self.J_csr.crow_indices().int().contiguous()
            col_indices = self.J_csr.col_indices().int().contiguous()
            values = self.J_csr.values().float().contiguous()
        else:
            raise NotImplementedError("请使用PyTorch 1.13+")

        # 确保数据在GPU上
        if not crow_indices.is_cuda:
            crow_indices = crow_indices.cuda()
        if not col_indices.is_cuda:
            col_indices = col_indices.cuda()
        if not values.is_cuda:
            values = values.cuda()
        if not p_array.is_cuda:
            p_array = p_array.cuda()

        # ⭐ 多行共享内存优化版本
        MultiRowSharedMemoryFusedJIT._module.fused_dynamics_batch_iterations(
            y, x, float(delta), p_array, float(xi), float(dt),
            col_indices, values, crow_indices, n_iterations
        )

        return y, x

    def _torch_iterations(self, y, x, delta, p_array, xi, dt):
        """传统PyTorch版本"""
        for i in range(len(p_array)):
            y, x = self.torch_update(y, x, delta, p_array[i].item(), xi, dt)
        return y, x

    def _single_step_update(self, y, x, delta, p_i, xi, dt):
        """单步更新的PyTorch实现"""
        sign_x = torch.sign(x)
        sparse_term = torch.sparse.mm(self.J_csr, sign_x)
        y = y + (-(delta - p_i) * x + xi * sparse_term) * dt
        x = x + dt * y * delta

        cond = torch.abs(x) > 1
        x = torch.where(cond, torch.sign(x), x)
        y = torch.where(cond, torch.zeros_like(x), y)

        return y, x

def demo_multi_row_shared_memory():
    """演示支持大N的共享内存版本"""
    print("=== 支持大N的多行共享内存版本 ===")

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # ⭐ 测试多种N值，包括>1024的情况
    test_cases = [
        {"N": 512, "batch_size": 1000, "n_iterations": 500},
        {"N": 1024, "batch_size": 500, "n_iterations": 500},
        {"N": 2048, "batch_size": 100, "n_iterations": 200},  # >1024
        {"N": 4096, "batch_size": 50, "n_iterations": 100},   # 更大的N
    ]

    print(f"设备: {device}")

    for test_case in test_cases:
        N = test_case["N"]
        batch_size = test_case["batch_size"]
        n_iterations = test_case["n_iterations"]

        print(f"\n--- 测试 N={N}, batch_size={batch_size}, iterations={n_iterations} ---")
        print(f"共享内存需求: {2*N*4}字节")
        print(f"每个线程处理: {N/1024:.1f} 个row")

        # 创建测试数据
        nnz = N * 8  # 平均每行8个非零元素
        indices = torch.randint(0, N, (2, nnz), device=device)
        values = torch.randn(nnz, device=device) * 0.1
        J = torch.sparse_coo_tensor(indices, values, (N, N), device=device)

        y = torch.randn(N, batch_size, device=device)
        x = torch.randn(N, batch_size, device=device)

        delta, xi, dt = 0.1, 0.5, 0.01
        p_array = torch.randn(n_iterations, device=device)

        # ⭐ 使用多行共享内存版本
        multi_row_updater = MultiRowSharedMemoryFusedJIT(J, use_cuda=True, verbose=False)

        # 检查共享内存支持
        if device == 'cuda':
            support = multi_row_updater.is_multi_row_shared_memory_supported(N)
            print(f"多行共享内存支持: {'✅ 支持' if support else '❌ 不支持'}")

            if support:
                y1, x1 = y.clone(), x.clone()

                torch.cuda.synchronize()
                start = torch.cuda.Event(enable_timing=True)
                end = torch.cuda.Event(enable_timing=True)
                start.record()

                y1, x1 = multi_row_updater.run_iterations(y1, x1, delta, p_array, xi, dt, n_iterations)

                end.record()
                torch.cuda.synchronize()
                elapsed_time = start.elapsed_time(end)

                print(f"多行共享内存耗时: {elapsed_time:.2f}ms")
                print(f"平均每次迭代: {elapsed_time/n_iterations:.3f}ms")

                # 计算吞吐量
                total_ops = N * batch_size * n_iterations
                throughput = total_ops / (elapsed_time / 1000) / 1e6  # MOps/s
                print(f"吞吐量: {throughput:.1f} MOps/s")
            else:
                print("⚠️ 共享内存不足，跳过测试")
        else:
            print("CPU模式，跳过共享内存测试")

def compare_with_traditional(N=2048, batch_size=100, n_iterations=200):
    """与传统方法比较性能"""
    print(f"\n=== 性能对比 (N={N}) ===")

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    if device == 'cpu':
        print("需要CUDA设备进行性能对比")
        return

    # 创建测试数据
    nnz = N * 8
    indices = torch.randint(0, N, (2, nnz), device=device)
    values = torch.randn(nnz, device=device) * 0.1
    J = torch.sparse_coo_tensor(indices, values, (N, N), device=device)

    y = torch.randn(N, batch_size, device=device)
    x = torch.randn(N, batch_size, device=device)

    delta, xi, dt = 0.1, 0.5, 0.01
    p_array = torch.randn(n_iterations, device=device)

    # 方法1: 多行共享内存版本
    print("\n--- 方法1: 多行共享内存版本 ---")
    multi_row_updater = MultiRowSharedMemoryFusedJIT(J, use_cuda=True, verbose=True)

    if multi_row_updater.is_multi_row_shared_memory_supported(N):
        y1, x1 = y.clone(), x.clone()

        torch.cuda.synchronize()
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()

        y1, x1 = multi_row_updater.run_iterations(y1, x1, delta, p_array, xi, dt, n_iterations)

        end.record()
        torch.cuda.synchronize()
        shared_time = start.elapsed_time(end)

        print(f"多行共享内存耗时: {shared_time:.2f}ms")

        # 方法2: 传统方式
        print("\n--- 方法2: 传统方式 ---")
        y2, x2 = y.clone(), x.clone()

        start.record()

        for i in range(n_iterations):
            # 模拟传统的单步更新
            sign_x = torch.sign(x2)
            if hasattr(J, 'to_sparse_csr'):
                J_csr = J.to_sparse_csr()
            else:
                J_csr = J.coalesce()
            sparse_term = torch.sparse.mm(J_csr, sign_x)
            y2 = y2 + (-(delta - p_array[i].item()) * x2 + xi * sparse_term) * dt
            x2 = x2 + dt * y2 * delta

            cond = torch.abs(x2) > 1
            x2 = torch.where(cond, torch.sign(x2), x2)
            y2 = torch.where(cond, torch.zeros_like(x2), y2)

        end.record()
        torch.cuda.synchronize()
        traditional_time = start.elapsed_time(end)

        print(f"传统方式耗时: {traditional_time:.2f}ms")

        # 性能分析
        speedup = traditional_time / shared_time
        print(f"\n🚀 多行共享内存加速比: {speedup:.2f}x")
        print(f"节省时间: {traditional_time - shared_time:.2f}ms ({(1-shared_time/traditional_time)*100:.1f}%)")

        # 验证数值一致性
        y_diff = torch.max(torch.abs(y1 - y2)).item()
        x_diff = torch.max(torch.abs(x1 - x2)).item()
        print(f"\n数值差异: y={y_diff:.2e}, x={x_diff:.2e}")

        if y_diff < 1e-4 and x_diff < 1e-4:
            print("✅ 数值验证通过!")
        else:
            print("⚠️  数值有轻微差异（在GPU并行计算中是正常的）")
    else:
        print("❌ 不支持多行共享内存优化")

if __name__ == "__main__":
    demo_multi_row_shared_memory()
    compare_with_traditional()