diff --git a/lib/Conversion/TopToTpu/BM1684X/MatMul.cpp b/lib/Conversion/TopToTpu/BM1684X/MatMul.cpp
index 3c2be5052..c803a7ebd 100644
--- a/lib/Conversion/TopToTpu/BM1684X/MatMul.cpp
+++ b/lib/Conversion/TopToTpu/BM1684X/MatMul.cpp
@@ -29,6 +29,7 @@ void MatMulLowering::LoweringINT8(PatternRewriter &rewriter, top::MatMulOp op,
     if (bias_size > p.N)
       llvm_unreachable("BatchMatMul does not support batch-bias yet.");
   }
+  int64_t left_num_dims = module::getShape(op.getInput()).size();
   if (auto filterOp = dyn_cast<top::WeightOp>(op.getRight().getDefiningOp())) {
     auto filter_f32 = filterOp.read<float>();
     int64_t in_zp = 0, out_zp = 0;
@@ -89,7 +90,8 @@ void MatMulLowering::LoweringINT8(PatternRewriter &rewriter, top::MatMulOp op,
     operands.push_back(new_filter);
     auto new_bias = op.getBias();
     if (with_bias) {
-      std::vector<int64_t> shape = {p.N};
+      std::vector<int64_t> shape(left_num_dims, 1);
+      shape[left_num_dims - 1] = p.N;
       auto new_type = RankedTensorType::get(shape, rewriter.getI32Type());
       new_bias = top::WeightOp::create(op, "bias_int32", *bias_int32, new_type);
       operands.push_back(new_bias);
@@ -119,7 +121,9 @@ void MatMulLowering::LoweringINT8(PatternRewriter &rewriter, top::MatMulOp op,
       bias_int32->data()[j] =
           std::round(bias_fp32->at(j) / (w_scale * in_scale));
     }
-    auto new_type = RankedTensorType::get({bias_n}, rewriter.getI32Type());
+    std::vector<int64_t> shape(left_num_dims, 1);
+    shape[left_num_dims - 1] = bias_n;
+    auto new_type = RankedTensorType::get(shape, rewriter.getI32Type());
     auto new_bias =
         top::WeightOp::create(op, "bias_int32", *bias_int32, new_type);
     operands[2] = new_bias;
@@ -161,6 +165,7 @@ void MatMulLowering::LoweringINT4(PatternRewriter &rewriter, top::MatMulOp op,
   int64_t in_zp = 0, out_zp = 0;
   double in_scale = 1, out_scale = 1, w_scale = 1;
 
+  int64_t left_num_dims = module::getShape(op.getInput()).size();
   if (auto filterOp = dyn_cast<top::WeightOp>(op.getRight().getDefiningOp())) {
     auto filter_f32 = filterOp.read<float>();
     int bitwidth = 4;
@@ -283,7 +288,8 @@ void MatMulLowering::LoweringINT4(PatternRewriter &rewriter, top::MatMulOp op,
     operands.push_back(new_filter);
     auto new_bias = op.getBias();
     if (with_bias) {
-      std::vector<int64_t> shape = {p.N};
+      std::vector<int64_t> shape(left_num_dims, 1);
+      shape[left_num_dims - 1] = p.N;
       auto new_type = RankedTensorType::get(shape, rewriter.getI32Type());
       new_bias = top::WeightOp::create(op, "bias_int32", *bias_int32, new_type);
       operands.push_back(new_bias);
@@ -313,7 +319,9 @@ void MatMulLowering::LoweringINT4(PatternRewriter &rewriter, top::MatMulOp op,
       bias_int32->data()[j] =
           std::round(bias_fp32->at(j) / (w_scale * in_scale));
     }
-    auto new_type = RankedTensorType::get({bias_n}, rewriter.getI32Type());
+    std::vector<int64_t> shape(left_num_dims, 1);
+    shape[left_num_dims - 1] = bias_n;
+    auto new_type = RankedTensorType::get(shape, rewriter.getI32Type());
     auto new_bias =
         top::WeightOp::create(op, "bias_int32", *bias_int32, new_type);
     operands[2] = new_bias;
@@ -391,6 +399,7 @@ void MatMulLowering::LoweringQuantized(PatternRewriter &rewriter,
   auto input_qtype = module::getUniformQuantizedType(op.getInput());
   auto right_qtype = module::getUniformQuantizedType(op.getRight());
   auto output_qtype = module::getUniformQuantizedType(op.getOutput());
+  int64_t left_num_dims = module::getShape(op.getInput()).size();
   const double real_multiplier =
       input_qtype.getScale() * right_qtype.getScale() / output_qtype.getScale();

@@ -445,7 +454,9 @@ void MatMulLowering::LoweringQuantized(PatternRewriter &rewriter,
   } else {
     bias_quant = i32_array_t(new std::vector<int32_t>(col_size, 0));
   }
-  auto bias_type = RankedTensorType::get({col_size}, rewriter.getI32Type());
+  std::vector<int64_t> shape(left_num_dims, 1);
+  shape[left_num_dims - 1] = col_size;
+  auto bias_type = RankedTensorType::get(shape, rewriter.getI32Type());

   if (can_merge_izp) {
     // attrs.push_back(rewriter.getNamedAttr(
diff --git a/lib/Dialect/Tpu/Interfaces/BM1684X/MatMul.cpp b/lib/Dialect/Tpu/Interfaces/BM1684X/MatMul.cpp
index b040455e8..f27fb936c 100644
--- a/lib/Dialect/Tpu/Interfaces/BM1684X/MatMul.cpp
+++ b/lib/Dialect/Tpu/Interfaces/BM1684X/MatMul.cpp
@@ -60,8 +60,10 @@ LogicalResult WeightReorder<tpu::MatMulOp, int8_t>::matchAndRewrite(
       bias_quant->data()[i] += p.input_zp * p.right_zp * p.K;
     }
     auto stype = module::getStorageType(op.getBias());
-    // std::vector<int64_t> bias_shape = {N};
-    auto new_type = RankedTensorType::get({p.N}, rewriter.getI32Type());
+    int64_t left_num_dims = module::getShape(op.getInput()).size();
+    std::vector<int64_t> bias_shape(left_num_dims, 1);
+    bias_shape[left_num_dims - 1] = p.N;
+    auto new_type = RankedTensorType::get(bias_shape, rewriter.getI32Type());
     auto new_op =
         top::WeightOp::create(op, "bias_merge_izp", *bias_quant, new_type);
     op->setOperand(2, new_op);
@@ -258,8 +260,7 @@ void tpu::MatMulOp::codegen_local_bm1684x(int64_t n_step, int64_t h_step,
   common.L_trans = getLeftTranspose();
   common.R_trans = p.right_transpose;
   common.has_bias = p.with_bias;
-  common.hdim_is_batch =
-      false; // group_type == GROUP_SMALL_C ? true : getHdimIsBatch();
+  common.hdim_is_batch = false;
   common.requant_mode = -1;
   if (module::isUniformQuantized(getInput())) {
     common.R_zp_is_const = true;
diff --git a/lib/Dialect/Tpu/Interfaces/Common/MatMul.cpp b/lib/Dialect/Tpu/Interfaces/Common/MatMul.cpp
index 8416372ab..a16a356f3 100644
--- a/lib/Dialect/Tpu/Interfaces/Common/MatMul.cpp
+++ b/lib/Dialect/Tpu/Interfaces/Common/MatMul.cpp
@@ -166,13 +166,20 @@ LogicalResult tpu::MatMulOp::LocalGenSupport() {
     return failure();
   }

+  auto Lshape = module::getShape(ins[0]);
+  auto Rshape = module::getShape(ins[1]);
   int left_num_dims = module::getShape(ins[0]).size();
   int right_num_dims = module::getShape(ins[1]).size();
-  if (left_num_dims == 5 && right_num_dims == 2) {
+  if (((left_num_dims == 4 && Lshape[1] < Lshape[2]) ||
+       (left_num_dims == 5 && Lshape[1] < Lshape[3])) &&
+      right_num_dims == 2) {
+    // GROUP_SMALL_C
     return success();
-  } else if (left_num_dims == 3 && right_num_dims == 2) {
+  } else if (left_num_dims == 3 && right_num_dims == 3) {
+    // (1, M, K) x (1, K, N)
     return success();
   } else if (left_num_dims == 4 && right_num_dims == 4 && getHdimIsBatch()) {
+    // (B1, M, B2, K) x (B1, K, B2, N)
     return success();
   }
   return failure();
diff --git a/python/transform/TFLiteConverter.py b/python/transform/TFLiteConverter.py
index cd59bbdfc..29177c526 100644
--- a/python/transform/TFLiteConverter.py
+++ b/python/transform/TFLiteConverter.py
@@ -680,6 +680,12 @@ def fully_connected_op(self, op):
             "do_relu": BoolAttr.get(fused_active == 1),
             # "right_transpose": BoolAttr.get(True),
         }
+        if op.inputs[2] is not None:
+            bias_shape = [1] * len(op.inputs[0].shape)
+            bias_shape[-1] = op.inputs[2].shape[0]
+            op.inputs[2].shape = tuple(bias_shape)
+            op.inputs[2].buffer.shape = tuple(bias_shape)
+
         if op.inputs[1].buffer is not None:
            f, c = op.inputs[1].shape
            op.inputs[1].shape = (c, f)
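
The recurring change across all of these hunks reshapes a length-N bias from {N} into a rank-aligned (1, ..., 1, N) so it broadcasts against the left matmul operand. Below is a minimal standalone sketch of that shape computation (plain C++, no MLIR dependencies; biasBroadcastShape is an illustrative name, not part of the patch):

#include <cstdint>
#include <iostream>
#include <vector>

// Mirrors the patch's bias reshape: a length-N bias gets one leading
// unit dimension per extra dimension of the left operand, so it
// broadcasts along the batch/row dims when added to the matmul output.
std::vector<int64_t> biasBroadcastShape(int64_t left_num_dims, int64_t N) {
  std::vector<int64_t> shape(left_num_dims, 1);
  shape[left_num_dims - 1] = N;
  return shape;
}

int main() {
  // A rank-4 left operand with N = 128 yields a (1, 1, 1, 128) bias.
  for (int64_t d : biasBroadcastShape(4, 128))
    std::cout << d << ' '; // prints: 1 1 1 128
  std::cout << '\n';
  return 0;
}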