Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix converting latin1 to utf8 charset checksum failures #373

Merged
merged 3 commits into from
Feb 15, 2025
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions .golangci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,6 @@ linters:
- exportloopref

linters-settings:
govet:
check-shadowing: false
dupl:
threshold: 100
gocyclo:
Expand All @@ -69,8 +67,6 @@ linters-settings:
- require-error

issues:
max-per-linter: 0
max-same: 0
exclude-use-default: false
exclude:
- '^(G104|G204):'
Expand Down
8 changes: 4 additions & 4 deletions pkg/dbconn/sqlescape/utils_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -464,25 +464,25 @@ func TestEscapeString(t *testing.T) {
}

func BenchmarkEscapeString(b *testing.B) {
for i := 0; i < b.N; i++ {
for range b.N {
escapeSQL("select %?", "3") //nolint:errcheck
}
}

func BenchmarkUnderlyingString(b *testing.B) {
for i := 0; i < b.N; i++ {
for range b.N {
escapeSQL("select %?", myStr("3")) //nolint:errcheck
}
}

func BenchmarkEscapeInt(b *testing.B) {
for i := 0; i < b.N; i++ {
for range b.N {
escapeSQL("select %?", 3) //nolint:errcheck
}
}

func BenchmarkUnderlyingInt(b *testing.B) {
for i := 0; i < b.N; i++ {
for range b.N {
escapeSQL("select %?", myInt(3)) //nolint:errcheck
}
}
41 changes: 41 additions & 0 deletions pkg/migration/migration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -291,3 +291,44 @@ func TestBinaryChecksum(t *testing.T) {
assert.NoError(t, err)
}
}

// TestConvertCharset tests that we can change the character set from latin1 to utf8mb4,
// and that the non 7-bit characters that can be represented in latin1 as 1 byte,
// checksum correctly against their multi-byte utf8mb4 representations
func TestConvertCharset(t *testing.T) {
testutils.RunSQL(t, `DROP TABLE IF EXISTS t1charset, _t1charset_new`)
table := `CREATE TABLE t1charset (
id int not null primary key auto_increment,
b varchar(100) not null
) charset=latin1`
testutils.RunSQL(t, table)
testutils.RunSQL(t, `insert into t1charset values (null, 'à'), (null, '€')`)
migration := &Migration{}
cfg, err := mysql.ParseDSN(testutils.DSN())
assert.NoError(t, err)
migration.Host = cfg.Addr
migration.Username = cfg.User
migration.Password = cfg.Passwd
migration.Database = cfg.DBName
migration.Threads = 1
migration.Checksum = true
migration.Table = "t1charset"
migration.Alter = "CONVERT TO CHARACTER SET UTF8MB4"
err = migration.Run()
assert.NoError(t, err)

// Because utf8mb4 is the superset, it doesn't matter that thats
// what the checksum casts to. We should be able to convert back as well.
migration = &Migration{
Host: cfg.Addr,
Username: cfg.User,
Password: cfg.Passwd,
Database: cfg.DBName,
Threads: 1,
Checksum: true,
Table: "t1charset",
Alter: "CONVERT TO CHARACTER SET latin1",
}
err = migration.Run()
assert.NoError(t, err)
}
2 changes: 1 addition & 1 deletion pkg/table/chunker_composite.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@ import (
"errors"
"fmt"
"reflect"
"slices"
"strings"
"sync"
"time"

"github.com/siddontang/loggers"
"golang.org/x/exp/slices"
)

type chunkerComposite struct {
Expand Down
2 changes: 1 addition & 1 deletion pkg/table/chunker_optimistic_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ func TestOptimisticChunkerBasic(t *testing.T) {

assert.False(t, chunker.KeyAboveHighWatermark(100)) // we are at 1001

for i := 0; i <= 998; i++ {
for range 999 {
_, err = chunker.Next()
assert.NoError(t, err)
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/table/tableinfo_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ func TestDiscovery(t *testing.T) {
// normalize for mysql 5.7 and 8.0
assert.Equal(t, "int", removeWidth(t1.columnsMySQLTps["id"]))
assert.Equal(t, "CAST(`id` AS signed)", t1.WrapCastType("id"))
assert.Equal(t, "CAST(`name` AS char)", t1.WrapCastType("name"))
assert.Equal(t, "CAST(`name` AS char CHARACTER SET utf8mb4)", t1.WrapCastType("name"))

assert.Equal(t, "1", t1.minValue.String())
assert.Equal(t, "3", t1.maxValue.String())
Expand Down
7 changes: 4 additions & 3 deletions pkg/table/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,6 @@ func castableTp(tp string) string {
return "unsigned"
case "timestamp", "datetime":
return "datetime"
case "varchar", "enum", "set", "text", "mediumtext", "longtext":
return "char"
case "tinyblob", "blob", "mediumblob", "longblob", "varbinary":
return "binary"
case "binary":
Expand All @@ -47,7 +45,10 @@ func castableTp(tp string) string {
case "decimal":
return tp
default:
return "char" // default to char because it ends up being a char string in the concat anyway.
// For cases like varchar, enum, set, text, mediumtext, longtext
// We return char, but because the new table could also change charset we explicitly
// convert to utf8mb4 which should be the superset, and can do all comparisons.
return "char CHARACTER SET utf8mb4"
}
}

Expand Down
18 changes: 9 additions & 9 deletions pkg/table/utils_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,33 +42,33 @@ func TestCastableTp(t *testing.T) {
{"bigint unsigned", "unsigned"},
{"timestamp", "datetime"},
{"timestamp(6)", "datetime"},
{"varchar(100)", "char"},
{"text", "char"},
{"mediumtext", "char"},
{"longtext", "char"},
{"varchar(100)", "char CHARACTER SET utf8mb4"},
{"text", "char CHARACTER SET utf8mb4"},
{"mediumtext", "char CHARACTER SET utf8mb4"},
{"longtext", "char CHARACTER SET utf8mb4"},
{"tinyblob", "binary"},
{"blob", "binary"},
{"mediumblob", "binary"},
{"longblob", "binary"},
{"varbinary", "binary"},
{"char(100)", "char"},
{"char(100)", "char CHARACTER SET utf8mb4"},
{"binary(100)", "binary(0)"},
{"datetime", "datetime"},
{"datetime(6)", "datetime"},
{"year", "char"},
{"year", "char CHARACTER SET utf8mb4"},
{"float", "char"},
{"double", "char"},
{"json", "json"},
{"int(11)", "signed"},
{"int(11) unsigned", "unsigned"},
{"int(11) zerofill", "signed"},
{"int(11) unsigned zerofill", "unsigned"},
{"enum('a', 'b', 'c')", "char"},
{"set('a', 'b', 'c')", "char"},
{"enum('a', 'b', 'c')", "char CHARACTER SET utf8mb4"},
{"set('a', 'b', 'c')", "char CHARACTER SET utf8mb4"},
{"decimal(6,2)", "decimal(6,2)"},
}
for _, tp := range tps {
assert.Equal(t, tp.expected, castableTp(tp.tp))
assert.Equal(t, tp.expected, castableTp(tp.tp), "tp failed: %s, expected: %s", tp.tp, tp.expected)
}
}

Expand Down