diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 481c64f..5c2cf2b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -9,7 +9,7 @@ jobs: strategy: matrix: platform: [ubuntu-latest, macos-latest, windows-latest] - go-version: ['1.16', '1.17'] + go-version: ['1.21'] runs-on: ${{ matrix.platform }} steps: - name: Setup Go @@ -21,14 +21,15 @@ jobs: uses: actions/checkout@v4 - name: Download Go dependencies - run: go mod download + run: go mod download -x env: GOPROXY: "https://proxy.golang.org" - name: Lint uses: golangci/golangci-lint-action@v6.1.1 with: - version: v1.44 + version: v1.62.2 + args: --timeout 3m --verbose - name: Build run: go build -v . diff --git a/README.md b/README.md index e80857d..581e8bd 100644 --- a/README.md +++ b/README.md @@ -38,8 +38,9 @@ instead of `New()`. shortuuid.NewWithNamespace("http://example.com") ``` -It's possible to use a custom alphabet as well, though it has to be 57 -characters long. +It's possible to use a custom alphabet as well (at least 2 +characters long). +It will automatically sort and remove duplicates from your alphabet to ensure consistency ```go alphabet := "23456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxy=" diff --git a/alphabet.go b/alphabet.go index 1e5356d..4ee3ef4 100644 --- a/alphabet.go +++ b/alphabet.go @@ -2,33 +2,44 @@ package shortuuid import ( "fmt" - "sort" - "strings" + "math" + "slices" ) // DefaultAlphabet is the default alphabet used. -const DefaultAlphabet = "23456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz" +const ( + DefaultAlphabet = "23456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz" + rune1Max = 1<<7 - 1 +) type alphabet struct { - chars [57]rune - len int64 + chars []rune + len int64 + encLen int64 + singleBytes bool } -// Remove duplicates and sort it to ensure reproducability. +// Remove duplicates and sort it to ensure reproducibility. func newAlphabet(s string) alphabet { - abc := dedupe(strings.Split(s, "")) + abc := []rune(s) + slices.Sort(abc) + abc = slices.Compact(abc) - if len(abc) != 57 { - panic("encoding alphabet is not 57-bytes long") + if len(abc) < 2 { + panic("encoding alphabet must be at least two characters") } - sort.Strings(abc) a := alphabet{ - len: int64(len(abc)), + chars: abc, + len: int64(len(abc)), + encLen: int64(math.Ceil(128 / math.Log2(float64(len(abc))))), + singleBytes: true, } - - for i, char := range strings.Join(abc, "") { - a.chars[i] = char + for _, c := range a.chars { + if c > rune1Max { + a.singleBytes = false + break + } } return a @@ -41,25 +52,17 @@ func (a *alphabet) Length() int64 { // Index returns the index of the first instance of t in the alphabet, or an // error if t is not present. func (a *alphabet) Index(t rune) (int64, error) { - for i, char := range a.chars { - if char == t { - return int64(i), nil + i, j := 0, int(a.len) + for i < j { + h := int(uint(i+j) >> 1) + if a.chars[h] < t { + i = h + 1 + } else { + j = h } } - return 0, fmt.Errorf("element '%v' is not part of the alphabet", t) -} - -// dudupe removes duplicate characters from s. -func dedupe(s []string) []string { - var out []string - m := make(map[string]bool) - - for _, char := range s { - if _, ok := m[char]; !ok { - m[char] = true - out = append(out, char) - } + if i >= int(a.len) || a.chars[i] != t { + return 0, fmt.Errorf("element '%v' is not part of the alphabet", t) } - - return out + return int64(i), nil } diff --git a/alphabet_test.go b/alphabet_test.go index fab1522..0ab3422 100644 --- a/alphabet_test.go +++ b/alphabet_test.go @@ -1,26 +1,9 @@ package shortuuid import ( - "strings" "testing" ) -func TestDedupe(t *testing.T) { - tests := []struct { - in, out string - }{ - {"01010101010101", "01"}, - {"abcabcfoo", "abcfo"}, - } - - for _, test := range tests { - in := strings.Join(dedupe(strings.Split(test.in, "")), "") - if in != test.out { - t.Errorf("expected %q, got %q", in, test.out) - } - } -} - func TestAlphabetIndex(t *testing.T) { abc := newAlphabet(DefaultAlphabet) idx, err := abc.Index('z') diff --git a/base57.go b/base57.go deleted file mode 100644 index 6fe7482..0000000 --- a/base57.go +++ /dev/null @@ -1,82 +0,0 @@ -package shortuuid - -import ( - "encoding/binary" - "fmt" - "github.com/google/uuid" - "math/bits" - "strings" -) - -type base57 struct { - // alphabet is the character set to construct the UUID from. - alphabet alphabet -} - -const ( - strLen = 22 - alphabetLen = 57 -) - -// Encode encodes uuid.UUID into a string using the most significant bits (MSB) -// first according to the alphabet. -func (b base57) Encode(u uuid.UUID) string { - num := uint128{ - binary.BigEndian.Uint64(u[8:]), - binary.BigEndian.Uint64(u[:8]), - } - var outIndexes [strLen]uint64 - - for i := strLen - 1; num.Hi > 0 || num.Lo > 0; i-- { - num, outIndexes[i] = num.quoRem64(alphabetLen) - } - - var sb strings.Builder - sb.Grow(strLen) - for i := 0; i < strLen; i++ { - sb.WriteRune(b.alphabet.chars[outIndexes[i]]) - } - return sb.String() -} - -// Decode decodes a string according to the alphabet into a uuid.UUID. If s is -// too short, its most significant bits (MSB) will be padded with 0 (zero). -func (b base57) Decode(s string) (u uuid.UUID, err error) { - var n uint128 - var index int64 - - for _, char := range s { - index, err = b.alphabet.Index(char) - if err != nil { - return - } - n, err = n.mulAdd64(alphabetLen, uint64(index)) - if err != nil { - return - } - } - binary.BigEndian.PutUint64(u[:8], n.Hi) - binary.BigEndian.PutUint64(u[8:], n.Lo) - return -} - -type uint128 struct { - Lo, Hi uint64 -} - -func (u uint128) quoRem64(v uint64) (q uint128, r uint64) { - q.Hi, r = bits.Div64(0, u.Hi, v) - q.Lo, r = bits.Div64(r, u.Lo, v) - return -} - -func (u uint128) mulAdd64(m uint64, a uint64) (uint128, error) { - hi, lo := bits.Mul64(u.Lo, m) - p0, p1 := bits.Mul64(u.Hi, m) - lo, c0 := bits.Add64(lo, a, 0) - hi, c1 := bits.Add64(hi, p1, c0) - if p0 != 0 || c1 != 0 { - return uint128{}, fmt.Errorf("number is out of range (need a 128-bit value)") - } - return uint128{lo, hi}, nil -} diff --git a/encoder.go b/encoder.go new file mode 100644 index 0000000..6990c29 --- /dev/null +++ b/encoder.go @@ -0,0 +1,158 @@ +package shortuuid + +import ( + "encoding/binary" + "fmt" + "github.com/google/uuid" + "math" + "math/bits" + "strings" +) + +type encoder struct { + // alphabet is the character set to construct the UUID from. + alphabet alphabet +} + +const ( + defaultBase = 57 + defaultEncLen = 22 + defaultNDigits = 10 + defaultDivisor = 362033331456891249 // 57^10 +) + +func maxPow(b uint64) (d uint64, n int) { + d, n = b, 1 + for m := math.MaxUint64 / b; d <= m; { + d *= b + n++ + } + return +} + +// Encode encodes uuid.UUID into a string using the most significant bits (MSB) +// first according to the alphabet. +func (e encoder) Encode(u uuid.UUID) string { + if e.alphabet.singleBytes { + return e.encodeSingleBytes(u) + } + return e.encode(u) +} + +func (e encoder) encodeSingleBytes(u uuid.UUID) string { + num := uint128{ + binary.BigEndian.Uint64(u[8:]), + binary.BigEndian.Uint64(u[:8]), + } + var r uint64 + var i int + var buf []byte + if e.alphabet.len == defaultBase { // compiler optimizations using constants for default base + buf = make([]byte, defaultEncLen) + for i = defaultEncLen - 1; num.Hi > 0 || num.Lo > 0; { + num, r = num.quoRem64(defaultDivisor) + for j := 0; j < defaultNDigits && i >= 0; j++ { + buf[i] = byte(e.alphabet.chars[r%defaultBase]) + r /= defaultBase + i-- + } + } + } else { + buf = make([]byte, e.alphabet.encLen) + l := uint64(e.alphabet.len) + d, n := maxPow(l) + for i = int(e.alphabet.encLen - 1); num.Hi > 0 || num.Lo > 0; { + num, r = num.quoRem64(d) + for j := 0; j < n && i >= 0; j++ { + buf[i] = byte(e.alphabet.chars[r%l]) + r /= l + i-- + } + } + } + for ; i >= 0; i-- { + buf[i] = byte(e.alphabet.chars[0]) + } + return string(buf[:]) +} + +func (e encoder) encode(u uuid.UUID) string { + num := uint128{ + binary.BigEndian.Uint64(u[8:]), + binary.BigEndian.Uint64(u[:8]), + } + var r uint64 + var outIndexes []uint64 + if e.alphabet.len == defaultBase { // compiler optimizations using constants for default base + outIndexes = make([]uint64, defaultEncLen) // avoids escaping to heap for base57 when used with constant + for i := defaultEncLen - 1; num.Hi > 0 || num.Lo > 0; { + num, r = num.quoRem64(defaultDivisor) + for j := 0; j < defaultNDigits && i >= 0; j++ { + outIndexes[i] = r % defaultBase + r /= defaultBase + i-- + } + } + } else { + outIndexes = make([]uint64, e.alphabet.encLen) + l := uint64(e.alphabet.len) + d, n := maxPow(l) + for i := int(e.alphabet.encLen - 1); num.Hi > 0 || num.Lo > 0; { + num, r = num.quoRem64(d) + for j := 0; j < n && i >= 0; j++ { + outIndexes[i] = r % l + r /= l + i-- + } + } + } + + var sb strings.Builder + sb.Grow(int(e.alphabet.encLen)) + for i := 0; i < int(e.alphabet.encLen); i++ { + sb.WriteRune(e.alphabet.chars[outIndexes[i]]) + } + return sb.String() +} + +// Decode decodes a string according to the alphabet into a uuid.UUID. If s is +// too short, its most significant bits (MSB) will be padded with 0 (zero). +func (e encoder) Decode(s string) (u uuid.UUID, err error) { + var n uint128 + var index int64 + + for _, char := range s { + index, err = e.alphabet.Index(char) + if err != nil { + return + } + n, err = n.mulAdd64(uint64(e.alphabet.len), uint64(index)) + if err != nil { + return + } + } + binary.BigEndian.PutUint64(u[:8], n.Hi) + binary.BigEndian.PutUint64(u[8:], n.Lo) + return +} + +type uint128 struct { + Lo, Hi uint64 +} + +func (u uint128) quoRem64(v uint64) (q uint128, r uint64) { + q.Hi, r = bits.Div64(0, u.Hi, v) + q.Lo, r = bits.Div64(r, u.Lo, v) + return +} + +func (u uint128) mulAdd64(m uint64, a uint64) (uint128, error) { + hi, lo := bits.Mul64(u.Lo, m) + p0, p1 := bits.Mul64(u.Hi, m) + lo, c0 := bits.Add64(lo, a, 0) + hi, c1 := bits.Add64(hi, p1, c0) + if p0 != 0 || c1 != 0 { + return uint128{}, fmt.Errorf("number is out of range (need a 128-bit value)") + } + return uint128{lo, hi}, nil +} diff --git a/go.mod b/go.mod index 15e6163..be9e661 100644 --- a/go.mod +++ b/go.mod @@ -2,4 +2,4 @@ module github.com/lithammer/shortuuid/v4 require github.com/google/uuid v1.6.0 -go 1.13 +go 1.21 diff --git a/shortuuid.go b/shortuuid.go index 7ae1ef5..0c51b9a 100644 --- a/shortuuid.go +++ b/shortuuid.go @@ -8,7 +8,7 @@ import ( // DefaultEncoder is the default encoder uses when generating new UUIDs, and is // based on Base57. -var DefaultEncoder = &base57{newAlphabet(DefaultAlphabet)} +var DefaultEncoder = &encoder{newAlphabet(DefaultAlphabet)} // Encoder is an interface for encoding/decoding UUIDs to strings. type Encoder interface { @@ -33,9 +33,9 @@ func NewWithNamespace(name string) string { switch { case name == "": u = uuid.New() - case strings.HasPrefix(strings.ToLower(name), "http://"): + case hasPrefixCaseInsensitive(name, "https://"): u = uuid.NewSHA1(uuid.NameSpaceURL, []byte(name)) - case strings.HasPrefix(strings.ToLower(name), "https://"): + case hasPrefixCaseInsensitive(name, "http://"): u = uuid.NewSHA1(uuid.NameSpaceURL, []byte(name)) default: u = uuid.NewSHA1(uuid.NameSpaceDNS, []byte(name)) @@ -47,6 +47,10 @@ func NewWithNamespace(name string) string { // NewWithAlphabet returns a new UUIDv4, encoded with base57 using the // alternative alphabet abc. func NewWithAlphabet(abc string) string { - enc := base57{newAlphabet(abc)} + enc := encoder{newAlphabet(abc)} return enc.Encode(uuid.New()) } + +func hasPrefixCaseInsensitive(s, prefix string) bool { + return len(s) >= len(prefix) && strings.EqualFold(s[:len(prefix)], prefix) +} diff --git a/shortuuid_test.go b/shortuuid_test.go index d5f0b71..1ff900d 100644 --- a/shortuuid_test.go +++ b/shortuuid_test.go @@ -201,7 +201,7 @@ func TestDecodingErrors(t *testing.T) { func TestNewWithAlphabet(t *testing.T) { abc := DefaultAlphabet[:len(DefaultAlphabet)-1] + "=" - enc := base57{newAlphabet(abc)} + enc := encoder{newAlphabet(abc)} u1, _ := uuid.Parse("e9ae9ba7-4fb1-4a6d-bbca-5315ed438371") u2 := enc.Encode(u1) if u2 != "iZsai==fWebXd5rLRWFB=u" { @@ -209,6 +209,56 @@ func TestNewWithAlphabet(t *testing.T) { } } +func TestNewWithAlphabet_MultipleBytes(t *testing.T) { + abc := DefaultAlphabet[:len(DefaultAlphabet)-2] + "おネ" + enc := encoder{newAlphabet(abc)} + u1, _ := uuid.Parse("e9ae9ba7-4fb1-4a6d-bbca-5315ed438374") + u2 := enc.Encode(u1) + if u2 != "jatbjAAgXfcYe5sMSXGCAお" { + t.Errorf("expected uuid to be %q, got %q", "jatbjAAgXfcYe5sMSXGCAお", u2) + } +} + +func TestAlphabetCustomLen(t *testing.T) { + abc := "21345687654123456" + enc := encoder{newAlphabet(abc)} + u1, _ := uuid.Parse("13ef31aa-934b-4f37-93b3-6e3ef30148e2") + exp := "1348474176355756628268227744454847411355453" + u2 := enc.Encode(u1) + if u2 != exp { + t.Errorf("expected uuid to be %q, got %q", exp, u2) + return + } + u3, err := enc.Decode(u2) + if err != nil { + t.Error(err) + return + } + if u1 != u3 { + t.Errorf("expected %q, got %q", u1, u3) + } +} + +func TestAlphabet_MB(t *testing.T) { + abc := "うえおなにぬねのウエオナニヌネノ" + enc := encoder{newAlphabet(abc)} + u1, _ := uuid.Parse("13ef31aa-934b-4f37-93b3-6e3ef30148e2") + exp := "えなネノなえオオエなにナにノなのエなナなねネなネノなうえにウネお" + u2 := enc.Encode(u1) + if u2 != exp { + t.Errorf("expected uuid to be %q, got %q", exp, u2) + return + } + u3, err := enc.Decode(u2) + if err != nil { + t.Error(err) + return + } + if u1 != u3 { + t.Errorf("expected %q, got %q", u1, u3) + } +} + func BenchmarkUUID(b *testing.B) { for i := 0; i < b.N; i++ { New() @@ -222,8 +272,82 @@ func BenchmarkEncoding(b *testing.B) { } } +func BenchmarkEncodingB57_MB(b *testing.B) { + u := uuid.New() + enc := encoder{alphabet: newAlphabet("23456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghiうえおなにぬねのウエオナニヌネノ")} + for i := 0; i < b.N; i++ { + enc.Encode(u) + } +} + +func BenchmarkEncodingB16(b *testing.B) { + u := uuid.New() + enc := encoder{alphabet: newAlphabet("0123456789abcdef")} + for i := 0; i < b.N; i++ { + enc.Encode(u) + } +} + +func BenchmarkEncodingB16_MB(b *testing.B) { + u := uuid.New() + enc := encoder{alphabet: newAlphabet("うえおなにぬねのウエオナニヌネノ")} + for i := 0; i < b.N; i++ { + enc.Encode(u) + } +} + func BenchmarkDecoding(b *testing.B) { for i := 0; i < b.N; i++ { _, _ = DefaultEncoder.Decode("nUfojcH2M5j9j3Tk5A8mf7") } } + +func BenchmarkDecodingB16(b *testing.B) { + enc := encoder{alphabet: newAlphabet("0123456789abcdef")} + for i := 0; i < b.N; i++ { + _, _ = enc.Decode("b430e18862a84ec58068d03898d94f5f") + } +} + +func BenchmarkDecodingB16_MB(b *testing.B) { + enc := encoder{alphabet: newAlphabet("うえおなにぬねのウエオナニヌネノ")} + for i := 0; i < b.N; i++ { + _, _ = enc.Decode("えなネノなえオオエなにナにノなのエなナなねネなネノなうえにウネお") + } +} + +func BenchmarkNewWithAlphabet(b *testing.B) { + for i := 0; i < b.N; i++ { + _ = NewWithAlphabet("23456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxy!") + } +} + +func BenchmarkNewWithAlphabetB16(b *testing.B) { + for i := 0; i < b.N; i++ { + _ = NewWithAlphabet("0123456789abcdef") + } +} + +func BenchmarkNewWithAlphabetB16_MB(b *testing.B) { + for i := 0; i < b.N; i++ { + _ = NewWithAlphabet("うえおなにぬねのウエオナニヌネノ") + } +} + +func BenchmarkNewWithNamespace(b *testing.B) { + for i := 0; i < b.N; i++ { + _ = NewWithNamespace("someaveragelengthurl") + } +} + +func BenchmarkNewWithNamespaceHttp(b *testing.B) { + for i := 0; i < b.N; i++ { + _ = NewWithNamespace("http://someaveragelengthurl.test") + } +} + +func BenchmarkNewWithNamespaceHttps(b *testing.B) { + for i := 0; i < b.N; i++ { + _ = NewWithNamespace("https://someaveragelengthurl.test") + } +}