From cfaa5873bfeabd347413facf46191c60ab0e91f8 Mon Sep 17 00:00:00 2001 From: "jianchao.ma" Date: Tue, 23 Apr 2024 15:15:31 +0000 Subject: [PATCH] feat:return error when a null value in dataset --- cmd/sigo/main.go | 4 +- internal/infra/source.go | 33 ++++++++-------- pkg/sigo/anonymizer.go | 43 ++++++++++++++------ pkg/sigo/driver.go | 9 +++-- pkg/sigo/driver_test.go | 24 +++++++++++ pkg/sigo/info.go | 2 +- pkg/sigo/kdtree.go | 66 +++++++++++++++++++++++-------- pkg/sigo/model.go | 4 +- test/suites/04-run-anonymizer.yml | 6 +-- 9 files changed, 135 insertions(+), 56 deletions(-) diff --git a/cmd/sigo/main.go b/cmd/sigo/main.go index 5a6fb7f..ffd28ef 100644 --- a/cmd/sigo/main.go +++ b/cmd/sigo/main.go @@ -175,7 +175,9 @@ func run(definition pdef, logs logs) { err = sigo.Anonymize(source, sigo.NewKDTreeFactory(), definition.k, definition.l, len(definition.qi), newAnonymizer(definition.method, definition.args), sink, debugger) if err != nil { - panic(err) + log.Err(err).Msg("Anonymize failed") + log.Warn().Msg("End SIGO") + os.Exit(1) } if logs.profiling { diff --git a/internal/infra/source.go b/internal/infra/source.go index e0e71b3..45d9f76 100644 --- a/internal/infra/source.go +++ b/internal/infra/source.go @@ -19,9 +19,7 @@ package infra import ( "errors" - "fmt" "io" - "strconv" "github.com/cgi-fr/jsonline/pkg/jsonline" "github.com/cgi-fr/sigo/pkg/sigo" @@ -45,25 +43,26 @@ func (jlr JSONLineRecord) QuasiIdentifer() ([]float64, error) { if value == nil { //nolint: goerr113 err := errors.New("null value in dataset") - return []float64{}, err - } - switch v := value.(type) { - case float64: - result = append(result, v) - case int: - result = append(result, float64(v)) - case string: - floatValue, err := strconv.ParseFloat(v, 64) - if err != nil { - return []float64{}, err - } - result = append(result, floatValue) - default: - err := fmt.Errorf("unsupported type: %T", v) return []float64{}, err } + result = append(result, (*jlr.row).GetFloat64(key)) + + // switch v := value.(type) { + // case float64: + // result = append(result, v) + // case json.Number: + // floatValue, err := v.Float64() + // if err != nil { + // return []float64{}, err + // } + // result = append(result, floatValue) + // default: + // err := fmt.Errorf("unsupported type: %T", v) + // return []float64{}, err + // } + } return result, nil diff --git a/pkg/sigo/anonymizer.go b/pkg/sigo/anonymizer.go index 40ea218..37fa90f 100644 --- a/pkg/sigo/anonymizer.go +++ b/pkg/sigo/anonymizer.go @@ -18,6 +18,7 @@ package sigo import ( + "fmt" "os" "github.com/cgi-fr/jsonline/pkg/cast" @@ -97,7 +98,7 @@ type ( } ) -func (ar AnonymizedRecord) QuasiIdentifer() []float64 { +func (ar AnonymizedRecord) QuasiIdentifer() ([]float64, error) { return ar.original.QuasiIdentifer() } @@ -142,7 +143,7 @@ func (a GeneralAnonymizer) Anonymize(rec Record, clus Cluster, qi, s []string) R // ComputeGeneralization calculates the min and max values of the cluster for each qi. func (a GeneralAnonymizer) ComputeGeneralization(clus Cluster, qi []string) { - values := listValues(clus, qi) + values, _ := listValues(clus, qi) boundsVal := make(map[string]bounds) @@ -175,7 +176,7 @@ func (a AggregationAnonymizer) Anonymize(rec Record, clus Cluster, qi, s []strin // ComputeAggregation calculates the mean (method meanAggreagtion) // or median (method medianAggregation) value of the cluster for each qi. func (a AggregationAnonymizer) ComputeAggregation(clus Cluster, qi []string) { - values := listValues(clus, qi) + values, _ := listValues(clus, qi) valAggregation := make(map[string]float64) @@ -196,7 +197,7 @@ func (a AggregationAnonymizer) ComputeAggregation(clus Cluster, qi []string) { // if the record is > Q3 then it takes the Q3 value // if the record is < Q1 then it takes the Q1 value. func (a CodingAnonymizer) Anonymize(rec Record, clus Cluster, qi, s []string) Record { - values := listValues(clus, qi) + values, _ := listValues(clus, qi) mask := map[string]interface{}{} for i, key := range qi { @@ -205,7 +206,12 @@ func (a CodingAnonymizer) Anonymize(rec Record, clus Cluster, qi, s []string) Re bottom := q.Q1 top := q.Q3 - val := rec.QuasiIdentifer()[i] + recVals, err := rec.QuasiIdentifer() + if err != nil { + fmt.Println(err) + break + } + val := recVals[i] switch { case val < bottom: @@ -224,11 +230,16 @@ func (a CodingAnonymizer) Anonymize(rec Record, clus Cluster, qi, s []string) Re // the record takes as value the original value added to a Laplacian or Gaussian noise // the anonymized value stays within the bounds of the cluster. func (a NoiseAnonymizer) Anonymize(rec Record, clus Cluster, qi, s []string) Record { - values := listValues(clus, qi) + values, _ := listValues(clus, qi) mask := map[string]interface{}{} for i, key := range qi { - val := rec.QuasiIdentifer()[i] + recVals, err := rec.QuasiIdentifer() + if err != nil { + fmt.Println(err) + break + } + val := recVals[i] laplaceVal := Scaling(val, values[key], laplace) gaussianVal := Scaling(val, values[key], gaussian) @@ -281,7 +292,8 @@ func (a SwapAnonymizer) Anonymize(rec Record, clus Cluster, qi, s []string) Reco func (a SwapAnonymizer) Swap(clus Cluster, qi []string) { // retrieve the cluster values for each qi - values := listValues(clus, qi) + values, _ := listValues(clus, qi) + swapVal := make(map[string][]float64) for _, key := range qi { @@ -416,7 +428,8 @@ func (r Reidentification) Statistics(idCluster string, q string) (mean float64, // ComputeSimilarity computes the similarity score between the record rec and the anonymized cluster data. func (r Reidentification) ComputeSimilarity(rec Record, clus Cluster, - qi []string, s []string) map[float64]interface{} { + qi []string, s []string, +) map[float64]interface{} { scores := make(map[float64]interface{}) x := make(map[string]interface{}) @@ -448,14 +461,20 @@ func (r Reidentification) ComputeSimilarity(rec Record, clus Cluster, } // Returns the list of values present in the cluster for each qi. -func listValues(clus Cluster, qi []string) (mapValues map[string][]float64) { +func listValues(clus Cluster, qi []string) (mapValues map[string][]float64, err error) { mapValues = make(map[string][]float64) for _, record := range clus.Records() { for i, key := range qi { - mapValues[key] = append(mapValues[key], record.QuasiIdentifer()[i]) + vals, _ := record.QuasiIdentifer() + // if err != nil { + // fmt.Println(err) + // return map[string][]float64{}, err + // } + val := vals[i] + mapValues[key] = append(mapValues[key], val) } } - return mapValues + return mapValues, nil } diff --git a/pkg/sigo/driver.go b/pkg/sigo/driver.go index 214e9ba..99538c0 100644 --- a/pkg/sigo/driver.go +++ b/pkg/sigo/driver.go @@ -24,12 +24,12 @@ import ( ) func Anonymize(source RecordSource, factory GeneralizerFactory, - k int, l int, dim int, anonymyzer Anonymizer, sink RecordSink, debugger Debugger) error { + k int, l int, dim int, anonymyzer Anonymizer, sink RecordSink, debugger Debugger, +) error { generalizer := factory.New(k, l, dim, source.QuasiIdentifer()) count := 0 log.Info().Msg("Reading source") - for source.Next() { if source.Err() != nil { return fmt.Errorf("%w", source.Err()) @@ -42,7 +42,10 @@ func Anonymize(source RecordSource, factory GeneralizerFactory, log.Info().Msgf("%v individuals to anonymize", count) log.Info().Msg("Tree building") - generalizer.Build() + err := generalizer.Build() + if err != nil { + return err + } log.Info().Msg("Cluster Anonymization") diff --git a/pkg/sigo/driver_test.go b/pkg/sigo/driver_test.go index 0be482a..fa717da 100644 --- a/pkg/sigo/driver_test.go +++ b/pkg/sigo/driver_test.go @@ -144,3 +144,27 @@ func BenchmarkLongClustering(b *testing.B) { jsonBytes.Close() } } + +func TestNullValueShouldReturnError(t *testing.T) { + t.Parallel() + + // nolint: goconst + sourceText := `{"x":0, "y":0, "foo":"bar"} + {"x":null, "y":1, "foo":"bar"} + {"x":0, "y":null, "foo":"bar"} + {"x":2, "y":1, "foo":"null"} + {"x":3, "y":2, "foo":"baz"} + {"x":2, "y":3, "foo":"baz"}` + + expectedMessage := "null value in dataset" + + source, err := infra.NewJSONLineSource(strings.NewReader(sourceText), []string{"x", "y"}, []string{"foo"}) + assert.Nil(t, err) + + result := []map[string]interface{}{} + sink := infra.NewSliceDictionariesSink(&result) + err = sigo.Anonymize(source, sigo.NewKDTreeFactory(), 2, 1, 2, sigo.NewNoAnonymizer(), sink, + sigo.NewSequenceDebugger("clusterID")) + assert.NotNil(t, err) + assert.Equal(t, expectedMessage, err.Error()) +} diff --git a/pkg/sigo/info.go b/pkg/sigo/info.go index b3b9735..e86b33c 100644 --- a/pkg/sigo/info.go +++ b/pkg/sigo/info.go @@ -29,7 +29,7 @@ type InfosRecord struct { infos map[string]interface{} } -func (ir InfosRecord) QuasiIdentifer() []float64 { +func (ir InfosRecord) QuasiIdentifer() ([]float64, error) { return ir.original.QuasiIdentifer() } diff --git a/pkg/sigo/kdtree.go b/pkg/sigo/kdtree.go index a866e63..f79e189 100644 --- a/pkg/sigo/kdtree.go +++ b/pkg/sigo/kdtree.go @@ -63,8 +63,8 @@ func (t KDTree) Add(r Record) { } // Build starts building the tree. -func (t KDTree) Build() { - t.root.build() +func (t KDTree) Build() error { + return t.root.build() } // Clusters returns the list of clusters in the tree. @@ -77,7 +77,7 @@ func (t KDTree) String() string { return t.root.string(0) } -//nolint: revive, golint +// nolint: revive, golint func NewNode(tree *KDTree, path string, rot int) node { return node{ tree: tree, @@ -111,7 +111,7 @@ func (n *node) incRot() { } // build creates nodes. -func (n *node) build() { +func (n *node) build() error { log.Debug(). Str("Dimension", n.tree.qi[n.rot]). Str("Path", n.clusterPath). @@ -123,11 +123,14 @@ func (n *node) build() { var ( lower, upper node valide bool + err error ) for i := 1; i <= n.tree.dim; i++ { - lower, upper, valide = n.split() - if !valide { + lower, upper, valide, err = n.split() + if err != nil { + return err + } else if !valide { n.incRot() } else { break @@ -135,7 +138,7 @@ func (n *node) build() { } if !valide { - return + return nil } lower.validate() @@ -147,17 +150,43 @@ func (n *node) build() { } n.cluster = nil - n.subNodes[0].build() - n.subNodes[1].build() + err = n.subNodes[0].build() + if err != nil { + return err + } + err = n.subNodes[1].build() + if err != nil { + return err + } } + + return nil } // split creates 2 subnodes by ordering the node and splitting in order to have 2 equal parts // and all elements having the same value in the same subnode. -func (n *node) split() (node, node, bool) { - sort.SliceStable(n.cluster, func(i int, j int) bool { - return n.cluster[i].QuasiIdentifer()[n.rot] < n.cluster[j].QuasiIdentifer()[n.rot] - }) +func (n *node) split() (node, node, bool, error) { + var globalError error + + less := func(i, j int) bool { + valueI, err := n.cluster[i].QuasiIdentifer() + if err != nil { + // Stocker l'erreur dans la variable globale + globalError = err + return false + } + valueJ, err := n.cluster[j].QuasiIdentifer() + if err != nil { + globalError = err + return false + } + return valueI[n.rot] < valueJ[n.rot] + } + + sort.SliceStable(n.cluster, less) + if globalError != nil { + return node{}, node{}, false, globalError + } n.pivot = nil lower := NewNode(n.tree, n.clusterPath+"-l", n.rot+1) @@ -168,21 +197,23 @@ func (n *node) split() (node, node, bool) { previous := n.cluster[0] for _, row := range n.cluster { + rowValue, _ := row.QuasiIdentifer() + previousValue, _ := previous.QuasiIdentifer() // equal subnodes and all elements having the same value in the same subnode - if lowerSize < len(n.cluster)/2 || row.QuasiIdentifer()[n.rot] == previous.QuasiIdentifer()[n.rot] { + if lowerSize < len(n.cluster)/2 || rowValue[n.rot] == previousValue[n.rot] { lower.Add(row) previous = row lowerSize++ } else { if n.pivot == nil { - n.pivot = row.QuasiIdentifer() + n.pivot = rowValue } upper.Add(row) upperSize++ } } - return lower, upper, upperSize >= n.tree.k && lower.wellLDiv() && upper.wellLDiv() + return lower, upper, upperSize >= n.tree.k && lower.wellLDiv() && upper.wellLDiv(), nil } // Records returns the list of records in the node. @@ -214,7 +245,8 @@ func (n *node) string(offset int) string { result := "[" for _, rec := range n.cluster { // result += fmt.Sprintf("%v ", rec.QuasiIdentifer()[n.rot]) - result += fmt.Sprintf("%v ", rec.QuasiIdentifer()) + recValue, _ := rec.QuasiIdentifer() + result += fmt.Sprintf("%v ", recValue) } result += "]" diff --git a/pkg/sigo/model.go b/pkg/sigo/model.go index b0cc461..9a9c491 100644 --- a/pkg/sigo/model.go +++ b/pkg/sigo/model.go @@ -30,7 +30,7 @@ type RecordSink interface { } type Record interface { - QuasiIdentifer() []float64 + QuasiIdentifer() ([]float64, error) Sensitives() []interface{} Row() map[string]interface{} } @@ -44,7 +44,7 @@ type Generalizer interface { Add(Record) Clusters() []Cluster String() string - Build() + Build() error } type GeneralizerFactory interface { diff --git a/test/suites/04-run-anonymizer.yml b/test/suites/04-run-anonymizer.yml index 1ccab1f..8e9e42f 100644 --- a/test/suites/04-run-anonymizer.yml +++ b/test/suites/04-run-anonymizer.yml @@ -57,7 +57,7 @@ testcases: - script: rm -f output_sigo.json - script: rm -f output_jq.json - script: |- - sigo -q taille,poids,fruit,natation,course,voltige -s meurtre -k 2 -l 2 -a general > output_sigo.json < output_sigo.json <