feat: return error when a null value is in the dataset
Chao-Ma5566 committed Apr 23, 2024
1 parent e7f8a6c commit cfaa587
Showing 9 changed files with 135 additions and 56 deletions.
4 changes: 3 additions & 1 deletion cmd/sigo/main.go
@@ -175,7 +175,9 @@ func run(definition pdef, logs logs) {
err = sigo.Anonymize(source, sigo.NewKDTreeFactory(), definition.k, definition.l,
len(definition.qi), newAnonymizer(definition.method, definition.args), sink, debugger)
if err != nil {
panic(err)
log.Err(err).Msg("Anonymize failed")
log.Warn().Msg("End SIGO")
os.Exit(1)
}

if logs.profiling {
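
The change above replaces panic(err) with a logged error and a non-zero exit. A minimal, self-contained sketch of that pattern, assuming the zerolog global logger that sigo already uses; anonymize here is a hypothetical stand-in for the real sigo.Anonymize call:

```go
package main

import (
	"errors"
	"os"

	"github.com/rs/zerolog/log"
)

// anonymize is a hypothetical stand-in for the sigo.Anonymize call in run().
func anonymize() error {
	return errors.New("null value in dataset")
}

func main() {
	if err := anonymize(); err != nil {
		// Log the failure and end with a non-zero exit code instead of panicking,
		// so scripts invoking sigo can detect the error cleanly.
		log.Err(err).Msg("Anonymize failed")
		log.Warn().Msg("End SIGO")
		os.Exit(1)
	}
}
```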
33 changes: 16 additions & 17 deletions internal/infra/source.go
@@ -19,9 +19,7 @@ package infra

import (
"errors"
"fmt"
"io"
"strconv"

"github.com/cgi-fr/jsonline/pkg/jsonline"
"github.com/cgi-fr/sigo/pkg/sigo"
@@ -45,25 +43,26 @@ func (jlr JSONLineRecord) QuasiIdentifer() ([]float64, error) {
if value == nil {
//nolint: goerr113
err := errors.New("null value in dataset")
return []float64{}, err
}

switch v := value.(type) {
case float64:
result = append(result, v)
case int:
result = append(result, float64(v))
case string:
floatValue, err := strconv.ParseFloat(v, 64)
if err != nil {
return []float64{}, err
}
result = append(result, floatValue)
default:
err := fmt.Errorf("unsupported type: %T", v)
return []float64{}, err
}

result = append(result, (*jlr.row).GetFloat64(key))

// switch v := value.(type) {
// case float64:
// result = append(result, v)
// case json.Number:
// floatValue, err := v.Float64()
// if err != nil {
// return []float64{}, err
// }
// result = append(result, floatValue)
// default:
// err := fmt.Errorf("unsupported type: %T", v)
// return []float64{}, err
// }

}

return result, nil
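
QuasiIdentifer now fails fast on a null quasi-identifier instead of parsing or coercing the value. A reduced sketch of the same null-check, using a plain map[string]interface{} in place of the jsonline row; quasiIdentifer and record below are illustrative names, not sigo's API:

```go
package main

import (
	"errors"
	"fmt"
)

// quasiIdentifer mimics the new check: any nil quasi-identifier aborts the
// whole record with an error instead of being silently turned into a number.
// The map stands in for the jsonline row used by the real JSONLineRecord.
func quasiIdentifer(record map[string]interface{}, keys []string) ([]float64, error) {
	result := make([]float64, 0, len(keys))

	for _, key := range keys {
		value := record[key]
		if value == nil {
			return nil, errors.New("null value in dataset")
		}

		switch v := value.(type) {
		case float64:
			result = append(result, v)
		case int:
			result = append(result, float64(v))
		default:
			return nil, fmt.Errorf("unsupported type: %T", v)
		}
	}

	return result, nil
}

func main() {
	_, err := quasiIdentifer(map[string]interface{}{"x": nil, "y": 1.0}, []string{"x", "y"})
	fmt.Println(err) // null value in dataset
}
```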
43 changes: 31 additions & 12 deletions pkg/sigo/anonymizer.go
@@ -18,6 +18,7 @@
package sigo

import (
"fmt"
"os"

"github.com/cgi-fr/jsonline/pkg/cast"
@@ -97,7 +98,7 @@ type (
}
)

func (ar AnonymizedRecord) QuasiIdentifer() []float64 {
func (ar AnonymizedRecord) QuasiIdentifer() ([]float64, error) {
return ar.original.QuasiIdentifer()
}

@@ -142,7 +143,7 @@ func (a GeneralAnonymizer) Anonymize(rec Record, clus Cluster, qi, s []string) R

// ComputeGeneralization calculates the min and max values of the cluster for each qi.
func (a GeneralAnonymizer) ComputeGeneralization(clus Cluster, qi []string) {
values := listValues(clus, qi)
values, _ := listValues(clus, qi)

boundsVal := make(map[string]bounds)

@@ -175,7 +176,7 @@ func (a AggregationAnonymizer) Anonymize(rec Record, clus Cluster, qi, s []strin
// ComputeAggregation calculates the mean (method meanAggregation)
// or median (method medianAggregation) value of the cluster for each qi.
func (a AggregationAnonymizer) ComputeAggregation(clus Cluster, qi []string) {
values := listValues(clus, qi)
values, _ := listValues(clus, qi)

valAggregation := make(map[string]float64)

@@ -196,7 +197,7 @@ func (a AggregationAnonymizer) ComputeAggregation(clus Cluster, qi []string) {
// if the record is > Q3 then it takes the Q3 value
// if the record is < Q1 then it takes the Q1 value.
func (a CodingAnonymizer) Anonymize(rec Record, clus Cluster, qi, s []string) Record {
values := listValues(clus, qi)
values, _ := listValues(clus, qi)
mask := map[string]interface{}{}

for i, key := range qi {
@@ -205,7 +206,12 @@ func (a CodingAnonymizer) Anonymize(rec Record, clus Cluster, qi, s []string) Re
bottom := q.Q1
top := q.Q3

val := rec.QuasiIdentifer()[i]
recVals, err := rec.QuasiIdentifer()
if err != nil {
fmt.Println(err)
break
}
val := recVals[i]

switch {
case val < bottom:
@@ -224,11 +230,16 @@ func (a CodingAnonymizer) Anonymize(rec Record, clus Cluster, qi, s []string) Re
// the record takes as value the original value added to a Laplacian or Gaussian noise
// the anonymized value stays within the bounds of the cluster.
func (a NoiseAnonymizer) Anonymize(rec Record, clus Cluster, qi, s []string) Record {
values := listValues(clus, qi)
values, _ := listValues(clus, qi)
mask := map[string]interface{}{}

for i, key := range qi {
val := rec.QuasiIdentifer()[i]
recVals, err := rec.QuasiIdentifer()
if err != nil {
fmt.Println(err)
break
}
val := recVals[i]

laplaceVal := Scaling(val, values[key], laplace)
gaussianVal := Scaling(val, values[key], gaussian)
@@ -281,7 +292,8 @@ func (a SwapAnonymizer) Anonymize(rec Record, clus Cluster, qi, s []string) Reco

func (a SwapAnonymizer) Swap(clus Cluster, qi []string) {
// retrieve the cluster values for each qi
values := listValues(clus, qi)
values, _ := listValues(clus, qi)

swapVal := make(map[string][]float64)

for _, key := range qi {
@@ -416,7 +428,8 @@ func (r Reidentification) Statistics(idCluster string, q string) (mean float64,

// ComputeSimilarity computes the similarity score between the record rec and the anonymized cluster data.
func (r Reidentification) ComputeSimilarity(rec Record, clus Cluster,
qi []string, s []string) map[float64]interface{} {
qi []string, s []string,
) map[float64]interface{} {
scores := make(map[float64]interface{})

x := make(map[string]interface{})
@@ -448,14 +461,20 @@ func (r Reidentification) ComputeSimilarity(rec Record, clus Cluster,
}

// Returns the list of values present in the cluster for each qi.
func listValues(clus Cluster, qi []string) (mapValues map[string][]float64) {
func listValues(clus Cluster, qi []string) (mapValues map[string][]float64, err error) {
mapValues = make(map[string][]float64)

for _, record := range clus.Records() {
for i, key := range qi {
mapValues[key] = append(mapValues[key], record.QuasiIdentifer()[i])
vals, _ := record.QuasiIdentifer()
// if err != nil {
// fmt.Println(err)
// return map[string][]float64{}, err
// }
val := vals[i]
mapValues[key] = append(mapValues[key], val)
}
}

return mapValues
return mapValues, nil
}
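
The anonymizers above discard the error returned by listValues (values, _ := listValues(...)). A hedged sketch of how that error could be propagated instead, with Record and Cluster reduced to minimal local stand-ins rather than sigo's real types:

```go
package main

import (
	"errors"
	"fmt"
)

// Minimal stand-ins for sigo's Record and Cluster, just enough to exercise
// the sketch below; the real types live in pkg/sigo.
type record struct {
	vals []float64
	err  error
}

func (r record) QuasiIdentifer() ([]float64, error) { return r.vals, r.err }

type cluster struct{ recs []record }

func (c cluster) Records() []record { return c.recs }

// listValues gathers the cluster values per quasi-identifier and surfaces the
// first failing record instead of dropping the error.
func listValues(clus cluster, qi []string) (map[string][]float64, error) {
	mapValues := make(map[string][]float64)

	for _, rec := range clus.Records() {
		vals, err := rec.QuasiIdentifer()
		if err != nil {
			return nil, fmt.Errorf("listValues: %w", err)
		}
		for i, key := range qi {
			mapValues[key] = append(mapValues[key], vals[i])
		}
	}

	return mapValues, nil
}

func main() {
	clus := cluster{recs: []record{
		{vals: []float64{1, 2}},
		{err: errors.New("null value in dataset")},
	}}
	_, err := listValues(clus, []string{"x", "y"})
	fmt.Println(err) // listValues: null value in dataset
}
```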
9 changes: 6 additions & 3 deletions pkg/sigo/driver.go
@@ -24,12 +24,12 @@ import (
)

func Anonymize(source RecordSource, factory GeneralizerFactory,
k int, l int, dim int, anonymyzer Anonymizer, sink RecordSink, debugger Debugger) error {
k int, l int, dim int, anonymyzer Anonymizer, sink RecordSink, debugger Debugger,
) error {
generalizer := factory.New(k, l, dim, source.QuasiIdentifer())
count := 0

log.Info().Msg("Reading source")

for source.Next() {
if source.Err() != nil {
return fmt.Errorf("%w", source.Err())
@@ -42,7 +42,10 @@ func Anonymize(source RecordSource, factory GeneralizerFactory,
log.Info().Msgf("%v individuals to anonymize", count)
log.Info().Msg("Tree building")

generalizer.Build()
err := generalizer.Build()
if err != nil {
return err
}

log.Info().Msg("Cluster Anonymization")

24 changes: 24 additions & 0 deletions pkg/sigo/driver_test.go
@@ -144,3 +144,27 @@ func BenchmarkLongClustering(b *testing.B) {
jsonBytes.Close()
}
}

func TestNullValueShouldReturnError(t *testing.T) {
t.Parallel()

// nolint: goconst
sourceText := `{"x":0, "y":0, "foo":"bar"}
{"x":null, "y":1, "foo":"bar"}
{"x":0, "y":null, "foo":"bar"}
{"x":2, "y":1, "foo":"null"}
{"x":3, "y":2, "foo":"baz"}
{"x":2, "y":3, "foo":"baz"}`

expectedMessage := "null value in dataset"

source, err := infra.NewJSONLineSource(strings.NewReader(sourceText), []string{"x", "y"}, []string{"foo"})
assert.Nil(t, err)

result := []map[string]interface{}{}
sink := infra.NewSliceDictionariesSink(&result)
err = sigo.Anonymize(source, sigo.NewKDTreeFactory(), 2, 1, 2, sigo.NewNoAnonymizer(), sink,
sigo.NewSequenceDebugger("clusterID"))
assert.NotNil(t, err)
assert.Equal(t, expectedMessage, err.Error())
}
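
The new test compares err.Error() against a literal string. One possible alternative, sketched below with a hypothetical ErrNullValue sentinel that is not part of sigo's current API, would let callers check the failure with errors.Is even when the error gets wrapped with extra context:

```go
package main

import (
	"errors"
	"fmt"
)

// ErrNullValue is a hypothetical sentinel; sigo currently builds the error
// with errors.New inside QuasiIdentifer instead of exporting one.
var ErrNullValue = errors.New("null value in dataset")

// anonymize stands in for sigo.Anonymize and wraps the sentinel with context.
func anonymize() error {
	return fmt.Errorf("record 2: %w", ErrNullValue)
}

func main() {
	if err := anonymize(); errors.Is(err, ErrNullValue) {
		fmt.Println("dataset contains a null quasi-identifier:", err)
	}
}
```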
2 changes: 1 addition & 1 deletion pkg/sigo/info.go
@@ -29,7 +29,7 @@ type InfosRecord struct {
infos map[string]interface{}
}

func (ir InfosRecord) QuasiIdentifer() []float64 {
func (ir InfosRecord) QuasiIdentifer() ([]float64, error) {
return ir.original.QuasiIdentifer()
}

66 changes: 49 additions & 17 deletions pkg/sigo/kdtree.go
@@ -63,8 +63,8 @@ func (t KDTree) Add(r Record) {
}

// Build starts building the tree.
func (t KDTree) Build() {
t.root.build()
func (t KDTree) Build() error {
return t.root.build()
}

// Clusters returns the list of clusters in the tree.
@@ -77,7 +77,7 @@ func (t KDTree) String() string {
return t.root.string(0)
}

//nolint: revive, golint
// nolint: revive, golint
func NewNode(tree *KDTree, path string, rot int) node {
return node{
tree: tree,
@@ -111,7 +111,7 @@ func (n *node) incRot() {
}

// build creates nodes.
func (n *node) build() {
func (n *node) build() error {
log.Debug().
Str("Dimension", n.tree.qi[n.rot]).
Str("Path", n.clusterPath).
@@ -123,19 +123,22 @@
var (
lower, upper node
valide bool
err error
)

for i := 1; i <= n.tree.dim; i++ {
lower, upper, valide = n.split()
if !valide {
lower, upper, valide, err = n.split()
if err != nil {
return err
} else if !valide {
n.incRot()
} else {
break
}
}

if !valide {
return
return nil
}

lower.validate()
@@ -147,17 +150,43 @@
}

n.cluster = nil
n.subNodes[0].build()
n.subNodes[1].build()
err = n.subNodes[0].build()
if err != nil {
return err
}
err = n.subNodes[1].build()
if err != nil {
return err
}
}

return nil
}

// split creates 2 subnodes by ordering the node and splitting in order to have 2 equal parts
// and all elements having the same value in the same subnode.
func (n *node) split() (node, node, bool) {
sort.SliceStable(n.cluster, func(i int, j int) bool {
return n.cluster[i].QuasiIdentifer()[n.rot] < n.cluster[j].QuasiIdentifer()[n.rot]
})
func (n *node) split() (node, node, bool, error) {
var globalError error

less := func(i, j int) bool {
valueI, err := n.cluster[i].QuasiIdentifer()
if err != nil {
// Store the error in the variable outside the closure
globalError = err
return false
}
valueJ, err := n.cluster[j].QuasiIdentifer()
if err != nil {
globalError = err
return false
}
return valueI[n.rot] < valueJ[n.rot]
}

sort.SliceStable(n.cluster, less)
if globalError != nil {
return node{}, node{}, false, globalError
}

n.pivot = nil
lower := NewNode(n.tree, n.clusterPath+"-l", n.rot+1)
@@ -168,21 +197,23 @@ func (n *node) split() (node, node, bool) {
previous := n.cluster[0]

for _, row := range n.cluster {
rowValue, _ := row.QuasiIdentifer()
previousValue, _ := previous.QuasiIdentifer()
// equal subnodes and all elements having the same value in the same subnode
if lowerSize < len(n.cluster)/2 || row.QuasiIdentifer()[n.rot] == previous.QuasiIdentifer()[n.rot] {
if lowerSize < len(n.cluster)/2 || rowValue[n.rot] == previousValue[n.rot] {
lower.Add(row)
previous = row
lowerSize++
} else {
if n.pivot == nil {
n.pivot = row.QuasiIdentifer()
n.pivot = rowValue
}
upper.Add(row)
upperSize++
}
}

return lower, upper, upperSize >= n.tree.k && lower.wellLDiv() && upper.wellLDiv()
return lower, upper, upperSize >= n.tree.k && lower.wellLDiv() && upper.wellLDiv(), nil
}

// Records returns the list of records in the node.
@@ -214,7 +245,8 @@ func (n *node) string(offset int) string {
result := "["
for _, rec := range n.cluster {
// result += fmt.Sprintf("%v ", rec.QuasiIdentifer()[n.rot])
result += fmt.Sprintf("%v ", rec.QuasiIdentifer())
recValue, _ := rec.QuasiIdentifer()
result += fmt.Sprintf("%v ", recValue)
}

result += "]"
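
split() above captures the first error raised inside the sort comparator, since sort.SliceStable's less function cannot return one. A standalone sketch of that pattern with illustrative types (item and value are stand-ins, not sigo's code):

```go
package main

import (
	"errors"
	"fmt"
	"sort"
)

type item struct {
	val  float64
	null bool
}

// value fails when the item holds a null, mirroring QuasiIdentifer.
func (it item) value() (float64, error) {
	if it.null {
		return 0, errors.New("null value in dataset")
	}
	return it.val, nil
}

// sortItems records the first error seen by the comparator in a variable
// outside the closure and checks it once the sort has finished.
func sortItems(items []item) error {
	var globalError error

	sort.SliceStable(items, func(i, j int) bool {
		vi, err := items[i].value()
		if err != nil {
			globalError = err
			return false // keep the ordering well-defined despite the error
		}
		vj, err := items[j].value()
		if err != nil {
			globalError = err
			return false
		}
		return vi < vj
	})

	return globalError
}

func main() {
	err := sortItems([]item{{val: 3}, {null: true}, {val: 1}})
	fmt.Println(err) // null value in dataset
}
```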