Skip to content

Commit

Permalink
wisdom of obfuscation
Browse files Browse the repository at this point in the history
  • Loading branch information
babenek committed Aug 19, 2024
1 parent 346e68c commit 6ecaa4d
Show file tree
Hide file tree
Showing 10 changed files with 55 additions and 445 deletions.
269 changes: 0 additions & 269 deletions benchmark.txt

This file was deleted.

13 changes: 4 additions & 9 deletions build.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
set -e
set -x

Expand All @@ -10,17 +10,12 @@ if [ -z "${VIRTUAL_ENV}" ]; then
echo "Virtual environment has been not activated"
if ! [ -d "${THISDIR}/${VENVDIR}" ]; then
echo "Create new virtual environment"
python3.8 -m virtualenv -v --copies "${THISDIR}/${VENVDIR}"
python3.10 -m virtualenv -v --copies "${THISDIR}/${VENVDIR}"
fi
fi

if [ -z "${VIRTUAL_ENV}" ]; then
. "${THISDIR}/${VENVDIR}/bin/activate"
fi

if ! pip list | grep PyYAML; then
pip install PyYAML
fi

python download_data.py --data_dir data

python download_data.py --clean_data --jobs $(nproc)
37 changes: 36 additions & 1 deletion download_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import string
import subprocess
import sys
from argparse import ArgumentParser, Namespace
from argparse import Namespace, ArgumentParser
from multiprocessing import Pool
from typing import List

Expand Down Expand Up @@ -347,7 +347,42 @@ def get_obfuscated_value(value, meta_row: MetaRow):
return obfuscated_value


def check_asc_or_desc(line_data_value: str) -> bool:
    """Tell whether the value contains a run of 4 consecutive code points.

    Modelled on a "ValuePatternCheck"-style filter: a value such as
    ``"abcd"``, ``"1234"`` or ``"dcba"`` looks like a keyboard/sequence
    pattern rather than a random secret.

    A step between two neighbouring characters counts only when the *first*
    character of the pair is an ASCII letter or digit and the code points
    differ by exactly one (``+1`` ascending, ``-1`` descending). Any other
    pair resets the corresponding run back to length 1.

    Args:
        line_data_value: The string to inspect.

    Returns:
        True as soon as an ascending or a descending run reaches 4
        characters, otherwise False (including for strings shorter than 4).
    """
    # Built once per call instead of concatenating two strings per iteration.
    alphanumeric = frozenset(string.ascii_letters + string.digits)
    run_length = 4
    count_asc = 1
    count_desc = 1
    for current, following in zip(line_data_value, line_data_value[1:]):
        # A step of 0 never matches +1/-1, so a non-alphanumeric left
        # character resets both runs.
        step = ord(following) - ord(current) if current in alphanumeric else 0
        count_asc = count_asc + 1 if step == 1 else 1
        count_desc = count_desc + 1 if step == -1 else 1
        if run_length in (count_asc, count_desc):
            return True
    return False

def generate_value(value):
    """Return a random replacement for ``value`` that passes sanity filters.

    Candidates are drawn from ``gen_random_value(value)`` until one is found
    that would not skew detection results (false positives or negatives).
    A candidate is rejected when it:

    * contains a credential-like keyword (``api``, ``pass``, ``pwd``, or
      ``pw`` at a word boundary), case-insensitively;
    * contains the same word character repeated 4 or more times in a row;
    * contains an ascending/descending run detected by
      ``check_asc_or_desc``.

    Args:
        value: The original value, forwarded unchanged to
            ``gen_random_value``.

    Returns:
        The first generated candidate that passes every filter.
    """
    # Inside a character class "\b" means the backspace character, so the
    # word-boundary alternative has to live in a group: "pwd" or "pw\b".
    pattern_keyword = re.compile(r"(api|pass|pw(?:d|\b))", flags=re.IGNORECASE)
    pattern_similar = re.compile(r"(\w)\1{3,}")
    while True:
        new_value = gen_random_value(value)
        if pattern_keyword.search(new_value):
            continue
        if pattern_similar.search(new_value):
            continue
        if check_asc_or_desc(new_value):
            continue
        return new_value


def gen_random_value(value):
obfuscated_value = ""

digits_set = string.digits
Expand Down
8 changes: 4 additions & 4 deletions meta/2ba83c6a.csv
Original file line number Diff line number Diff line change
Expand Up @@ -595,10 +595,10 @@ Id,FileID,Domain,RepoName,FilePath,LineStart,LineEnd,GroundTruth,WithWords,Value
24213,911dde03,GitHub,2ba83c6a,data/2ba83c6a/test/911dde03.txt,22033,22033,T,F,6,70,F,F,,,,,0.0,0,F,F,F,Key
24214,8a68cd28,GitHub,2ba83c6a,data/2ba83c6a/test/8a68cd28.txt,119,119,T,F,6,38,F,F,,,,,0.0,0,F,F,F,Key
24215,5e763eae,GitHub,2ba83c6a,data/2ba83c6a/test/5e763eae.txt,195,195,T,F,6,38,F,F,,,,,0.0,0,F,F,F,Key
24216,92404dee,GitHub,2ba83c6a,data/2ba83c6a/test/92404dee.txt,250,250,T,F,6,38,F,F,,,,,0.0,0,F,F,F,Key
24216,92404dee,GitHub,2ba83c6a,data/2ba83c6a/test/92404dee.txt,250,250,F,F,6,38,F,F,,,,,0.0,0,F,F,F,Key
24217,4acf8d32,GitHub,2ba83c6a,data/2ba83c6a/test/4acf8d32.txt,37,37,F,F,-1,-1,F,F,,,,,0.0,0,F,F,F,Key:Bitbucket Client Secret:Bitbucket Client ID
24218,556bad09,GitHub,2ba83c6a,data/2ba83c6a/test/556bad09.txt,525,525,T,F,6,38,F,F,,,,,0.0,0,F,F,F,Key
24219,92404dee,GitHub,2ba83c6a,data/2ba83c6a/test/92404dee.txt,160,160,T,F,6,38,F,F,,,,,0.0,0,F,F,F,Key
24219,92404dee,GitHub,2ba83c6a,data/2ba83c6a/test/92404dee.txt,160,160,F,F,6,38,F,F,,,,,0.0,0,F,F,F,Key
24220,92404dee,GitHub,2ba83c6a,data/2ba83c6a/test/92404dee.txt,166,166,T,F,6,38,F,F,,,,,0.0,0,F,F,F,Key
24221,92404dee,GitHub,2ba83c6a,data/2ba83c6a/test/92404dee.txt,172,172,T,F,6,38,F,F,,,,,0.0,0,F,F,F,Key
24222,92404dee,GitHub,2ba83c6a,data/2ba83c6a/test/92404dee.txt,178,178,T,F,6,38,F,F,,,,,0.0,0,F,F,F,Key
Expand Down Expand Up @@ -923,8 +923,8 @@ Id,FileID,Domain,RepoName,FilePath,LineStart,LineEnd,GroundTruth,WithWords,Value
27342,eb0705d8,GitHub,2ba83c6a,data/2ba83c6a/other/eb0705d8.pod,460,460,T,F,23,39,F,F,Any,,,Secret,3.88,16,F,F,F,Key
138189,eb0705d8,GitHub,2ba83c6a,data/2ba83c6a/other/eb0705d8.pod,460,460,F,F,54,-1,F,F,,,,,0.0,0,F,F,F,Other
27386,dc676919,GitHub,2ba83c6a,data/2ba83c6a/test/dc676919.txt,1,1,F,F,32,37,F,F,,,,,0,0,F,F,F,Secret
27407,6cfa362d,GitHub,2ba83c6a,data/2ba83c6a/src/6cfa362d.cnf,344,344,Template,T,9,19,F,F,Any,,,Secret,2.31,10,F,F,F,Password:Secret
27425,23d50951,GitHub,2ba83c6a,data/2ba83c6a/src/23d50951.cnf,358,358,T,T,9,23,F,F,Any,,,Secret,2.52,14,F,F,F,Password:Secret
27407,6cfa362d,GitHub,2ba83c6a,data/2ba83c6a/src/6cfa362d.cnf,344,344,T,T,9,19,F,F,Any,,,Secret,2.31,10,F,F,F,Secret
27425,23d50951,GitHub,2ba83c6a,data/2ba83c6a/src/23d50951.cnf,358,358,F,T,9,23,F,F,Any,,,Secret,2.52,14,F,F,F,Password:Secret
27489,a3cf7bc8,GitHub,2ba83c6a,data/2ba83c6a/test/a3cf7bc8.txt,298,298,F,F,6,24,F,F,,,,,0.0,0,F,F,F,Key
27490,a3cf7bc8,GitHub,2ba83c6a,data/2ba83c6a/test/a3cf7bc8.txt,282,282,F,F,6,14,F,F,,,,,0.0,0,F,F,F,Key
27491,b40503ed,GitHub,2ba83c6a,data/2ba83c6a/test/b40503ed.txt,154,154,F,F,6,11,F,F,,,,,0.0,0,F,F,F,Key
Expand Down
3 changes: 1 addition & 2 deletions meta/60f9915d.csv
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ Id,FileID,Domain,RepoName,FilePath,LineStart,LineEnd,GroundTruth,WithWords,Value
32158,a414e92e,GitHub,60f9915d,data/60f9915d/test/a414e92e.go,300,300,T,T,67,74,T,F,CharsOnly,,,Secret,2.81,7,F,F,F,URL Credentials
32159,7e47b56e,GitHub,60f9915d,data/60f9915d/test/7e47b56e.go,14,14,T,T,79,86,T,F,CharsOnly,,,Secret,2.81,7,F,F,F,URL Credentials
32162,4dc56e64,GitHub,60f9915d,data/60f9915d/test/4dc56e64.go,206,206,T,T,49,56,T,F,CharsOnly,,,Secret,2.81,7,F,F,F,URL Credentials
32163,4dc56e64,GitHub,60f9915d,data/60f9915d/test/4dc56e64.go,219,219,T,T,54,70,T,F,Any,,,Secret,3.25,16,F,F,F,URL Credentials
32163,4dc56e64,GitHub,60f9915d,data/60f9915d/test/4dc56e64.go,219,219,F,T,,,T,F,Any,,,Secret,3.25,16,F,F,F,URL Credentials:Password
35875,e45e45ba,GitHub,60f9915d,data/60f9915d/src/e45e45ba.yml,48,48,T,T,25,32,F,F,CharsOnly,,,Secret,2.81,7,F,F,F,Password
46090,35a99f76,GitHub,60f9915d,data/60f9915d/test/35a99f76.go,79,79,Template,T,18,24,F,F,CharsOnly,,,Secret,2.25,6,F,F,F,Password
52007,3d9a9f38,GitHub,60f9915d,data/60f9915d/test/3d9a9f38.go,639,639,T,F,53,60,F,F,,,,,0.0,0,F,F,F,URL Credentials
Expand Down Expand Up @@ -122,4 +122,3 @@ Id,FileID,Domain,RepoName,FilePath,LineStart,LineEnd,GroundTruth,WithWords,Value
137948,4dc56e64,GitHub,60f9915d,data/60f9915d/test/4dc56e64.go,161,161,T,F,86,93,F,F,,,,,0.0,0,F,F,F,Password:URL Credentials
138239,4dc56e64,GitHub,60f9915d,data/60f9915d/test/4dc56e64.go,161,161,F,F,65,-1,F,F,,,,,0.0,0,F,F,F,Other
138240,4dc56e64,GitHub,60f9915d,data/60f9915d/test/4dc56e64.go,161,161,F,F,104,-1,F,F,,,,,0.0,0,F,F,F,Other
138241,4dc56e64,GitHub,60f9915d,data/60f9915d/test/4dc56e64.go,219,219,F,F,81,85,F,F,,,,,0.0,0,F,F,F,Password
6 changes: 6 additions & 0 deletions meta/69d49010.csv
Original file line number Diff line number Diff line change
Expand Up @@ -163,3 +163,9 @@ Id,FileID,Domain,RepoName,FilePath,LineStart,LineEnd,GroundTruth,WithWords,Value
133411,24cbbb32,GitHub,69d49010,data/69d49010/test/24cbbb32.py,680,680,F,F,,,F,F,,,,,0,0,F,F,F,Secret
135283,85d52436,GitHub,69d49010,data/69d49010/test/85d52436.py,279,279,T,F,20,33,F,F,,,,,0.0,0,F,F,F,Password:URL Credentials
1341461,5aad918a,GitHub,69d49010,data/69d49010/test/5aad918a.py,438,438,T,F,17,53,F,F,,,,,0.0,0,F,F,F,UUID:Token
1479347,5aad918a,GitHub,69d49010,data/69d49010/test/5aad918a.py,258,258,T,F,35,48,F,F,,,,,0.0,0,F,F,F,Auth
1479349,5aad918a,GitHub,69d49010,data/69d49010/test/5aad918a.py,277,277,T,F,35,48,F,F,,,,,0.0,0,F,F,F,Auth
1479352,5aad918a,GitHub,69d49010,data/69d49010/test/5aad918a.py,303,303,T,F,35,48,F,F,,,,,0.0,0,F,F,F,Auth
1479354,5aad918a,GitHub,69d49010,data/69d49010/test/5aad918a.py,329,329,T,F,35,48,F,F,,,,,0.0,0,F,F,F,Auth
1479356,5aad918a,GitHub,69d49010,data/69d49010/test/5aad918a.py,356,356,T,F,35,48,F,F,,,,,0.0,0,F,F,F,Auth
1479361,5aad918a,GitHub,69d49010,data/69d49010/test/5aad918a.py,433,433,T,F,40,51,F,F,,,,,0.0,0,F,F,F,Auth
1 change: 1 addition & 0 deletions meta/75e7c64d.csv
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,4 @@ Id,FileID,Domain,RepoName,FilePath,LineStart,LineEnd,GroundTruth,WithWords,Value
114451,cb047af3,GitHub,75e7c64d,data/75e7c64d/src/cb047af3.yml,39,39,F,F,-1,-1,F,F,,,,,0.0,-1,F,F,F,Key
114452,2a39f30f,GitHub,75e7c64d,data/75e7c64d/src/2a39f30f.yml,37,37,F,F,-1,-1,F,F,,,,,0.0,-1,F,F,F,Key
133463,841e3aef,GitHub,75e7c64d,data/75e7c64d/src/841e3aef.py,329,329,F,F,-1,-1,F,F,,,,,0.0,0,F,F,F,Key
1479362,0f6303ab,GitHub,75e7c64d,data/75e7c64d/src/0f6303ab.py,101,101,F,F,-1,-1,F,F,,,,,0.0,0,F,F,F,Key
1 change: 1 addition & 0 deletions meta/8cda00f3.csv
Original file line number Diff line number Diff line change
Expand Up @@ -913,3 +913,4 @@ Id,FileID,Domain,RepoName,FilePath,LineStart,LineEnd,GroundTruth,WithWords,Value
135313,eaf18c55,GitHub,8cda00f3,data/8cda00f3/other/eaf18c55.md,209,209,T,F,21,37,F,F,,,,,0,0,F,F,F,Auth
135314,eaf18c55,GitHub,8cda00f3,data/8cda00f3/other/eaf18c55.md,300,300,T,F,21,37,F,F,,,,,0,0,F,F,F,Auth
1340770,4cf2897e,GitHub,8cda00f3,data/8cda00f3/src/4cf2897e.go,16,16,T,F,22,58,F,F,,,,,0.0,0,F,F,F,UUID
1479363,0a7921b3,GitHub,8cda00f3,data/8cda00f3/test/0a7921b3.go,155,155,F,F,,,F,F,,,,,0.0,0,F,F,F,Auth
3 changes: 2 additions & 1 deletion meta_cred.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ def __init__(self, cs_cred: dict):
if not self.path.startswith('data/'):
# license files ...
self.path = '/'.join([str(x) for x in path.parts[-3:]])
assert self.path.startswith('data/'), cs_cred # path for benchmark must start from data/
# path for benchmark must start from "data/"
assert self.path.startswith('data/'), cs_cred
self.valid_path = bool(self.valid_path_regex.match(self.path)) # to skip license files

self.line_start = line_data_list[0]["line_num"]
Expand Down
159 changes: 0 additions & 159 deletions update_meta.py

This file was deleted.

0 comments on commit 6ecaa4d

Please sign in to comment.