Skip to content

ML experiment maintenance #1708

ML experiment maintenance

ML experiment maintenance #1708

Workflow file for this run

# This workflow runs the benchmark.
# Splitting it into separate jobs keeps the data cached even if the benchmark fails.
name: Benchmark
# Triggered by pushes to main and by pull requests against main.
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
jobs:
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
download_data:
  # Fills the `data` cache that the jobs depending on this one restore.
  runs-on: ubuntu-latest
  steps:
    - name: Checkout CredData
      uses: actions/checkout@v3
      with:
        repository: Samsung/CredData
    # The key is derived from snapshot.yaml; every step guarded by the
    # cache-hit check below is skipped when a matching entry is restored.
    - name: Cache data
      id: cache-data
      uses: actions/cache@v3
      with:
        path: data
        key: cred-data-${{ hashFiles('snapshot.yaml') }}
    - name: Set up Python 3.8
      if: steps.cache-data.outputs.cache-hit != 'true'
      uses: actions/setup-python@v3
      with:
        python-version: "3.8"
    - name: Update PIP
      # Guarded like the other Python steps: pip is only used when the data
      # has to be regenerated.
      if: steps.cache-data.outputs.cache-hit != 'true'
      run: python -m pip install --upgrade pip
    - name: Install requirements of CredData
      if: steps.cache-data.outputs.cache-hit != 'true'
      run: python -m pip install --requirement requirements.txt
    - name: Generate Data Asset
      if: steps.cache-data.outputs.cache-hit != 'true'
      run: python download_data.py --data_dir data
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
run_benchmark:
  # Runs the CredData benchmark with the credsweeper scanner and compares
  # the summary against cicd/benchmark.txt from the CredSweeper checkout.
  needs: [download_data]
  runs-on: ubuntu-latest
  steps:
    - name: Checkout CredData
      uses: actions/checkout@v3
      with:
        repository: Samsung/CredData
    - name: Cache data
      id: cache-data
      uses: actions/cache@v3
      with:
        path: data
        key: cred-data-${{ hashFiles('snapshot.yaml') }}
    # The data set is expected to come from the cache; stop early otherwise.
    - name: Failure in case when cache missed
      if: steps.cache-data.outputs.cache-hit != 'true'
      run: exit 1
    - name: Check Data Asset - DEBUG
      if: steps.cache-data.outputs.cache-hit == 'true'
      run: ls -al . && ls -al data
    - name: Set up Python 3.8
      uses: actions/setup-python@v3
      with:
        python-version: "3.8"
    - name: Update PIP
      run: python -m pip install --upgrade pip
    - name: Install requirements of CredData
      run: python -m pip install --requirement requirements.txt
    - name: Fix onnxruntime lib for released version 1.5.5 - todo remove it after new release
      run: python -m pip install onnxruntime==1.15.1
    # Pull requests: check out the PR head into temp/CredSweeper.
    - name: Checkout CredSweeper
      if: ${{ 'pull_request' == github.event_name }}
      uses: actions/checkout@v3
      with:
        ref: ${{ github.event.pull_request.head.sha }}
        path: temp/CredSweeper
    # Replaces the CredSweeper repository URL in the benchmark constants with
    # a dummy:// one and verifies the substitution with grep - presumably so
    # the benchmark uses the checkout above instead of cloning; confirm
    # against the benchmark sources.
    - name: Patch benchmark for PR work
      if: ${{ 'pull_request' == github.event_name }}
      run: |
        sed -i 's|CREDSWEEPER = "https://github.com/Samsung/CredSweeper.git"|CREDSWEEPER = "dummy://github.com/Samsung/CredSweeper.git"|' benchmark/common/constants.py
        grep --with-filename --line-number 'dummy://github.com/Samsung/CredSweeper.git' benchmark/common/constants.py
    - name: Run Benchmark
      # Without pipefail the exit status of the pipeline is the one of `tee`,
      # which would hide a failed benchmark run.
      run: |
        set -o pipefail
        python -m benchmark --scanner credsweeper | tee credsweeper.log
    # Keeps the first 12 and last 14 log lines (minus the elapsed-time line)
    # as benchmark.txt and copies the scanner output as report.json.
    - name: Get only results
      run: |
        head -n 12 credsweeper.log | tee benchmark.txt
        tail -n 14 credsweeper.log | grep -v 'Time Elapsed:' | tee -a benchmark.txt
        cp -vf ./temp/CredSweeper/output.json report.json
    - name: Upload artifact
      if: always()
      uses: actions/upload-artifact@v3
      with:
        name: report
        path: report.json
    - name: Upload artifact
      if: always()
      uses: actions/upload-artifact@v3
      with:
        name: benchmark
        path: benchmark.txt
    - name: Verify benchmark scores of the PR
      if: ${{ 'pull_request' == github.event_name }}
      # update cicd/benchmark.txt with uploaded artifact if a difference is found
      run: |
        diff temp/CredSweeper/cicd/benchmark.txt benchmark.txt
    # NOTE(review): on push events there is no pull_request payload, so the
    # `ref` expression below should evaluate to an empty string and checkout
    # should fall back to the commit that triggered the run - confirm.
    - name: Checkout CredSweeper on push event
      if: ${{ 'pull_request' != github.event_name }}
      uses: actions/checkout@v3
      with:
        ref: ${{ github.event.pull_request.head.sha }}
        path: CredSweeper
    - name: Verify benchmark scores on push event
      if: ${{ 'pull_request' != github.event_name }}
      # update cicd/benchmark.txt with uploaded artifact if a difference is found
      run: |
        diff CredSweeper/cicd/benchmark.txt benchmark.txt
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
experiment:
  # the ml train test is placed here to use cached data set
  needs: [download_data]
  runs-on: ubuntu-latest
  steps:
    - name: Checkout CredData
      uses: actions/checkout@v3
      with:
        repository: Samsung/CredData
    - name: Cache data
      id: cache-data
      uses: actions/cache@v3
      with:
        path: data
        key: cred-data-${{ hashFiles('snapshot.yaml') }}
    # The data set is expected to come from the cache; stop early otherwise.
    - name: Failure in case when cache missed
      if: steps.cache-data.outputs.cache-hit != 'true'
      run: exit 1
    - name: Exclude some sets and place to CredData dir
      # keep only the b* and c* sets so experiment/src/split.json is easy to adjust
      if: steps.cache-data.outputs.cache-hit == 'true'
      run: |
        rm -rf data/0* data/1* data/2* data/3* data/4* data/5* data/6* data/7* data/8* data/9* data/a* data/d* data/e* data/f*
        rm -rf meta/0* meta/1* meta/2* meta/3* meta/4* meta/5* meta/6* meta/7* meta/8* meta/9* meta/a* meta/d* meta/e* meta/f*
        mkdir -vp ${{ github.workspace }}/CredData
        mv data ${{ github.workspace }}/CredData/
        mv meta ${{ github.workspace }}/CredData/
    # No cache-miss guard here: the job has already failed above when the
    # cache is missed, so such a guard would mean Python 3.8 is never set up.
    - name: Set up Python 3.8
      uses: actions/setup-python@v3
      with:
        python-version: "3.8"
    - name: Update PIP
      run: python -m pip install --upgrade pip
    # NOTE(review): on push events the `ref` expression should be empty and
    # checkout should fall back to the triggering commit - confirm.
    - name: Checkout current CredSweeper
      uses: actions/checkout@v3
      with:
        ref: ${{ github.event.pull_request.head.sha }}
        path: CredSweeper.head
    - name: Install development packages
      run: python -m pip install --requirement CredSweeper.head/requirements.txt
    - name: Install experimental packages
      # some versions will be changed for compatibility
      run: python -m pip install --requirement CredSweeper.head/experiment/requirements.txt
    - name: dbg
      run: echo ${{ github.workspace }} && ls -al ${{ github.workspace }} && tree ${{ github.workspace }}
    # Backs up split.json and rewrites it without the lines matched by the
    # grep pattern below; both versions are printed to the log.
    - name: Lighten split.json
      run: |
        mv -vf ${{ github.workspace }}/CredSweeper.head/experiment/src/split.json ${{ github.workspace }}/CredSweeper.head/experiment/src/split.json.bak
        cat ${{ github.workspace }}/CredSweeper.head/experiment/src/split.json.bak
        grep -v '"[0-9ad-f][0-9a-f]\+' ${{ github.workspace }}/CredSweeper.head/experiment/src/split.json.bak >${{ github.workspace }}/CredSweeper.head/experiment/src/split.json
        cat ${{ github.workspace }}/CredSweeper.head/experiment/src/split.json
    # Runs the experiment from the checked-out sources (added to PYTHONPATH)
    # against the reduced data set, with twice as many jobs as CPUs reported
    # by nproc.
    - name: Do the experiment
      run: |
        cd CredSweeper.head
        ls -al #dbg
        pwd #dbg
        export PYTHONPATH=$(pwd):${PYTHONPATH}
        cd experiment
        python -m credsweeper --banner #dbg
        python main.py --data ${{ github.workspace }}/CredData -j $(( 2 * $(nproc) ))
        ls -al results
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #