35
35
import sys
36
36
import numpy as np
37
37
import time
38
+ import statsmodels .api as sm
38
39
39
40
from random import SystemRandom
40
41
from sklearn import linear_model
@@ -401,9 +402,10 @@ def getDomains(numDomains):
401
402
domains .append (line .rstrip ())
402
403
return domains [0 :numDomains ]
403
404
404
- def getBloomFilters (domains , numCohorts , numHashes , numBloomBits ):
405
+ def getBloomFilters (domains , cohorts , numHashes , numBloomBits ):
406
+ """cohorts is a list of the cohorts that were reported this time"""
405
407
blooms = {}
406
- for m in range ( 0 , numCohorts ) :
408
+ for m in cohorts :
407
409
bloomsForCohort = []
408
410
for d in domains :
409
411
bits = get_bloom_bits (d , m , numHashes , numBloomBits )
@@ -414,43 +416,51 @@ def getBloomFilters(domains, numCohorts, numHashes, numBloomBits):
414
416
#print(blooms)
415
417
return blooms
416
418
417
- def makeDesignMatrix (params , numDomains ):
419
+ def makeDesignMatrix (params , cohorts , numDomains ):
420
+ """cohorts is a list of the cohorts that were reported this time"""
418
421
k = params .num_bloombits #number of bits in Bloom filter
419
- m = params . num_cohorts
422
+ m = len ( cohorts )
420
423
#M is number of candidate strings
421
424
#h is the number of hash functions per cohort
422
425
domains = getDomains (numDomains )
423
- blooms = getBloomFilters (domains , params . num_cohorts , params .num_hashes , params .num_bloombits )
426
+ blooms = getBloomFilters (domains , cohorts , params .num_hashes , params .num_bloombits )
424
427
425
428
X = np .zeros ([k * m , numDomains ])
426
429
427
430
for cohort in blooms :
428
431
#print("Cohort: ", cohort)
429
432
domainColumn = 0
433
+ rowChunk = 0
430
434
for domain in blooms [cohort ]:
431
435
#domain is a list of the bits that need to be set
432
436
for bitToSet in domain :
433
- X [cohort * k + bitToSet , domainColumn ] = 1
437
+ X [rowChunk * k + bitToSet , domainColumn ] = 1
434
438
domainColumn += 1
439
+ rowChunk += 1
435
440
#print(X)
436
441
return X , domains
437
442
438
443
def doLassoRegression (params , numDomains , reports ):
439
- X , domains = makeDesignMatrix (params , numDomains )
444
+ #reports is expected to be a dictionary whose keys are cohort numbers and whose values are the lists of reports from those cohorts
445
+ #This needs to be rethought: every time we run this code, we are only using one cohort
446
+ #The client side needs to be redesigned to send its cohort number
447
+ #So the server needs to send the cohort number when the client first requests the blocked list
448
+ X , domains = makeDesignMatrix (params , reports .keys (), numDomains )
440
449
Y_list = []
441
- for i in range ( 0 , params . num_cohorts ) :
442
- Y_j = estimateSetBits (reports [i ], params )
450
+ for key in reports :
451
+ Y_j = estimateSetBits (reports [key ], params )
443
452
Y_list .append (Y_j )
444
453
Y = np .array (Y_list )
445
454
Y = Y .flatten ()
446
455
456
+ print ("********X_shape: " , X .shape , ", y-shape: " , Y .shape )
447
457
linreg = linear_model .LassoCV (n_alphas = 10 , cv = 10 )
448
458
linreg .fit (X ,Y )
449
459
print ("Coefficients: " , linreg .coef_ )
450
460
print ("Alpha: " , linreg .alpha_ )
451
461
return X , Y , linreg , domains
452
462
453
- def doLinearRegression (X , Y ):
463
+ def doLinearRegression (X , Y , M ):
454
464
X = sm .add_constant (X )
455
465
ols = sm .OLS (Y ,X )
456
466
results = ols .fit ()
@@ -482,19 +492,24 @@ def readRapporReports(filename):
482
492
reportFile .close ()
483
493
return reps
484
494
485
- def analyzeReports (filename , params , numDomains ):
486
- reps = readRapporReports ()
495
+ def analyzeReports (reps , params , numDomains ):
487
496
lassX , lassY , lassoReg , domains = doLassoRegression (params , numDomains , reps )
488
497
relevantDomains = np .where (lassoReg .coef_ != 0 )[0 ]
489
498
linX = lassX [:,relevantDomains ]
499
+ print ("domains: " , domains , ", relevantDomains: " , relevantDomains )
490
500
#Keep track of the domains the LASSO selected
491
- linDoms = domains [relevantDomains ]
492
- #Confirm that the domains LASSO selected are relevant using linear regression
493
- #This may prune out even more domains than LASSO did
494
- finalRelevantIdxs = doLinearRegression (linX , lassY )
495
- return linDoms [finalRelevantIdxs ]
501
+ if len (relevantDomains ) != 0 :
502
+ doms = np .array (domains )
503
+ rels = np .array (relevantDomains )
504
+ linDoms = doms [rels ]
505
+ #Confirm that the domains LASSO selected are relevant using linear regression
506
+ #This may prune out even more domains than LASSO did
507
+ finalRelevantIdxs = doLinearRegression (linX , lassY , numDomains )
508
+ return linDoms [finalRelevantIdxs ]
509
+ else :
510
+ return []
496
511
497
- def main ():
512
+ """ def main():
498
513
rapporRepsFile = "rappor_reports.txt"
499
514
numDomains = 10
500
515
params = Params(prob_f=0.2)
@@ -517,7 +532,7 @@ def main():
517
532
mbl.write(d+"\r \n ")
518
533
519
534
520
-
535
+ """
521
536
522
537
523
538
0 commit comments