From f8475d88d3170de4553ee005a9d55300a9698081 Mon Sep 17 00:00:00 2001 From: Leo-Simpson Date: Mon, 3 May 2021 13:28:31 +0200 Subject: [PATCH] black the code --- benchmark/C1/bm-C1-plot.py | 15 +- benchmark/C1/bm-C1.py | 46 +-- benchmark/R1/bm-R1-plot.py | 15 +- benchmark/R1/bm-R1.py | 59 ++-- benchmark/R2/bm-R2-plot.py | 16 +- benchmark/R2/bm-R2.py | 72 ++-- classo/__init__.py | 9 +- classo/_version.py | 154 +++++---- classo/alo.py | 71 ++-- classo/compact_func.py | 107 +++--- classo/cross_validation.py | 74 +++-- classo/misc_functions.py | 61 ++-- classo/path_alg.py | 57 ++-- classo/solve_R1.py | 17 +- classo/solve_R2.py | 27 +- classo/solve_R3.py | 16 +- classo/solve_R4.py | 19 +- classo/solver.py | 314 +++++++++--------- classo/stability_selection.py | 93 +++--- classo/tests/test_Classo_R.py | 64 ++-- classo/tests/test_compact_func.py | 60 ++-- classo/tests/test_cross_validation.py | 41 ++- classo/tests/test_misc_func.py | 162 +++++---- classo/tests/test_solver.py | 112 ++++--- classo/tests/test_stability_selection.py | 165 ++++----- .../auto_examples/plot_CentralParkSoil.py | 62 ++-- .../source/auto_examples/plot_Tara_example.py | 59 ++-- .../auto_examples/plot_advanced_example.py | 40 +-- .../auto_examples/plot_basic_example.py | 24 +- .../auto_examples/plot_combo_example.py | 30 +- docs/source/conf.py | 62 ++-- examples/example_COMBO.py | 7 +- examples/example_gneiss.py | 19 +- examples/plot_CentralParkSoil.py | 62 ++-- examples/plot_Tara_example.py | 59 ++-- examples/plot_advanced_example.py | 46 +-- examples/plot_basic_example.py | 24 +- examples/plot_combo_example.py | 35 +- setup.py | 79 +++-- versioneer.py | 278 ++++++++++------ 40 files changed, 1453 insertions(+), 1279 deletions(-) diff --git a/benchmark/C1/bm-C1-plot.py b/benchmark/C1/bm-C1-plot.py index 8fd2cf2..6f60fd9 100644 --- a/benchmark/C1/bm-C1-plot.py +++ b/benchmark/C1/bm-C1-plot.py @@ -2,12 +2,13 @@ import matplotlib.pyplot as plt import os + my_path = os.path.dirname(__file__) -output = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'output')) +output = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) -data = np.load(os.path.join(my_path, 'bm-C1.npz')) +data = np.load(os.path.join(my_path, "bm-C1.npz")) ticks = [str(tuple(size)) for size in data["SIZES"]] @@ -19,7 +20,7 @@ T = data[name] N_size, N_data = T.shape mean = np.mean(T, axis=1) - std = np.std(T, axis=1)/np.sqrt(N_data) + std = np.std(T, axis=1) / np.sqrt(N_data) label = algo[i] plt.errorbar(range(N_size), mean, yerr=std, label=label) @@ -33,13 +34,13 @@ plt.savefig(os.path.join(output, "bm-C1-times.png")) plt.close() -ref = np.min( [np.mean(data[name], axis=1) for name in ["L_pa", "L_cvx"] ], axis=0) +ref = np.min([np.mean(data[name], axis=1) for name in ["L_pa", "L_cvx"]], axis=0) for i, name in enumerate(["L_pa", "L_cvx"]): L = data[name] N_size, N_data = L.shape mean = np.mean(L, axis=1) label = algo[i] - plt.plot(range(N_size), mean-ref, label=label) + plt.plot(range(N_size), mean - ref, label=label) plt.title("Loss - minimum(loss)") plt.xticks(range(N_size), ticks) @@ -54,7 +55,7 @@ C = data[name] N_size, N_data = C.shape mean = np.mean(C, axis=1) - std = np.std(C, axis=1)/np.sqrt(N_data) + std = np.std(C, axis=1) / np.sqrt(N_data) label = algo[i] plt.plot(range(N_size), mean, label=label) @@ -65,4 +66,4 @@ plt.legend() plt.tight_layout() plt.savefig(os.path.join(output, "bm-C1-constraint.png")) -plt.close() \ No newline at end of file +plt.close() diff --git a/benchmark/C1/bm-C1.py b/benchmark/C1/bm-C1.py index dc25af1..d39c8d9 100644 --- a/benchmark/C1/bm-C1.py +++ b/benchmark/C1/bm-C1.py @@ -5,14 +5,14 @@ from time import time import os -my_path = os.path.dirname(__file__) -l = [1, 2, 5, 7] +my_path = os.path.dirname(__file__) +l = [1, 2, 5, 7] def loss(X, y, lamb, beta): - return np.sum(np.maximum(1 - y*X.dot(beta), 0)**2) + lamb*np.sum(abs(beta)) + return np.sum(np.maximum(1 - y * X.dot(beta), 0) ** 2) + lamb * np.sum(abs(beta)) d_nonzero = 5 @@ -27,8 +27,8 @@ def loss(X, y, lamb, beta): SIZES = [] for i in range(len(S)): SIZES.append((S[i], S[i])) - if i+1= rho - h = r**2 - h[F] = 2*rho*abs(r)[F] - rho**2 + h = r ** 2 + h[F] = 2 * rho * abs(r)[F] - rho ** 2 return h - - + def loss(X, y, lamb, rho, beta): r = X.dot(beta) - y - return np.sum(huber(r,rho)) + lamb*np.sum(abs(beta)) + return np.sum(huber(r, rho)) + lamb * np.sum(abs(beta)) d_nonzero = 5 @@ -34,8 +35,8 @@ def loss(X, y, lamb, rho, beta): SIZES = [] for i in range(len(S)): SIZES.append((S[i], S[i])) - if i+1= rho - lamb = lam*2*LA.norm(X[F].T.dot(y[F]),np.infty) + rho = 1.345 * np.sqrt(np.mean(y ** 2)) + F = abs(y) >= rho + lamb = lam * 2 * LA.norm(X[F].T.dot(y[F]), np.infty) t0 = time() # classo Path-Alg @@ -80,7 +79,7 @@ def loss(X, y, lamb, rho, beta): problem.model_selection.LAMfixed = True problem.model_selection.LAMfixedparameters.rescaled_lam = False problem.model_selection.LAMfixedparameters.lam = lamb - problem.model_selection.LAMfixedparameters.numerical_method = 'Path-Alg' + problem.model_selection.LAMfixedparameters.numerical_method = "Path-Alg" problem.solve() b_pa.append(problem.solution.LAMfixed.beta) b_pa = np.array(b_pa) @@ -98,7 +97,7 @@ def loss(X, y, lamb, rho, beta): problem.model_selection.LAMfixed = True problem.model_selection.LAMfixedparameters.rescaled_lam = False problem.model_selection.LAMfixedparameters.lam = lamb - problem.model_selection.LAMfixedparameters.numerical_method = 'P-PDS' + problem.model_selection.LAMfixedparameters.numerical_method = "P-PDS" problem.solve() b_pds.append(problem.solution.LAMfixed.beta) b_pds = np.array(b_pds) @@ -116,7 +115,7 @@ def loss(X, y, lamb, rho, beta): problem.model_selection.LAMfixed = True problem.model_selection.LAMfixedparameters.rescaled_lam = False problem.model_selection.LAMfixedparameters.lam = lamb - problem.model_selection.LAMfixedparameters.numerical_method = 'P-PDS' + problem.model_selection.LAMfixedparameters.numerical_method = "P-PDS" problem.solve() b_dr.append(problem.solution.LAMfixed.beta) b_dr = np.array(b_dr) @@ -126,15 +125,16 @@ def loss(X, y, lamb, rho, beta): b_cvx = [] for j in range(N_per_data): beta = cp.Variable(d) - objective = cp.Minimize(cp.sum(cp.huber(X@beta-y, rho))+ lamb*cp.norm(beta, 1)) - constraints = [C@beta == 0] + objective = cp.Minimize( + cp.sum(cp.huber(X @ beta - y, rho)) + lamb * cp.norm(beta, 1) + ) + constraints = [C @ beta == 0] prob = cp.Problem(objective, constraints) - result = prob.solve(warm_start = False, eps_abs = 1e-5) + result = prob.solve(warm_start=False, eps_abs=1e-5) b_cvx.append(beta.value) b_cvx = np.array(b_cvx) - - t4 = time() + t4 = time() T_pa[s, i] = (t1 - t0) / N_per_data L_pa[s, i] = loss(X, y, lamb, rho, np.mean(b_pa, axis=0)) @@ -148,23 +148,23 @@ def loss(X, y, lamb, rho, beta): L_dr[s, i] = loss(X, y, lamb, rho, np.mean(b_dr, axis=0)) C_dr[s, i] = np.linalg.norm(C.dot(np.mean(b_pds, axis=0))) - T_cvx[s, i] = (t4 - t3) / N_per_data + T_cvx[s, i] = (t4 - t3) / N_per_data L_cvx[s, i] = loss(X, y, lamb, rho, np.mean(b_cvx, axis=0)) C_cvx[s, i] = np.linalg.norm(C.dot(np.mean(b_cvx, axis=0))) np.savez( - os.path.join(my_path, 'bm-R2.npz'), - T_pa = T_pa, - L_pa = L_pa, - C_pa = C_pa, - T_pds = T_pds, - L_pds = L_pds, - C_pds = C_pds, - T_dr = T_dr, - L_dr = L_dr, - C_dr = C_dr, - T_cvx = T_cvx, - L_cvx = L_cvx, - C_cvx = C_cvx, - SIZES = np.array(SIZES) + os.path.join(my_path, "bm-R2.npz"), + T_pa=T_pa, + L_pa=L_pa, + C_pa=C_pa, + T_pds=T_pds, + L_pds=L_pds, + C_pds=C_pds, + T_dr=T_dr, + L_dr=L_dr, + C_dr=C_dr, + T_cvx=T_cvx, + L_cvx=L_cvx, + C_cvx=C_cvx, + SIZES=np.array(SIZES), ) diff --git a/classo/__init__.py b/classo/__init__.py index d580c08..569aefe 100644 --- a/classo/__init__.py +++ b/classo/__init__.py @@ -1,8 +1,4 @@ -from .misc_functions import ( - random_data, - clr, - theoretical_lam -) # , tree_to_matrix +from .misc_functions import random_data, clr, theoretical_lam # , tree_to_matrix from .compact_func import Classo, pathlasso from .cross_validation import CV from .stability_selection import stability @@ -24,5 +20,6 @@ ) from ._version import get_versions -__version__ = get_versions()['version'] + +__version__ = get_versions()["version"] del get_versions diff --git a/classo/_version.py b/classo/_version.py index 4f725b5..1f786a2 100644 --- a/classo/_version.py +++ b/classo/_version.py @@ -1,4 +1,3 @@ - # This file helps to compute a version number in source trees obtained from # git-archive tarball (such as those provided by githubs download-from-tag # feature). Distribution tarballs (built by setup.py sdist) and build @@ -58,17 +57,18 @@ class NotThisMethod(Exception): def register_vcs_handler(vcs, method): # decorator """Create decorator to mark a method as the handler of a VCS.""" + def decorate(f): """Store f in HANDLERS[vcs][method].""" if vcs not in HANDLERS: HANDLERS[vcs] = {} HANDLERS[vcs][method] = f return f + return decorate -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, - env=None): +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): """Call the given command(s).""" assert isinstance(commands, list) p = None @@ -76,10 +76,13 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, try: dispcmd = str([c] + args) # remember shell=False, so use git.cmd on windows, not just git - p = subprocess.Popen([c] + args, cwd=cwd, env=env, - stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr - else None)) + p = subprocess.Popen( + [c] + args, + cwd=cwd, + env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr else None), + ) break except EnvironmentError: e = sys.exc_info()[1] @@ -114,16 +117,22 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): for i in range(3): dirname = os.path.basename(root) if dirname.startswith(parentdir_prefix): - return {"version": dirname[len(parentdir_prefix):], - "full-revisionid": None, - "dirty": False, "error": None, "date": None} + return { + "version": dirname[len(parentdir_prefix) :], + "full-revisionid": None, + "dirty": False, + "error": None, + "date": None, + } else: rootdirs.append(root) root = os.path.dirname(root) # up a level if verbose: - print("Tried directories %s but none started with prefix %s" % - (str(rootdirs), parentdir_prefix)) + print( + "Tried directories %s but none started with prefix %s" + % (str(rootdirs), parentdir_prefix) + ) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") @@ -183,7 +192,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " - tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + tags = set([r[len(TAG) :] for r in refs if r.startswith(TAG)]) if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %d @@ -192,7 +201,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". - tags = set([r for r in refs if re.search(r'\d', r)]) + tags = set([r for r in refs if re.search(r"\d", r)]) if verbose: print("discarding '%s', no digits" % ",".join(refs - tags)) if verbose: @@ -200,19 +209,26 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): for ref in sorted(tags): # sorting will prefer e.g. "2.0" over "2.0rc1" if ref.startswith(tag_prefix): - r = ref[len(tag_prefix):] + r = ref[len(tag_prefix) :] if verbose: print("picking %s" % r) - return {"version": r, - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": None, - "date": date} + return { + "version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": None, + "date": date, + } # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") - return {"version": "0+unknown", - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": "no suitable tags", "date": None} + return { + "version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": "no suitable tags", + "date": None, + } @register_vcs_handler("git", "pieces_from_vcs") @@ -227,8 +243,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] - out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, - hide_stderr=True) + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) if rc != 0: if verbose: print("Directory %s not under git control" % root) @@ -236,10 +251,19 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", - "--always", "--long", - "--match", "%s*" % tag_prefix], - cwd=root) + describe_out, rc = run_command( + GITS, + [ + "describe", + "--tags", + "--dirty", + "--always", + "--long", + "--match", + "%s*" % tag_prefix, + ], + cwd=root, + ) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") @@ -262,17 +286,16 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: - git_describe = git_describe[:git_describe.rindex("-dirty")] + git_describe = git_describe[: git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX - mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) if not mo: # unparseable. Maybe git-describe is misbehaving? - pieces["error"] = ("unable to parse git-describe output: '%s'" - % describe_out) + pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out return pieces # tag @@ -281,10 +304,12 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if verbose: fmt = "tag '%s' doesn't start with prefix '%s'" print(fmt % (full_tag, tag_prefix)) - pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" - % (full_tag, tag_prefix)) + pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( + full_tag, + tag_prefix, + ) return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix):] + pieces["closest-tag"] = full_tag[len(tag_prefix) :] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) @@ -295,13 +320,13 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): else: # HEX: no tags pieces["closest-tag"] = None - count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], - cwd=root) + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) pieces["distance"] = int(count_out) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() - date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], - cwd=root)[0].strip() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[ + 0 + ].strip() # Use only the last line. Previous lines may contain GPG signature # information. date = date.splitlines()[-1] @@ -335,8 +360,7 @@ def render_pep440(pieces): rendered += ".dirty" else: # exception #1 - rendered = "0+untagged.%d.g%s" % (pieces["distance"], - pieces["short"]) + rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered @@ -450,11 +474,13 @@ def render_git_describe_long(pieces): def render(pieces, style): """Render the given version pieces into the requested style.""" if pieces["error"]: - return {"version": "unknown", - "full-revisionid": pieces.get("long"), - "dirty": None, - "error": pieces["error"], - "date": None} + return { + "version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None, + } if not style or style == "default": style = "pep440" # the default @@ -474,9 +500,13 @@ def render(pieces, style): else: raise ValueError("unknown style '%s'" % style) - return {"version": rendered, "full-revisionid": pieces["long"], - "dirty": pieces["dirty"], "error": None, - "date": pieces.get("date")} + return { + "version": rendered, + "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], + "error": None, + "date": pieces.get("date"), + } def get_versions(): @@ -490,8 +520,7 @@ def get_versions(): verbose = cfg.verbose try: - return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, - verbose) + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose) except NotThisMethod: pass @@ -500,13 +529,16 @@ def get_versions(): # versionfile_source is the relative path from the top of the source # tree (where the .git directory might live) to this file. Invert # this to find the root from __file__. - for i in cfg.versionfile_source.split('/'): + for i in cfg.versionfile_source.split("/"): root = os.path.dirname(root) except NameError: - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, - "error": "unable to find root of source tree", - "date": None} + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None, + } try: pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) @@ -520,6 +552,10 @@ def get_versions(): except NotThisMethod: pass - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, - "error": "unable to compute version", "date": None} + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", + "date": None, + } diff --git a/classo/alo.py b/classo/alo.py index d1d1890..fabe3d1 100644 --- a/classo/alo.py +++ b/classo/alo.py @@ -35,7 +35,7 @@ def generate_data(n, p, k, d, sigma=1, seed=None): """ rng = np.random.Generator(np.random.Philox(seed)) - X = rng.normal(scale = 1 / np.sqrt(k), size=(n, p)) + X = rng.normal(scale=1 / np.sqrt(k), size=(n, p)) C = rng.normal(size=(d, p)) beta_nz = np.ones(k) C_k = C[:, :k] @@ -111,18 +111,30 @@ def alo_cls_h(X: np.ndarray, C: np.ndarray) -> np.ndarray: relying extensively on the cholesky decomposition. """ K = X.T @ X - K_cho, _ = scipy.linalg.cho_factor(K, overwrite_a=True, lower=True, check_finite=False) - K_inv_2_C = scipy.linalg.solve_triangular(K_cho, C.T, lower=True, check_finite=False) - K_inv_2_Xt = scipy.linalg.solve_triangular(K_cho, X.T, lower=True, check_finite=False) + K_cho, _ = scipy.linalg.cho_factor( + K, overwrite_a=True, lower=True, check_finite=False + ) + K_inv_2_C = scipy.linalg.solve_triangular( + K_cho, C.T, lower=True, check_finite=False + ) + K_inv_2_Xt = scipy.linalg.solve_triangular( + K_cho, X.T, lower=True, check_finite=False + ) C_Ki_C = K_inv_2_C.T @ K_inv_2_C - CKC_cho, _ = scipy.linalg.cho_factor(C_Ki_C, overwrite_a=True, lower=True, check_finite=False) - F = scipy.linalg.solve_triangular(CKC_cho, K_inv_2_C.T, lower=True, check_finite=False) + CKC_cho, _ = scipy.linalg.cho_factor( + C_Ki_C, overwrite_a=True, lower=True, check_finite=False + ) + F = scipy.linalg.solve_triangular( + CKC_cho, K_inv_2_C.T, lower=True, check_finite=False + ) return (K_inv_2_Xt ** 2).sum(axis=0) - ((F @ K_inv_2_Xt) ** 2).sum(axis=0) -def alo_h(X: np.ndarray, beta: np.ndarray, y: np.ndarray, C: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: +def alo_h( + X: np.ndarray, beta: np.ndarray, y: np.ndarray, C: np.ndarray +) -> Tuple[np.ndarray, np.ndarray]: """Computes the ALO leverage and residual for the c-lasso. Due to its L1 structure, the ALO for the constrained lasso corresponds @@ -145,7 +157,7 @@ def alo_h(X: np.ndarray, beta: np.ndarray, y: np.ndarray, C: np.ndarray) -> Tupl Returns ------- alo_residual : np.ndarray - A numpy array of size [n] representing the estimated ALO residuals + A numpy array of size [n] representing the estimated ALO residuals h : np.ndarray A numpy array of size [n] representing the ALO leverage at each observation. """ @@ -161,8 +173,9 @@ def alo_h(X: np.ndarray, beta: np.ndarray, y: np.ndarray, C: np.ndarray) -> Tupl return (y - X @ beta) / (1 - h), h - -def alo_classo_risk(X: np.ndarray, C: np.ndarray, y: np.ndarray, betas: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: +def alo_classo_risk( + X: np.ndarray, C: np.ndarray, y: np.ndarray, betas: np.ndarray +) -> Tuple[np.ndarray, np.ndarray]: """Computes the ALO risk for the c-lasso at the given estimates. Parameters @@ -197,10 +210,9 @@ def alo_classo_risk(X: np.ndarray, C: np.ndarray, y: np.ndarray, betas: np.ndarr return mse, df - def solve_standard(X, C, y, lambdas=None): """Utility function to solve standard c-lasso formulation.""" - problem = problem_R1((X, C, y), 'Path-Alg') + problem = problem_R1((X, C, y), "Path-Alg") problem.tol = 1e-6 if lambdas is None: @@ -216,22 +228,25 @@ def solve_standard(X, C, y, lambdas=None): def solve_loo_i(X, C, y, i, lambdas): - X = np.concatenate((X[:i], X[i+1:])) - y = np.concatenate((y[:i], y[i+1:])) + X = np.concatenate((X[:i], X[i + 1 :])) + y = np.concatenate((y[:i], y[i + 1 :])) return solve_standard(X, C, y, lambdas) + def _solve_loo_i_beta(i, X, C, y, lambdas): return solve_loo_i(X, C, y, i, lambdas)[0] def _set_sequential_mkl(): import os + try: import mkl + mkl.set_num_threads(1) except ImportError: - os.environ['MKL_NUM_THREADS'] = '1' - os.environ['OMP_NUM_THREADS'] = '1' + os.environ["MKL_NUM_THREADS"] = "1" + os.environ["OMP_NUM_THREADS"] = "1" def solve_loo(X, C, y, progress=False): @@ -242,10 +257,13 @@ def solve_loo(X, C, y, progress=False): """ _, lambdas = solve_standard(X, C, y) - ctx = multiprocessing.get_context('spawn') + ctx = multiprocessing.get_context("spawn") with ctx.Pool(initializer=_set_sequential_mkl) as pool: - result = pool.imap(functools.partial(_solve_loo_i_beta, X=X, C=C, y=y, lambdas=lambdas), range(X.shape[0])) + result = pool.imap( + functools.partial(_solve_loo_i_beta, X=X, C=C, y=y, lambdas=lambdas), + range(X.shape[0]), + ) if progress: result = tqdm.tqdm(result, total=X.shape[0]) result = list(result) @@ -253,12 +271,14 @@ def solve_loo(X, C, y, progress=False): return np.stack(result, axis=0), lambdas - # The functions below are simply helper functions which implement the same functionality for the LASSO (not the C-LASSO) # They are mostly intended for debugging and do not need to be integrated. + def solve_lasso(X, y, lambdas=None): - lambdas, betas, _ = sklearn.linear_model.lasso_path(X, y, intercept=False, lambdas=lambdas) + lambdas, betas, _ = sklearn.linear_model.lasso_path( + X, y, intercept=False, lambdas=lambdas + ) return lambdas, betas.T @@ -270,7 +290,7 @@ def alo_lasso_h(X, y, beta, tol=1e-4): X = X[:, E] K = X.T @ X - H = X @ scipy.linalg.solve(K, X.T, assume_a='pos') + H = X @ scipy.linalg.solve(K, X.T, assume_a="pos") h = np.diag(H) return (y - X @ beta[E]) / (1 - h), h @@ -289,8 +309,8 @@ def alo_lasso_risk(X, y, betas): def _lasso_loo(i, X, y, lambdas): - X_i = np.concatenate((X[:i], X[i+1:])) - y_i = np.concatenate((y[:i], y[i+1:])) + X_i = np.concatenate((X[:i], X[i + 1 :])) + y_i = np.concatenate((y[:i], y[i + 1 :])) return solve_lasso(X_i, y_i, lambdas)[1] @@ -299,10 +319,11 @@ def solve_lasso_loo(X, y, lambdas=None, progress=False): lambdas, _ = solve_lasso(X, y) with multiprocessing.Pool(initializer=_set_sequential_mkl) as pool: - result = pool.imap(functools.partial(_lasso_loo, X=X, y=y, lambdas=lambdas), range(X.shape[0])) + result = pool.imap( + functools.partial(_lasso_loo, X=X, y=y, lambdas=lambdas), range(X.shape[0]) + ) if progress: result = tqdm.tqdm(result, total=X.shape[0]) result = list(result) return lambdas, np.stack(result, axis=0) - diff --git a/classo/compact_func.py b/classo/compact_func.py index 5c59389..c3bc074 100644 --- a/classo/compact_func.py +++ b/classo/compact_func.py @@ -20,16 +20,16 @@ def Classo( matrix, lam, - typ = "R1", - meth = "DR", - rho = 1.345, - get_lambdamax = False, - true_lam = False, - e = None, - rho_classification = -1.0, - w = None, - intercept = False, - return_sigm = True + typ="R1", + meth="DR", + rho=1.345, + get_lambdamax=False, + true_lam=False, + e=None, + rho_classification=-1.0, + w=None, + intercept=False, + return_sigm=True, ): if w is not None: @@ -45,7 +45,7 @@ def Classo( # the intercept is simple beta0 = ybar-Xbar .vdot(beta) # so by changing the X to X-Xbar and y to y-ybar # we can solve standard problem - Xbar, ybar = np.mean(X, axis = 0), np.mean(y) + Xbar, ybar = np.mean(X, axis=0), np.mean(y) matrices = (X - Xbar, C, y - ybar) if meth not in ["Path-Alg", "DR"]: @@ -73,7 +73,7 @@ def Classo( meth = "DR" if e is None or e == len(matrices[0]): r = 1.0 - pb = problem_R4(matrices, meth, rho, intercept = intercept) + pb = problem_R4(matrices, meth, rho, intercept=intercept) e = len(matrices[0]) else: r = np.sqrt(e / len(matrices[0])) @@ -81,7 +81,7 @@ def Classo( (matrices[0] * r, matrices[1], matrices[2] * r), meth, rho / r, - intercept = intercept, + intercept=intercept, ) lambdamax = pb.lambdamax @@ -94,7 +94,7 @@ def Classo( if meth not in ["Path-Alg", "P-PDS", "PF-PDS", "DR"]: meth = "ODE" - pb = problem_R2(matrices, meth, rho, intercept = intercept) + pb = problem_R2(matrices, meth, rho, intercept=intercept) lambdamax = pb.lambdamax if true_lam: beta = Classo_R2(pb, lam / lambdamax) @@ -105,17 +105,21 @@ def Classo( assert set(matrices[2]).issubset({1, -1}) - lambdamax = h_lambdamax( - matrices, rho_classification, typ="C2", intercept = intercept + matrices, rho_classification, typ="C2", intercept=intercept ) if true_lam: out = solve_path( - matrices, lam / lambdamax, False, rho_classification, "C2", intercept = intercept + matrices, + lam / lambdamax, + False, + rho_classification, + "C2", + intercept=intercept, ) else: out = solve_path( - matrices, lam, False, rho_classification, "C2", intercept = intercept + matrices, lam, False, rho_classification, "C2", intercept=intercept ) if intercept: beta0, beta = out[0][-1], out[1][-1] @@ -127,13 +131,13 @@ def Classo( assert set(matrices[2]).issubset({1, -1}) - lambdamax = h_lambdamax(matrices, 0, typ="C1", intercept = intercept) + lambdamax = h_lambdamax(matrices, 0, typ="C1", intercept=intercept) if true_lam: out = solve_path( - matrices, lam / lambdamax, False, 0, "C1", intercept = intercept + matrices, lam / lambdamax, False, 0, "C1", intercept=intercept ) else: - out = solve_path(matrices, lam, False, 0, "C1", intercept = intercept) + out = solve_path(matrices, lam, False, 0, "C1", intercept=intercept) if intercept: beta0, beta = out[0][-1], out[1][-1] beta = np.array([beta0] + list(beta)) @@ -146,7 +150,7 @@ def Classo( # the intercept is simple beta0 = ybar-Xbar .vdot(beta) # so by changing the X to X-Xbar and y to y-ybar # we can solve standard problem - Xbar, ybar = np.mean(X, axis = 0), np.mean(y) + Xbar, ybar = np.mean(X, axis=0), np.mean(y) matrices = (X - Xbar, C, y - ybar) if meth not in ["Path-Alg", "P-PDS", "PF-PDS", "DR"]: @@ -181,25 +185,25 @@ def Classo( def pathlasso( matrix, - lambdas = False, - n_active = 0, - lamin = 1e-2, - typ = "R1", - meth = "Path-Alg", - rho = 1.345, - true_lam = False, - e = None, - return_sigm = False, - rho_classification = -1.0, - w = None, - intercept = False, + lambdas=False, + n_active=0, + lamin=1e-2, + typ="R1", + meth="Path-Alg", + rho=1.345, + true_lam=False, + e=None, + return_sigm=False, + rho_classification=-1.0, + w=None, + intercept=False, ): Nactive = n_active if Nactive == 0: Nactive = False if type(lambdas) is bool: - lambdas = lamin**(np.linspace(0., 1, 100)) + lambdas = lamin ** (np.linspace(0.0, 1, 100)) if lambdas[0] < lambdas[-1]: lambdass = [ @@ -207,7 +211,6 @@ def pathlasso( ] # reverse the list if needed else: lambdass = [lambdas[i] for i in range(len(lambdas))] - if w is not None: matrices = (matrix[0] / w, matrix[1] / w, matrix[2]) @@ -218,16 +221,16 @@ def pathlasso( if typ == "R2": - pb = problem_R2(matrices, meth, rho, intercept = intercept) + pb = problem_R2(matrices, meth, rho, intercept=intercept) lambdamax = pb.lambdamax if true_lam: lambdass = [lamb / lambdamax for lamb in lambdass] - BETA = pathlasso_R2(pb, lambdass, n_active = Nactive) + BETA = pathlasso_R2(pb, lambdass, n_active=Nactive) elif typ == "R3": if intercept: # here we use the fact that for R1 and R3, the intercept is simple beta0 = ybar-Xbar .vdot(beta) so by changing the X to X-Xbar and y to y-ybar we can solve standard problem - Xbar, ybar = np.mean(X, axis = 0), np.mean(y) + Xbar, ybar = np.mean(X, axis=0), np.mean(y) matrices = (X - Xbar, C, y - ybar) if e is None or e == len(matrices[0]) / 2: r = 1.0 @@ -238,7 +241,7 @@ def pathlasso( lambdamax = pb.lambdamax if true_lam: lambdass = [lamb / lambdamax for lamb in lambdass] - BETA, S = pathlasso_R3(pb, lambdass, n_active = Nactive) + BETA, S = pathlasso_R3(pb, lambdass, n_active=Nactive) S = np.array(S) / r ** 2 BETA = np.array(BETA) if intercept: @@ -248,20 +251,20 @@ def pathlasso( if e is None or e == len(matrices[0]): r = 1.0 - pb = problem_R4(matrices, meth, rho, intercept = intercept) + pb = problem_R4(matrices, meth, rho, intercept=intercept) else: r = np.sqrt(e / len(matrices[0])) pb = problem_R4( (matrices[0] * r, matrices[1], matrices[2] * r), meth, rho / r, - intercept = intercept, + intercept=intercept, ) lambdamax = pb.lambdamax if true_lam: lambdass = [lamb / lambdamax for lamb in lambdass] - BETA, S = pathlasso_R4(pb, lambdass, n_active = Nactive) + BETA, S = pathlasso_R4(pb, lambdass, n_active=Nactive) S = np.array(S) / r ** 2 BETA = np.array(BETA) @@ -270,7 +273,7 @@ def pathlasso( assert set(matrices[2]).issubset({1, -1}) lambdamax = h_lambdamax( - matrices, rho_classification, typ="C2", intercept = intercept + matrices, rho_classification, typ="C2", intercept=intercept ) if true_lam: lambdass = [lamb / lambdamax for lamb in lambdass] @@ -278,20 +281,20 @@ def pathlasso( matrices, lambdass, "C2", - n_active = Nactive, - rho = rho_classification, - intercept = intercept, + n_active=Nactive, + rho=rho_classification, + intercept=intercept, ) elif typ == "C1": assert set(matrices[2]).issubset({1, -1}) - lambdamax = h_lambdamax(matrices, 0, typ="C1", intercept = intercept) + lambdamax = h_lambdamax(matrices, 0, typ="C1", intercept=intercept) if true_lam: lambdass = [lamb / lambdamax for lamb in lambdass] BETA = pathalgo_general( - matrices, lambdass, "C1", n_active = Nactive, intercept = intercept + matrices, lambdass, "C1", n_active=Nactive, intercept=intercept ) else: # R1 @@ -300,18 +303,17 @@ def pathlasso( # the intercept is simple beta0 = ybar-Xbar .vdot(beta) # so by changing the X to X-Xbar and y to y-ybar # we can solve standard problem - Xbar, ybar = np.mean(X, axis = 0), np.mean(y) + Xbar, ybar = np.mean(X, axis=0), np.mean(y) matrices = (X - Xbar, C, y - ybar) pb = problem_R1(matrices, meth) lambdamax = pb.lambdamax if true_lam: lambdass = [lamb / lambdamax for lamb in lambdass] - BETA = pathlasso_R1(pb, lambdass, n_active = n_active) + BETA = pathlasso_R1(pb, lambdass, n_active=n_active) if intercept: BETA = np.array([[ybar - Xbar.dot(beta)] + list(beta) for beta in BETA]) - real_path = [lam * lambdamax for lam in lambdass] if w is not None: @@ -322,7 +324,6 @@ def pathlasso( BETA = np.array([beta / ww for beta in BETA]) - if typ in ["R3", "R4"] and return_sigm: return (np.array(BETA), real_path, S) return (np.array(BETA), real_path) diff --git a/classo/cross_validation.py b/classo/cross_validation.py index e4e3f92..915391c 100644 --- a/classo/cross_validation.py +++ b/classo/cross_validation.py @@ -10,16 +10,16 @@ def train_test_CV(n, k): sublist_len = n // k rest = n % k for i in range(k): - if i == k-1: - SUBLIST.append(idx[i * sublist_len:(i+1) * sublist_len + rest]) + if i == k - 1: + SUBLIST.append(idx[i * sublist_len : (i + 1) * sublist_len + rest]) else: - SUBLIST.append(idx[i * sublist_len:(i+1) * sublist_len]) + SUBLIST.append(idx[i * sublist_len : (i + 1) * sublist_len]) return SUBLIST def train_test_i(SUBLIST, i): - # create training and test sets + # create training and test sets training_set, test_set = np.array([], dtype=int), SUBLIST[i] for j in range(len(SUBLIST)): if j != i: @@ -43,17 +43,18 @@ def training( mat = (A[training_set], C, y[training_set]) sol = pathlasso( mat, - lambdas = lambdas, - typ = typ, - meth = num_meth, - rho = rho, - e = e, - rho_classification = rho_classification, - w = w, - intercept = intercept, + lambdas=lambdas, + typ=typ, + meth=num_meth, + rho=rho, + e=e, + rho_classification=rho_classification, + w=w, + intercept=intercept, )[0] return sol + def cv_test_i( matrices, typ, @@ -88,13 +89,14 @@ def cv_test_i( matrices[2][test_set], BETA[j], typ, - rho = rho, - rho_classification = rho_classification, - intercept = intercept, + rho=rho, + rho_classification=rho_classification, + intercept=intercept, ) return residual + def average_test( matrices, typ, @@ -124,8 +126,8 @@ def average_test( w, intercept, ) - MSE = np.mean(RESIDUAL, axis = 0) - SE = np.std(RESIDUAL, axis = 0) / np.sqrt(k) + MSE = np.mean(RESIDUAL, axis=0) + SE = np.std(RESIDUAL, axis=0) / np.sqrt(k) return (MSE, SE) @@ -134,15 +136,15 @@ def CV( k, typ="R1", num_meth="Path-Alg", - seed = 1, - rho = 1.345, - rho_classification = -1.0, - e = 1.0, - lambdas = None, - Nlam = 100, - oneSE = True, - w = None, - intercept = False, + seed=1, + rho=1.345, + rho_classification=-1.0, + e=1.0, + lambdas=None, + Nlam=100, + oneSE=True, + w=None, + intercept=False, ): if lambdas is None: @@ -174,13 +176,13 @@ def CV( out = Classo( matrices, lam, - typ = typ, - meth = num_meth, - rho = rho, - e = e, - rho_classification = rho_classification, - w = w, - intercept = intercept, + typ=typ, + meth=num_meth, + rho=rho, + e=e, + rho_classification=rho_classification, + w=w, + intercept=intercept, ) return (out, MSE, SE, i, i_1SE) @@ -212,7 +214,7 @@ def accuracy_func( ): if intercept: - Aprime = np.concatenate([np.ones((len(A), 1)), A], axis = 1) + Aprime = np.concatenate([np.ones((len(A), 1)), A], axis=1) else: Aprime = A[:, :] @@ -220,7 +222,7 @@ def accuracy_func( if typ == "R2": return hub(Aprime.dot(beta) - y, rho) / n - elif typ in ["C1","C2"]: + elif typ in ["C1", "C2"]: return misclassification_rate(Aprime, y, beta) else: return LA.norm(Aprime.dot(beta) - y, 2) ** 2 / n @@ -236,4 +238,4 @@ def huber_hinge(A, y, beta, rho): else: s += h[i] ** 2 return s -""" \ No newline at end of file +""" diff --git a/classo/misc_functions.py b/classo/misc_functions.py index c62200f..28cda75 100644 --- a/classo/misc_functions.py +++ b/classo/misc_functions.py @@ -46,8 +46,6 @@ """ - - def theoretical_lam(n, d): r"""Theoretical lambda as a function of the dimensions of the problem @@ -101,20 +99,19 @@ def min_LS(matrices, selected, intercept=False): def affichage( LISTE_BETA, path, - title = " ", - labels = False, - xlabel = " ", - ylabel = " ", - naffichage = 10, + title=" ", + labels=False, + xlabel=" ", + ylabel=" ", + naffichage=10, ): BETAS = np.array(LISTE_BETA) l_index = influence(BETAS, naffichage) plot_betai(labels, l_index, path, BETAS) - plt.title(title), plt.legend(loc = 4, borderaxespad = 0.0) + plt.title(title), plt.legend(loc=4, borderaxespad=0.0) plt.xlabel(xlabel), plt.ylabel(ylabel) - def check_size(X, y, C): samples, d_in_x = min(len(y), len(X)), len(X[0]) X2, y2 = X[:samples], y[:samples] @@ -131,8 +128,8 @@ def check_size(X, y, C): elif d_in_c > d_in_x: C2 = C[:, :d_in_x] else: - C2 = np.zeros((k, d_in_x )) - C2[:, : d_in_c] = C + C2 = np.zeros((k, d_in_x)) + C2[:, :d_in_c] = C return X2, y2, C2 @@ -148,14 +145,14 @@ def random_data( d_nonzero, k, sigma, - zerosum = False, - seed = False, - classification = False, - exp = False, - A = None, - lb_beta = 3, - ub_beta = 10, - intercept = None, + zerosum=False, + seed=False, + classification=False, + exp=False, + A=None, + lb_beta=3, + ub_beta=10, + intercept=None, ): """Generation of random matrices as data such that y = X.sol + sigma. noise @@ -192,7 +189,7 @@ def random_data( reduc = np.random.permutation(d1) < d_nonzero # sol reduc is random int between lb_beta and ub_beta with a random sign. - sol_reduc = np.random.randint(low = lb_beta, high = ub_beta + 1, size = d_nonzero) * ( + sol_reduc = np.random.randint(low=lb_beta, high=ub_beta + 1, size=d_nonzero) * ( np.random.randint(2, size=d_nonzero) * 2 - 1 ) @@ -206,38 +203,36 @@ def random_data( return ((X, np.zeros((1, d1)), y), sol) while True: - C = np.random.randint(low = -1, high = 1, size = (k, d)) + C = np.random.randint(low=-1, high=1, size=(k, d)) if LA.matrix_rank(C) == k: break - while True: # building a sparse solution such that C.A sol = 0 sol = np.zeros(d1) C_reduc = C.dot(A)[:, reduc] if LA.matrix_rank(C_reduc) < k: - list_i = np.random.randint(d1, size = d_nonzero) + list_i = np.random.randint(d1, size=d_nonzero) reduc = np.random.permutation(d1) < d_nonzero continue proj = proj_c(C_reduc, d_nonzero).dot(sol_reduc) sol[reduc] = proj break - y = X.dot(A.dot(sol)) + np.random.randn(n) * sigma - + if intercept is not None: y = y + intercept if classification: y = np.sign(y) if exp: X = np.exp(X) - + return (X, C, y), sol -def clr(array, coef = 0.5): +def clr(array, coef=0.5): """Centered-Log-Ratio transformation Set all non positive entry to a constant coef. Then compute the log of each component. Then substract the mean of each column. @@ -254,7 +249,7 @@ def clr(array, coef = 0.5): null_set = M <= 0.0 M[null_set] = np.ones(M[null_set].shape) * coef M = np.log(M) - return M - np.mean(M, axis = 0) + return M - np.mean(M, axis=0) """ @@ -262,7 +257,7 @@ def clr(array, coef = 0.5): """ -def unpenalized(cmatrices, intercept = False): +def unpenalized(cmatrices, intercept=False): if intercept: A1, C1, y = cmatrices @@ -278,8 +273,8 @@ def unpenalized(cmatrices, intercept = False): M2 = np.concatenate([C, np.zeros((k, k))], axis=1) M = np.concatenate([M1, M2], axis=0) b = np.concatenate([A.T.dot(y), np.zeros(k)]) - sol = LA.lstsq( M, b, rcond=None)[0] - beta = sol[: d] + sol = LA.lstsq(M, b, rcond=None)[0] + beta = sol[:d] return beta @@ -303,7 +298,7 @@ def plot_betai(labels, l_index, path, BETAS): def influence(BETAS, ntop): - means = np.mean(abs(BETAS), axis = 0) + means = np.mean(abs(BETAS), axis=0) top = np.argpartition(means, -ntop)[-ntop:] return np.sort(top) @@ -438,4 +433,4 @@ def tree_to_matrix(tree, label, with_repr = False): return np.array(L)[to_keep].T, np.array(label2)[to_keep], tree_repr -""" \ No newline at end of file +""" diff --git a/classo/path_alg.py b/classo/path_alg.py index 0a589a5..4e3d0c0 100644 --- a/classo/path_alg.py +++ b/classo/path_alg.py @@ -46,7 +46,7 @@ class parameters_for_update: """ - def __init__(self, matrices, lamin, rho, typ, eps_L2 = 1e-3, intercept = False): + def __init__(self, matrices, lamin, rho, typ, eps_L2=1e-3, intercept=False): if typ == "C2" and rho > 1: raise ValueError( "For huberized hinge, rho has to be smaller than 1, but here it is :", @@ -118,9 +118,8 @@ def __init__(self, matrices, lamin, rho, typ, eps_L2 = 1e-3, intercept = False): self.Xt = LA.inv(N + self.eps_L2 * np.eye(len(N))) - # iteration of the function up to solve the path at each breaking points. -def solve_path(matrices, lamin, n_active, rho, typ, intercept = False): +def solve_path(matrices, lamin, n_active, rho, typ, intercept=False): """ This functions will compute the path for all the breaking points : beta is a piecewise linear function of lambda, and only value on the breaking points @@ -151,32 +150,30 @@ def solve_path(matrices, lamin, n_active, rho, typ, intercept = False): for i in range(d * N_frac): still_F = np.any(param.F) - too_active = (n_active > 0 and param.number_act >= n_active) - + too_active = n_active > 0 and param.number_act >= n_active + if not still_F or too_active or param.lam == lamin: if intercept: return BETA0, BETA, LAM else: return BETA, LAM - - #elif not np.any(param.F): + + # elif not np.any(param.F): # print(param.r) # raise ValueError("The problem looks infeasible because the set of active sample became zero, " # "at the iteration {} " - # "for formulation {} " + # "for formulation {} " # "with intercept ? {} " # "with rho equal to {} ".format(i, typ, intercept, rho )) - + up(param) BETA.append(param.beta), LAM.append(param.lam) - #print("inside : ", param.r[ param.F], np.nonzero(param.F)[0] ) - #print(" outside : ", param.r[~param.F], np.nonzero(~param.F)[0]) - + # print("inside : ", param.r[ param.F], np.nonzero(param.F)[0] ) + # print(" outside : ", param.r[~param.F], np.nonzero(~param.F)[0]) if intercept: BETA0.append(param.beta0) - raise ValueError( "The path algorithm did not finsh after %i iterations " % N, @@ -185,7 +182,7 @@ def solve_path(matrices, lamin, n_active, rho, typ, intercept = False): ) -def solve_path_Conc(matrices, stop, n_active = False, lassopath = True, true_lam = False): +def solve_path_Conc(matrices, stop, n_active=False, lassopath=True, true_lam=False): """ This functions will compute the path for all the breaking points : beta is a piecewise linear function of lambda, and only value on the breaking points @@ -255,7 +252,7 @@ def solve_path_Conc(matrices, stop, n_active = False, lassopath = True, true_lam ) -def pathalgo_general(matrix, path, typ, n_active = False, rho = 0, intercept = False): +def pathalgo_general(matrix, path, typ, n_active=False, rho=0, intercept=False): """ This function is only to interpolate the solution path between the breaking points """ @@ -269,7 +266,6 @@ def pathalgo_general(matrix, path, typ, n_active = False, rho = 0, intercept = F B, sp_path = solve_path( matrix, path[-1], n_active, rho, typ, intercept=intercept ) - sp_path.append(path[-1]), B.append(B[-1]) for lam in path: @@ -283,7 +279,6 @@ def pathalgo_general(matrix, path, typ, n_active = False, rho = 0, intercept = F if intercept: BETA = np.array([[BETA0[i]] + list(BETA[i]) for i in range(len(BETA0))]) - return BETA @@ -371,7 +366,7 @@ def up_LS(param): Xt = LA.inv(N + eps_L2 * np.eye(len(N))) beta = beta - lambdamax * beta_dot * dlamb - if dlamb < lam : + if dlamb < lam: s = lam_s_dot + lam / (lam - dlamb) * (s - lam_s_dot) lam -= dlamb @@ -409,7 +404,7 @@ def up_huber(param): lam = param.lam M = param.M r = param.r - + d = len(activity) L = [lam] * d Mat = M[:d, :d] @@ -451,7 +446,7 @@ def up_huber(param): if dl < dlamb: huber_up, j_switch, dlamb = True, j, dl beta = beta - lambdamax * beta_dot * dlamb - if dlamb < lam : + if dlamb < lam: s = lam_s_dot + lam / (lam - dlamb) * (s - lam_s_dot) r = r + ADl * dlamb lam = lam - dlamb @@ -462,9 +457,9 @@ def up_huber(param): if huber_up: F[j_switch] = not F[j_switch] # sufficient : - #F = F | (abs(r) < rho - 1e-6) + # F = F | (abs(r) < rho - 1e-6) # necessary : - #F = F & (abs(r) <= rho + 1e-6) + # F = F & (abs(r) <= rho + 1e-6) if param.intercept: P = A[F] - np.mean(A[F], axis=0) @@ -567,7 +562,7 @@ def up_cl(param): max_up, j_switch, dlamb = True, j, dl beta = beta - lambdamax * beta_dot * dlamb - if dlamb < lam : + if dlamb < lam: s = lam_s_dot + lam / (lam - dlamb) * (s - lam_s_dot) r = r + yADl * dlamb lam = lam - dlamb @@ -579,9 +574,9 @@ def up_cl(param): if max_up: F[j_switch] = not F[j_switch] # sufficient : - #F = F | (r < 1. - 1e-10) + # F = F | (r < 1. - 1e-10) # necessary : - #F = F & (r <= 1. + 1e-10) + # F = F & (r <= 1. + 1e-10) if param.intercept: P = A[F] - np.mean(A[F], axis=0) M[:d, :][:, :d] = 2 * P.T.dot(P) + eps_L2 * np.eye(d) @@ -707,7 +702,7 @@ def up_huber_cl(param): max_up, j_switch, dlamb = True, j, dl beta = beta - lambdamax * beta_dot * dlamb - if dlamb < lam : + if dlamb < lam: s = lam_s_dot + lam / (lam - dlamb) * (s - lam_s_dot) r = r + yADl * dlamb lam = lam - dlamb @@ -718,9 +713,9 @@ def up_huber_cl(param): if max_up: F[j_switch] = not F[j_switch] # sufficient : - #F = F | ( (r < 1. - 1e-10) & (r > rho + 1e-10) ) + # F = F | ( (r < 1. - 1e-10) & (r > rho + 1e-10) ) # necessary : - #F = F & ( (r <= 1. + 1e-10) & (r >= rho - 1e-10) ) + # F = F & ( (r <= 1. + 1e-10) & (r >= rho - 1e-10) ) if np.any(F): if param.intercept: @@ -831,7 +826,7 @@ def next_idr2(liste, mat): return False -def h_lambdamax(matrices, rho, typ = "R1", intercept=False): +def h_lambdamax(matrices, rho, typ="R1", intercept=False): param = parameters_for_update(matrices, 0.0, rho, typ, intercept=intercept) return param.lambdamax @@ -888,7 +883,7 @@ def find_beta0(r, dbeta0, y, rho, typ): return beta0 -def binary_search(f, a, b, tol = 1e-8): +def binary_search(f, a, b, tol=1e-8): c = (a + b) / 2 # if f(a) * f(b) > 0: # print("gradh(min(y)) = ", f(a)) @@ -944,4 +939,4 @@ def pathalgo_cl(matrix, path, n_active = False, intercept = False): -""" \ No newline at end of file +""" diff --git a/classo/solve_R1.py b/classo/solve_R1.py index 3143f80..7d199f1 100644 --- a/classo/solve_R1.py +++ b/classo/solve_R1.py @@ -15,6 +15,7 @@ tol = 1e-5 + def Classo_R1(pb, lam): pb_type = pb.type # can be 'Path-Alg', 'P-PDS' , 'PF-PDS' or 'DR' @@ -35,7 +36,6 @@ def Classo_R1(pb, lam): Anorm = pb.Anorm tol = pb.tol * LA.norm(y) / Anorm # tolerance rescaled - Proj = proj_c(C, d) AtA = pb.AtA Aty = pb.Aty @@ -45,7 +45,6 @@ def Classo_R1(pb, lam): d ) # two vectors usefull to compute the prox of f(b) = sum(wi |bi|) - if pb_type == "PF-PDS": # y1 --> S ; p1 --> p . ; p2 --> y2 (x, v) = pb.init for i in range(pb.N): @@ -72,7 +71,6 @@ def Classo_R1(pb, lam): "The algorithm of PF-PDS did not converge after %i iterations " % pb.N ) - if pb_type == "P-PDS": xbar, x, v = pb.init for i in range(pb.N): @@ -98,12 +96,11 @@ def Classo_R1(pb, lam): "The algorithm of P-PDS did not converge after %i iterations " % pb.N ) - - else: # "DR": + else: # "DR": gamma = gamma / (2 * lam) w = w / (2 * lam) mu, ls, c, root = pb.mu, [], pb.c, 0.0 - Q1, Q2 = QQ(2 * gamma / (mu - 1), A, AtA = pb.AtA, AAt = pb.AAt) + Q1, Q2 = QQ(2 * gamma / (mu - 1), A, AtA=pb.AtA, AAt=pb.AAt) QA, qy = Q1.dot(A), Q1.dot(y) qy_mult = qy * (mu - 1) @@ -138,7 +135,7 @@ def Classo_R1(pb, lam): """ -def pathlasso_R1(pb, path, n_active = False): +def pathlasso_R1(pb, path, n_active=False): n, d, k = pb.dim BETA, tol = [], pb.tol if pb.type == "Path-Alg": @@ -256,10 +253,12 @@ def prox(b, w, zeros): # Compute I - C^t (C.C^t)^-1 . C : the projection on Ker(C) def proj_c(M, d): k = len(M) - return np.eye(d) - LA.multi_dot([M.T, np.linalg.inv(M.dot(M.T)+ 1e-4 * np.eye(k)), M]) + return np.eye(d) - LA.multi_dot( + [M.T, np.linalg.inv(M.dot(M.T) + 1e-4 * np.eye(k)), M] + ) -def QQ(coef, A, AtA = None, AAt = None): +def QQ(coef, A, AtA=None, AAt=None): if AtA is None: AtA = (A.T).dot(A) if AAt is None: diff --git a/classo/solve_R2.py b/classo/solve_R2.py index d69967e..c42411d 100644 --- a/classo/solve_R2.py +++ b/classo/solve_R2.py @@ -17,7 +17,8 @@ tol = 1e-5 -def Classo_R2(pb, lam, compute = True): + +def Classo_R2(pb, lam, compute=True): pb_type = pb.type # can be 'Path-Alg', 'P-PDS' , 'PF-PDS' or 'DR' @@ -51,7 +52,7 @@ def Classo_R2(pb, lam, compute = True): r = lamb / (2 * rho) if pb_type == "DR": if compute: - pb.init_R1(r = r) + pb.init_R1(r=r) x = Classo_R1(pb.prob_R1, lamb / pb.prob_R1.lambdamax) beta = x[:-m] if pb.intercept: @@ -59,7 +60,7 @@ def Classo_R2(pb, lam, compute = True): beta = np.array([betaO] + list(beta)) return beta else: - pb.add_r(r = r) + pb.add_r(r=r) if len(pb.init) == 3: pb.prob_R1.init = pb.init x, warm_start = Classo_R1(pb.prob_R1, lamb / pb.prob_R1.lambdamax) @@ -115,7 +116,7 @@ def Classo_R2(pb, lam, compute = True): "The algorithm of P-PDS did not converge after %i iterations " % pb.N ) - else: # "PF-PDS" + else: # "PF-PDS" for i in range(pb.N): grad = AtA.dot(x) - Aty @@ -158,7 +159,7 @@ def Classo_R2(pb, lam, compute = True): """ -def pathlasso_R2(pb, path, n_active = False): +def pathlasso_R2(pb, path, n_active=False): n, d, k = pb.dim BETA, tol = [], pb.tol if pb.type == "Path-Alg": @@ -179,7 +180,7 @@ def pathlasso_R2(pb, path, n_active = False): else: n_act = d + 1 for lam in path: - X = Classo_R2(pb, lam, compute = False) + X = Classo_R2(pb, lam, compute=False) BETA.append(X[0]) pb.init = X[1] if ( @@ -203,7 +204,7 @@ def pathlasso_R2(pb, path, n_active = False): class problem_R2: - def __init__(self, data, algo, rho, intercept = False): + def __init__(self, data, algo, rho, intercept=False): self.N = 500000 (AA, CC, y) = data @@ -213,8 +214,8 @@ def __init__(self, data, algo, rho, intercept = False): self.intercept = intercept if intercept: # add a column of 1 in A, and change weight. - A = np.concatenate([np.ones((len(A), 1)), A], axis = 1) - C = np.concatenate([np.zeros((len(C), 1)), C], axis = 1) + A = np.concatenate([np.ones((len(A), 1)), A], axis=1) + C = np.concatenate([np.zeros((len(C), 1)), C], axis=1) self.weights = np.concatenate([[0.0], self.weights]) yy = y - np.mean(y) @@ -254,7 +255,7 @@ def compute_param(self): self.tauN = self.tau / self.Cnorm self.AtAnorm = LA.norm(self.AtA, 2) - def init_R1(self, r = 0.0): + def init_R1(self, r=0.0): (AA, CC, y) = self.matrix A, C = AA[:, :], CC[:, :] (m, d, k) = self.dim @@ -265,7 +266,7 @@ def init_R1(self, r = 0.0): Chuber = np.append(C, np.zeros((k, m)), 1) yhuber = y if self.intercept: - Abar = np.mean(Ahuber, axis = 0) + Abar = np.mean(Ahuber, axis=0) ybar = np.mean(y) Ahuber = Ahuber - Abar yhuber = yhuber - ybar @@ -276,7 +277,7 @@ def init_R1(self, r = 0.0): if self.intercept: prob.Abar = Abar prob.ybar = ybar - self.AAt = (A - np.mean(A, axis = 0)).dot((A - np.mean(A, axis = 0)).T) + self.AAt = (A - np.mean(A, axis=0)).dot((A - np.mean(A, axis=0)).T) else: self.AAt = A.dot(A.T) self.prob_R1 = prob @@ -293,7 +294,7 @@ def add_r(self, r): extension = np.eye(m) if self.intercept: # self.init_R1(r=r) - extension = extension - np.mean(extension, axis = 0) + extension = extension - np.mean(extension, axis=0) extension = r * extension A_r1[:, d:] = extension right_bottom = extension.dot(extension) diff --git a/classo/solve_R3.py b/classo/solve_R3.py index 7252ef3..f49d36a 100644 --- a/classo/solve_R3.py +++ b/classo/solve_R3.py @@ -17,6 +17,7 @@ tol = 1e-5 + def Classo_R3(pb, lam): pb_type = pb.type # can be 'Path-Alg' or 'DR' (m, d, k), (A, C, y) = pb.dim, pb.matrix @@ -35,7 +36,7 @@ def Classo_R3(pb, lam): # Then we only have to finc the solution between the last beta computed and the one before. if pb_type == "Path-Alg": (beta1, beta2), (s1, s2), (r1, r2) = solve_path_Conc( - (A, C, y), lam, lassopath = False + (A, C, y), lam, lassopath=False ) dr, ds = r1 - r2, s1 - s2 teta = root_2( @@ -47,12 +48,12 @@ def Classo_R3(pb, lam): beta = beta1 * teta + beta2 * (1 - teta) return (beta, sigma) - else: # DR + else: # DR regpath = pb.regpath if not regpath: pb.compute_param() - lamb = lam * pb.lambdamax + lamb = lam * pb.lambdamax Anorm = pb.Anorm tol = pb.tol * LA.norm(y) / Anorm # tolerance rescaled Proj = proj_c(C, d) # Proj = I - C^t . (C . C^t )^-1 . C @@ -68,8 +69,7 @@ def Classo_R3(pb, lam): mu, c, root = pb.mu, pb.c, 0.0 xs, nu, o, xbar, x = pb.init - - b, s = 0., 0. # just for flake8 purpose. + b, s = 0.0, 0.0 # just for flake8 purpose. for i in range(pb.N): nv_b = x + Q1.dot(o) - QA.dot(x) - Q2.dot(x - xbar) nv_s = (xs + nu) / 2 @@ -82,7 +82,7 @@ def Classo_R3(pb, lam): s, b = nv_s, nv_b Ab = A.dot(b) - p1, p2, root = prox_phi_1(xs, 2 * Ab - o - y, np.sqrt(m)*gamma / c, root) + p1, p2, root = prox_phi_1(xs, 2 * Ab - o - y, np.sqrt(m) * gamma / c, root) sup1 = max(0, nu) - s sup2 = p1 - s sup3 = p2 + y - Ab @@ -112,7 +112,7 @@ def Classo_R3(pb, lam): """ -def pathlasso_R3(pb, path, n_active = False): +def pathlasso_R3(pb, path, n_active=False): n, d, k = pb.dim BETA, SIGMA, tol = [], [], pb.tol @@ -199,7 +199,7 @@ def __init__(self, data, algo): self.proj_sigm = lambda x: max(x, 0) self.mu = 1.95 - self.gam = 1. #np.sqrt(d/m) + self.gam = 1.0 # np.sqrt(d/m) self.Aty = (A.T).dot(y) self.sigmax = LA.norm(y) / np.sqrt(m / 2) self.lambdamax = 2 * LA.norm(self.Aty, np.infty) / self.sigmax diff --git a/classo/solve_R4.py b/classo/solve_R4.py index 908cf49..b495bbd 100644 --- a/classo/solve_R4.py +++ b/classo/solve_R4.py @@ -16,6 +16,7 @@ tol = 1e-5 + def Classo_R4(pb, lam): pb_type = pb.type # can be 'Path-Alg' or 'DR' (m, d, k), (A, C, y) = pb.dim, pb.matrix @@ -35,14 +36,14 @@ def Classo_R4(pb, lam): # trick of mean-shift formulation explained in the pdf "concomitant huber" # problem of e ==> same trick to do as explained as in the end of the file compact_func, with r = np.sqrt(2) - A_aug = np.sqrt(2) * np.concatenate((A, lamb / (2 * rho) * np.eye(m)), axis = 1) - C_aug = np.concatenate((C, np.zeros((k, m))), axis = 1) + A_aug = np.sqrt(2) * np.concatenate((A, lamb / (2 * rho) * np.eye(m)), axis=1) + C_aug = np.concatenate((C, np.zeros((k, m))), axis=1) y_aug = np.sqrt(2) * y if pb.intercept: A_aug = A_aug[:, 1:] C_aug = C_aug[:, 1:] - Abar = np.mean(A_aug, axis = 0) + Abar = np.mean(A_aug, axis=0) ybar = np.mean(y_aug) A_aug = A_aug - Abar y_aug = y_aug - ybar @@ -60,7 +61,7 @@ def Classo_R4(pb, lam): # Hence, the prox is not so easy to compute because there is a root of polynomial of degree 3 to compute. # We do that in the function prox_phi_2 which use the function prox_phi_i (prox of one componant), # and it uses calc_Newton which uses newton's method with good initialization. - else: # "DR": + else: # "DR": if not regpath: pb.compute_param() @@ -81,7 +82,7 @@ def Classo_R4(pb, lam): root = [0.0] * len(y) xs, nu, o, xbar, x = pb.init - b,s = 0., 0. # just for flake8 purpose + b, s = 0.0, 0.0 # just for flake8 purpose for i in range(pb.N): nv_b = x + Q1.dot(o) - QA.dot(x) - Q2.dot(x - xbar) nv_s = (xs + nu) / 2 @@ -128,7 +129,7 @@ def Classo_R4(pb, lam): """ -def pathlasso_R4(pb, path, n_active = False): +def pathlasso_R4(pb, path, n_active=False): n, d, k = pb.dim BETA, SIGMA, tol = [], [], pb.tol pb.type = "DR" @@ -166,7 +167,7 @@ def pathlasso_R4(pb, path, n_active = False): class problem_R4: - def __init__(self, data, algo, rho, intercept = False): + def __init__(self, data, algo, rho, intercept=False): self.N = 500000 (AA, C, y) = data @@ -175,8 +176,8 @@ def __init__(self, data, algo, rho, intercept = False): self.intercept = intercept if intercept: # add a column of 1 in A, and change weight. - A = np.concatenate([np.ones((len(A), 1)), A], axis = 1) - C = np.concatenate([np.ones((len(C), 1)), C], axis = 1) + A = np.concatenate([np.ones((len(A), 1)), A], axis=1) + C = np.concatenate([np.ones((len(C), 1)), C], axis=1) self.weights = np.concatenate([[0.0], self.weights]) # not exactly what it should be... yy = y - np.mean(y) diff --git a/classo/solver.py b/classo/solver.py index 65878a6..896ee6b 100644 --- a/classo/solver.py +++ b/classo/solver.py @@ -20,12 +20,8 @@ import numpy as np import matplotlib.pyplot as plt -from .misc_functions import ( - theoretical_lam, - min_LS, - affichage, - check_size -) +from .misc_functions import theoretical_lam, min_LS, affichage, check_size + # from .misc_functions import tree_to_matrix from .compact_func import Classo, pathlasso from .cross_validation import CV @@ -59,9 +55,9 @@ class classo_problem: """ def __init__( - self, X, y, C = None, Tree = None, label = None + self, X, y, C=None, Tree=None, label=None ): # zero sum constraint by default, but it can be any matrix - self.data = Data(X, y, C, Tree = Tree, label = label) + self.data = Data(X, y, C, Tree=Tree, label=label) self.formulation = Formulation() self.model_selection = Model_selection() self.solution = Solution() @@ -91,7 +87,6 @@ def solve(self): else: self.formulation.e = n / 2 - if self.formulation.w is not None: if min(self.formulation.w) < 1e-8: raise ValueError( @@ -123,12 +118,14 @@ def solve(self): yy = data.y - np.mean(data.y) else: yy = data.y - + if self.formulation.scale_rho: - self.formulation.rho_scaled = self.formulation.rho * np.sqrt(np.mean(yy**2)) - else: + self.formulation.rho_scaled = self.formulation.rho * np.sqrt( + np.mean(yy ** 2) + ) + else: self.formulation.rho_scaled = self.formulation.rho - + label = data.label # Compute the path thanks to the class solution_path which contains directely the computation in the initialisation @@ -141,7 +138,6 @@ def solve(self): label, ) - # Compute the ALO thanks to the class solution_ALO which contains directely the computation in the initialisation if self.model_selection.ALO: self.solution.ALO = solution_ALO( @@ -152,7 +148,6 @@ def solve(self): label, ) - # Compute the cross validation thanks to the class solution_CV which contains directely the computation in the initialisation if self.model_selection.CV: self.solution.CV = solution_CV( @@ -202,8 +197,7 @@ def __repr__(self): if self.model_selection.ALO: print_parameters += ( - "\n \nALO PARAMETERS: " - + self.model_selection.ALOparameters.__repr__() + "\n \nALO PARAMETERS: " + self.model_selection.ALOparameters.__repr__() ) if self.model_selection.CV: @@ -250,7 +244,7 @@ class Data: """ - def __init__(self, X, y, C, Tree = None, label = None): + def __init__(self, X, y, C, Tree=None, label=None): X1, y1, C1 = check_size(X, y, C) if Tree is None: @@ -260,8 +254,7 @@ def __init__(self, X, y, C, Tree = None, label = None): self.label = np.array(label) self.X, self.y, self.C, self.tree = X1, y1, C1, None - - #else: + # else: # A, label2, subtree = tree_to_matrix(Tree, label, with_repr = True) # self.tree = subtree # self.X, self.y, self.C, self.label = ( @@ -313,12 +306,12 @@ class Formulation: (i.e. at least one sample is on the quadratic mode of the huber loss function) as long as rho is higher than one. Default value : True - + rho_scaled (float): Actual rho after solving Default value : Not defined - rho_classification (float) : value of rho for huberized hinge loss function for classification ie C2 + rho_classification (float) : value of rho for huberized hinge loss function for classification ie C2 (it has to be strictly smaller then 1). Default value : -1. @@ -400,24 +393,24 @@ class Model_selection: """ - def __init__(self, method = "not specified"): + def __init__(self, method="not specified"): # Model selection variables self.PATH = False - self.PATHparameters = PATHparameters(method = method) + self.PATHparameters = PATHparameters(method=method) self.ALO = False - self.ALOparameters = ALOparameters(method = method) + self.ALOparameters = ALOparameters(method=method) self.CV = False - self.CVparameters = CVparameters(method = method) + self.CVparameters = CVparameters(method=method) self.StabSel = True # Only model selection that is used by default - self.StabSelparameters = StabSelparameters(method = method) + self.StabSelparameters = StabSelparameters(method=method) self.LAMfixed = False - self.LAMfixedparameters = LAMfixedparameters(method = method) + self.LAMfixedparameters = LAMfixedparameters(method=method) def __repr__(self): string = "" @@ -469,7 +462,7 @@ class PATHparameters: """ - def __init__(self, method = "not specified"): + def __init__(self, method="not specified"): self.formulation = "not specified" self.numerical_method = method self.n_active = 0 @@ -534,7 +527,7 @@ class ALOparameters: """ - def __init__(self, method = "not specified"): + def __init__(self, method="not specified"): self.formulation = "not specified" self.numerical_method = method self.n_active = 0 @@ -599,7 +592,7 @@ class CVparameters: """ - def __init__(self, method = "not specified"): + def __init__(self, method="not specified"): self.seed = 0 self.formulation = "not specified" self.numerical_method = method @@ -682,7 +675,7 @@ class StabSelparameters: """ - def __init__(self, method = "not specified"): + def __init__(self, method="not specified"): self.seed = 123 self.formulation = "not specified" self.numerical_method = method @@ -745,7 +738,7 @@ class LAMfixedparameters: Default value : None """ - def __init__(self, method = "not specified"): + def __init__(self, method="not specified"): self.lam = "theoretical" self.formulation = "not specified" self.numerical_method = method @@ -760,7 +753,7 @@ def __repr__(self): string += "\n threshold : average of the absolute value of beta" else: string += "\n threshold = " + str(round(self.threshold, 3)) - + if type(self.lam) is str: string += "\n lam : " + self.lam else: @@ -790,7 +783,9 @@ def __init__(self): self.PATH = "not computed" # this will be filled with an object of the class 'solution_PATH' when the method solve() will be used. self.ALO = "not computed" # this will be filled with an object of the class 'solution_ALO' when the method solve() will be used. self.CV = "not computed" # will be an object of the class 'solution_PATH' - self.StabSel = "not computed" # will be an object of the class 'solution_StabSel' + self.StabSel = ( + "not computed" # will be an object of the class 'solution_StabSel' + ) self.LAMfixed = "not computed" def __repr__(self): @@ -798,7 +793,7 @@ def __repr__(self): for obj in [self.LAMfixed, self.PATH, self.ALO, self.CV, self.StabSel]: if not type(obj) is str: string += obj.__repr__() + "\n" - + return string @@ -849,17 +844,17 @@ def __init__(self, matrices, param, formulation, numerical_method, label): out = pathlasso( matrices, - lambdas = param.lambdas, - n_active = param.n_active, - typ = name_formulation, - meth = numerical_method, - return_sigm = True, - rho = rho, - e = e, - rho_classification = rho_classification, - w = param.formulation.w, - intercept = param.formulation.intercept, - true_lam = not param.rescaled_lam + lambdas=param.lambdas, + n_active=param.n_active, + typ=name_formulation, + meth=numerical_method, + return_sigm=True, + rho=rho, + e=e, + rho_classification=rho_classification, + w=param.formulation.w, + intercept=param.formulation.intercept, + true_lam=not param.rescaled_lam, ) if formulation.concomitant: self.BETAS, self.LAMBDAS, self.SIGMAS = out @@ -882,7 +877,7 @@ def __repr__(self): if ( d > 20 ): # this trick is to plot only the biggest value, excluding the intercept - avg_betas = np.mean(abs(np.array(self.BETAS)), axis = 0) + avg_betas = np.mean(abs(np.array(self.BETAS)), axis=0) if self.formulation.intercept: avg_betas[0] = 0 # trick to exclude intercept in the graph string += "\n There is also an intercept. " @@ -901,15 +896,15 @@ def __repr__(self): else: xGraph = self.LAMBDAS xlabel = PATH_beta_path["xlabel"] - plt.figure(figsize = (10, 3), dpi = 80) + plt.figure(figsize=(10, 3), dpi=80) affichage( self.BETAS[:, top], xGraph, - labels = self.label[top], - naffichage = 5, - title = PATH_beta_path["title"] + self.formulation.name(), - xlabel = xlabel, - ylabel = PATH_beta_path["ylabel"], + labels=self.label[top], + naffichage=5, + title=PATH_beta_path["title"] + self.formulation.name(), + xlabel=xlabel, + ylabel=PATH_beta_path["ylabel"], ) plt.tight_layout() @@ -917,7 +912,7 @@ def __repr__(self): plt.savefig(self.save + "Beta-path") plt.show(block=False) if type(self.SIGMAS) != str and self.plot_sigma: - plt.figure(figsize = (10, 3), dpi = 80) + plt.figure(figsize=(10, 3), dpi=80) plt.plot(self.LAMBDAS, self.SIGMAS), plt.ylabel( PATH_sigma_path["ylabel"] ), plt.xlabel(PATH_sigma_path["xlabel"]) @@ -990,16 +985,16 @@ def __init__(self, matrices, param, formulation, numerical_method, label): (out, self.yGraph, self.standard_error, self.index_min, self.index_1SE,) = CV( matrices, param.Nsubset, - typ = name_formulation, - num_meth = numerical_method, - lambdas = param.lambdas, - seed = param.seed, - rho = rho, - rho_classification = rho_classification, - oneSE = param.oneSE, - e = e, - w = param.formulation.w, - intercept = param.formulation.intercept, + typ=name_formulation, + num_meth=numerical_method, + lambdas=param.lambdas, + seed=param.seed, + rho=rho, + rho_classification=rho_classification, + oneSE=param.oneSE, + e=e, + w=param.formulation.w, + intercept=param.formulation.intercept, ) self.xGraph = param.lambdas @@ -1016,7 +1011,7 @@ def __init__(self, matrices, param, formulation, numerical_method, label): abs(self.beta) > 1e-5 ) # boolean array, false iff beta_i = 0 self.refit = min_LS( - matrices, self.selected_param, intercept = self.formulation.intercept + matrices, self.selected_param, intercept=self.formulation.intercept ) self.time = time() - t0 self.save1 = False @@ -1033,7 +1028,7 @@ def __repr__(self): selected[0] = False string += "\n Intercept : " + str(self.refit[0]) - self.graphic(save = self.save1, logscale = self.logscale) + self.graphic(save=self.save1, logscale=self.logscale) nb_select = sum(selected) if nb_select > 10: @@ -1041,11 +1036,11 @@ def __repr__(self): top = np.sort(top) else: top = np.arange(nb_select) - plt.figure(figsize = (10, 3), dpi = 80) + plt.figure(figsize=(10, 3), dpi=80) plt.bar(range(nb_select), self.refit[selected]) plt.title(CV_beta["title"]) plt.xlabel(CV_beta["xlabel"]), plt.ylabel(CV_beta["ylabel"]) - plt.xticks(top, self.label[selected][top], rotation = 90) + plt.xticks(top, self.label[selected][top], rotation=90) plt.tight_layout() if type(self.save2) == str: plt.savefig(self.save2) @@ -1058,7 +1053,7 @@ def __repr__(self): string += "\n Running time : " + str(round(self.time, 3)) + "s" return string - def graphic(self, se_max = None, save = None, logscale = True, errorevery = 5): + def graphic(self, se_max=None, save=None, logscale=True, errorevery=5): """Method to plot the graphic showing mean squared error over along lambda path once cross validation is computed. Args: @@ -1073,7 +1068,7 @@ def graphic(self, se_max = None, save = None, logscale = True, errorevery = 5): Default value : None """ - plt.figure(figsize = (10, 3), dpi = 80) + plt.figure(figsize=(10, 3), dpi=80) i_min, i_1SE = self.index_min, self.index_1SE @@ -1091,31 +1086,31 @@ def graphic(self, se_max = None, save = None, logscale = True, errorevery = 5): np.log10(self.xGraph[jmin : jmax + 1]), self.yGraph[jmin : jmax + 1], self.standard_error[jmin : jmax + 1], - label = "mean over the k groups of data", - errorevery = errorevery, + label="mean over the k groups of data", + errorevery=errorevery, ) plt.xlabel(r"$ \log_{10} \lambda / \lambda_{max}$") plt.axvline( - x = np.log10(self.xGraph[i_min]), - color = "k", - label = r"$\lambda$ (min MSE)", + x=np.log10(self.xGraph[i_min]), + color="k", + label=r"$\lambda$ (min MSE)", ) plt.axvline( - x = np.log10(self.xGraph[i_1SE]), - color = "r", - label = r"$\lambda$ (1SE) ", + x=np.log10(self.xGraph[i_1SE]), + color="r", + label=r"$\lambda$ (1SE) ", ) else: plt.errorbar( self.xGraph[jmin : jmax + 1], self.yGraph[jmin : jmax + 1], self.standard_error[jmin : jmax + 1], - label = "mean over the k groups of data", - errorevery = errorevery, + label="mean over the k groups of data", + errorevery=errorevery, ) plt.xlabel(r"$\lambda / \lambda_{max}$") - plt.axvline(x = self.xGraph[i_min], color = "k", label = r"$\lambda$ (min MSE)") - plt.axvline(x = self.xGraph[i_1SE], color = "r", label = r"$\lambda$ (1SE) ") + plt.axvline(x=self.xGraph[i_min], color="k", label=r"$\lambda$ (min MSE)") + plt.axvline(x=self.xGraph[i_1SE], color="r", label=r"$\lambda$ (1SE) ") plt.title(CV_graph["title"]) if self.formulation.classification: plt.ylabel(CV_graph["ylabel_classification"]) @@ -1175,17 +1170,17 @@ def __init__(self, matrices, param, formulation, numerical_method, label): out = pathlasso( matrices, - lambdas = param.lambdas, - n_active = param.n_active, - typ = name_formulation, - meth = numerical_method, - return_sigm = True, - rho = rho, - e = e, - rho_classification = rho_classification, - w = param.formulation.w, - intercept = param.formulation.intercept, - true_lam = not param.rescaled_lam + lambdas=param.lambdas, + n_active=param.n_active, + typ=name_formulation, + meth=numerical_method, + return_sigm=True, + rho=rho, + e=e, + rho_classification=rho_classification, + w=param.formulation.w, + intercept=param.formulation.intercept, + true_lam=not param.rescaled_lam, ) if formulation.concomitant: self.BETAS, self.LAMBDAS, self.SIGMAS = out @@ -1198,21 +1193,21 @@ def __init__(self, matrices, param, formulation, numerical_method, label): self.method = numerical_method self.save = False self.label = label - - # ALO part, for + + # ALO part, for X, C, y = matrices if ( not formulation.concomitant and not formulation.classification - and not formulation.huber): + and not formulation.huber + ): self.mse, self.df = alo_classo_risk(X, C, y, self.BETAS) else: raise ValueError("ALO is implemented only for R1.") - self.time = time() - t0 def __repr__(self): @@ -1223,7 +1218,7 @@ def __repr__(self): if ( d > 20 ): # this trick is to plot only the biggest value, excluding the intercept - avg_betas = np.mean(abs(np.array(self.BETAS)), axis = 0) + avg_betas = np.mean(abs(np.array(self.BETAS)), axis=0) if self.formulation.intercept: avg_betas[0] = 0 # trick to exclude intercept in the graph string += "\n There is also an intercept. " @@ -1242,15 +1237,15 @@ def __repr__(self): else: xGraph = self.LAMBDAS xlabel = PATH_beta_path["xlabel"] - plt.figure(figsize = (10, 3), dpi = 80) + plt.figure(figsize=(10, 3), dpi=80) affichage( self.BETAS[:, top], xGraph, - labels = self.label[top], - naffichage = 5, - title = PATH_beta_path["title"] + self.formulation.name(), - xlabel = xlabel, - ylabel = PATH_beta_path["ylabel"], + labels=self.label[top], + naffichage=5, + title=PATH_beta_path["title"] + self.formulation.name(), + xlabel=xlabel, + ylabel=PATH_beta_path["ylabel"], ) plt.tight_layout() @@ -1258,7 +1253,7 @@ def __repr__(self): plt.savefig(self.save + "Beta-path") plt.show(block=False) if type(self.SIGMAS) != str and self.plot_sigma: - plt.figure(figsize = (10, 3), dpi = 80) + plt.figure(figsize=(10, 3), dpi=80) plt.plot(self.LAMBDAS, self.SIGMAS), plt.ylabel( PATH_sigma_path["ylabel"] ), plt.xlabel(PATH_sigma_path["xlabel"]) @@ -1318,8 +1313,8 @@ def __init__(self, matrices, param, formulation, numerical_method, label): param.numerical_method, "StabSel", param.formulation, - StabSelmethod = param.method, - lam = lam, + StabSelmethod=param.method, + lam=lam, ) param.numerical_method = numerical_method @@ -1334,22 +1329,22 @@ def __init__(self, matrices, param, formulation, numerical_method, label): # Compute the distribution output = stability( matrices, - StabSelmethod = param.method, - numerical_method = numerical_method, - lamin = param.lamin, - lam = lam, - Nlam = param.Nlam, - q = param.q, - B = param.B, - percent_nS = param.percent_nS, - formulation = name_formulation, - seed = param.seed, - rho = rho, - rho_classification = rho_classification, - true_lam = not param.rescaled_lam, - e = e, - w = param.formulation.w, - intercept = param.formulation.intercept, + StabSelmethod=param.method, + numerical_method=numerical_method, + lamin=param.lamin, + lam=lam, + Nlam=param.Nlam, + q=param.q, + B=param.B, + percent_nS=param.percent_nS, + formulation=name_formulation, + seed=param.seed, + rho=rho, + rho_classification=rho_classification, + true_lam=not param.rescaled_lam, + e=e, + w=param.formulation.w, + intercept=param.formulation.intercept, ) if param.method == "first": @@ -1370,7 +1365,7 @@ def __init__(self, matrices, param, formulation, numerical_method, label): self.refit = min_LS( matrices, self.selected_param, - intercept = param.formulation.intercept, + intercept=param.formulation.intercept, ) self.save1 = False self.save2 = False @@ -1394,7 +1389,7 @@ def __repr__(self): self.to_label = self.distribution > self.threshold_label D = self.distribution[top] - Dpath = self.distribution_path, + Dpath = (self.distribution_path,) selected = self.selected_param[top] unselected = [not i for i in selected] @@ -1403,29 +1398,29 @@ def __repr__(self): D[selected], D[unselected], ) - plt.figure(figsize = (10, 3), dpi = 80) + plt.figure(figsize=(10, 3), dpi=80) plt.bar( range(len(Dselected)), Dselected, - color = "r", - label = "selected coefficients", + color="r", + label="selected coefficients", ) plt.bar( range(len(Dunselected)), Dunselected, - color = "b", - label = "unselected coefficients", + color="b", + label="unselected coefficients", ) plt.axhline( - y = self.threshold, - color = "g", - label = "Threshold : thresh = " + str(self.threshold), + y=self.threshold, + color="g", + label="Threshold : thresh = " + str(self.threshold), ) plt.xticks( - ticks = np.where(self.to_label[top])[0], - labels = self.label[top][self.to_label[top]], - rotation = 30, + ticks=np.where(self.to_label[top])[0], + labels=self.label[top][self.to_label[top]], + rotation=30, ) plt.xlabel(StabSel_graph["xlabel"]), plt.ylabel( StabSel_graph["ylabel"] @@ -1467,7 +1462,7 @@ def __repr__(self): plt.show(block=False) """ - plt.figure(figsize = (10, 3), dpi = 80) + plt.figure(figsize=(10, 3), dpi=80) plt.bar(range(len(self.refit[top])), self.refit[top]) plt.xlabel(StabSel_beta["xlabel"]), plt.ylabel( StabSel_beta["ylabel"] @@ -1475,7 +1470,7 @@ def __repr__(self): plt.xticks( np.where(selected)[0], self.label[top][selected], - rotation = 30, + rotation=30, ) plt.tight_layout() if type(self.save2) == str: @@ -1529,7 +1524,7 @@ def __init__(self, matrices, param, formulation, numerical_method, label): # Algorithmic method choosing numerical_method = choose_numerical_method( - param.numerical_method, "LAM", param.formulation, lam = self.lam + param.numerical_method, "LAM", param.formulation, lam=self.lam ) param.numerical_method = numerical_method self.rescaled_lam = param.rescaled_lam @@ -1538,27 +1533,27 @@ def __init__(self, matrices, param, formulation, numerical_method, label): out = Classo( matrices, self.lam, - typ = name_formulation, - meth = numerical_method, - rho = rho, - get_lambdamax = True, - true_lam = not self.rescaled_lam, - e = e, - rho_classification = rho_classification, - w = param.formulation.w, - intercept = param.formulation.intercept, + typ=name_formulation, + meth=numerical_method, + rho=rho, + get_lambdamax=True, + true_lam=not self.rescaled_lam, + e=e, + rho_classification=rho_classification, + w=param.formulation.w, + intercept=param.formulation.intercept, ) if param.formulation.concomitant: self.lambdamax, self.beta, self.sigma = out else: self.lambdamax, self.beta = out - + if param.rescaled_lam: self.lamb = self.lambdamax * self.lam else: self.lamb = self.lam - + if param.threshold is None: param.threshold = np.mean(abs(self.beta)) @@ -1566,7 +1561,7 @@ def __init__(self, matrices, param, formulation, numerical_method, label): self.refit = min_LS( matrices, self.selected_param, - intercept = param.formulation.intercept, + intercept=param.formulation.intercept, ) self.label = label self.time = time() - t0 @@ -1582,21 +1577,20 @@ def __repr__(self): else: top = np.arange(d) - plt.figure(figsize = (10, 3), dpi = 80) + plt.figure(figsize=(10, 3), dpi=80) plt.bar(range(len(top)), self.beta[top]), plt.title( LAM_beta["title"] + str(round(self.lam, 3)) ), plt.xlabel(LAM_beta["xlabel"]), plt.ylabel(LAM_beta["ylabel"]) plt.xticks( np.where(self.selected_param[top])[0], self.label[top][self.selected_param[top]], - rotation = 30, + rotation=30, ) - + plt.tight_layout() if type(self.save) == str: plt.savefig(self.save) - - + plt.show(block=False) if self.formulation.concomitant: @@ -1610,7 +1604,7 @@ def __repr__(self): return string -def choose_numerical_method(method, model, formulation, StabSelmethod = None, lam = None): +def choose_numerical_method(method, model, formulation, StabSelmethod=None, lam=None): """Annex function in order to choose the right numerical method, if the given one is invalid. In general, it will choose one of the possible optimization scheme for a given formulation. When several computation modes are possible, the rules are as follow : @@ -1682,7 +1676,7 @@ def choose_numerical_method(method, model, formulation, StabSelmethod = None, la "title": r" ", "xlabel": r"$\lambda / \lambda_{max}$", "ylabel": r"Mean-Squared Error (MSE) ", - "ylabel_classification": r"Miss classification rate " + "ylabel_classification": r"Miss classification rate ", } LAM_beta = { "title": r"Coefficients at $\lambda$ = ", @@ -1719,4 +1713,4 @@ def choose_numerical_method(method, model, formulation, StabSelmethod = None, la "title": r" ", "xlabel": r"$\lambda / \lambda_{max}$", "ylabel": r"Approximation of Leave 1-out error (ALO) ", -} \ No newline at end of file +} diff --git a/classo/stability_selection.py b/classo/stability_selection.py index d762bcc..8f962b9 100644 --- a/classo/stability_selection.py +++ b/classo/stability_selection.py @@ -22,22 +22,22 @@ def stability( matrix, - StabSelmethod = "first", - numerical_method = "Path-Alg", - Nlam = 100, - lamin = 1e-2, - lam = 0.1, - q = 10, - B = 50, - percent_nS = 0.5, - formulation = "R1", - seed = 1, - rho = 1.345, - rho_classification = -1.0, - true_lam = False, - e = 1.0, - w = None, - intercept = False, + StabSelmethod="first", + numerical_method="Path-Alg", + Nlam=100, + lamin=1e-2, + lam=0.1, + q=10, + B=50, + percent_nS=0.5, + formulation="R1", + seed=1, + rho=1.345, + rho_classification=-1.0, + true_lam=False, + e=1.0, + w=None, + intercept=False, ): rd.seed(seed) @@ -60,26 +60,26 @@ def stability( BETA = np.array( pathlasso( submatrix, - lambdas = lambdas, - n_active = q + 1, - lamin = 0, - typ = formulation, - meth = numerical_method, - rho = rho, - rho_classification = rho_classification, - e = e * percent_nS, - w = w, - intercept = intercept, + lambdas=lambdas, + n_active=q + 1, + lamin=0, + typ=formulation, + meth=numerical_method, + rho=rho, + rho_classification=rho_classification, + e=e * percent_nS, + w=w, + intercept=intercept, )[0] ) distr_path = distr_path + (abs(BETA) >= 1e-5) pop = biggest_indexes(BETA[-1], q) - distribution[pop] +=1. + distribution[pop] += 1.0 # to do : output, instead of lambdas, the average aciv """ distr_path(lambda)_i = 1/B number of time where i is (among the q-first & activated before lambda) """ - #distribution = distr_path[-1] + # distribution = distr_path[-1] return (distribution * 1.0 / B, distr_path * 1.0 / B, lambdas) elif StabSelmethod == "lam": @@ -90,14 +90,14 @@ def stability( regress = Classo( submatrix, lam, - typ = formulation, - meth = numerical_method, - rho = rho, - rho_classification = rho_classification, - e = e * percent_nS, - true_lam = true_lam, - w = w, - intercept = intercept, + typ=formulation, + meth=numerical_method, + rho=rho, + rho_classification=rho_classification, + e=e * percent_nS, + true_lam=true_lam, + w=w, + intercept=intercept, ) if type(regress) == tuple: beta = regress[0] @@ -115,17 +115,17 @@ def stability( # compute the path until n_active = q, and only take the last Beta BETA = pathlasso( submatrix, - n_active = 0, - lambdas = lambdas, - typ = formulation, - meth = numerical_method, - rho = rho, - rho_classification = rho_classification, - e = e * percent_nS, - w = w, - intercept = intercept, + n_active=0, + lambdas=lambdas, + typ=formulation, + meth=numerical_method, + rho=rho, + rho_classification=rho_classification, + e=e * percent_nS, + w=w, + intercept=intercept, )[0] - betamax = np.amax(abs(np.array(BETA)), axis = 0) + betamax = np.amax(abs(np.array(BETA)), axis=0) qmax = biggest_indexes(betamax, q) for i in qmax: distribution[i] += 1 @@ -157,7 +157,6 @@ def biggest_indexes(array, q): return qbiggest - # for a certain threshold, it returns the features that should be selected def selected_param(distribution, threshold, threshold_label): selected, to_label = [False] * len(distribution), [False] * len(distribution) diff --git a/classo/tests/test_Classo_R.py b/classo/tests/test_Classo_R.py index c20f5e5..6d029af 100644 --- a/classo/tests/test_Classo_R.py +++ b/classo/tests/test_Classo_R.py @@ -2,12 +2,7 @@ import matplotlib.pyplot as plt from numpy.testing import assert_allclose -from ..compact_func import ( - pathlasso, - Classo -) - - +from ..compact_func import pathlasso, Classo from ..solve_R1 import problem_R1, Classo_R1 @@ -20,11 +15,11 @@ tol = 2e-2 m, d, d_nonzero, k, sigma = 30, 20, 5, 1, 0.5 -matrices, sol = random_data(m, d, d_nonzero, k, sigma, zerosum = True, seed = 42) +matrices, sol = random_data(m, d, d_nonzero, k, sigma, zerosum=True, seed=42) X, C, y = matrices -d1 = d//2 -w = np.array( [0.9]*d1 + [1.1]*(d-d1)) +d1 = d // 2 +w = np.array([0.9] * d1 + [1.1] * (d - d1)) """ @@ -33,33 +28,33 @@ def test_Classo_R1_all_methods_match(): - + lam = 0.05 pb = problem_R1(matrices, "Path-Alg") beta_ref = Classo_R1(pb, lam) pb = problem_R1(matrices, "DR") beta = Classo_R1(pb, lam) - print( np.sum(abs(beta_ref-beta)) / np.sum(abs(beta_ref)) ) + print(np.sum(abs(beta_ref - beta)) / np.sum(abs(beta_ref))) assert_allclose(beta_ref, beta, rtol=tol, atol=tol) - #assert np.sum(abs(beta_ref-beta)) / np.sum(abs(beta_ref)) < tol + # assert np.sum(abs(beta_ref-beta)) / np.sum(abs(beta_ref)) < tol pb = problem_R1(matrices, "P-PDS") beta = Classo_R1(pb, lam) - print( np.sum(abs(beta_ref-beta)) / np.sum(abs(beta_ref)) ) + print(np.sum(abs(beta_ref - beta)) / np.sum(abs(beta_ref))) assert_allclose(beta_ref, beta, rtol=tol, atol=tol) - #assert np.sum(abs(beta_ref-beta)) / np.sum(abs(beta_ref)) < tol - + # assert np.sum(abs(beta_ref-beta)) / np.sum(abs(beta_ref)) < tol pb = problem_R1(matrices, "PF-PDS") beta = Classo_R1(pb, lam) - print( np.sum(abs(beta_ref-beta)) / np.sum(abs(beta_ref)) ) + print(np.sum(abs(beta_ref - beta)) / np.sum(abs(beta_ref))) assert_allclose(beta_ref, beta, rtol=tol, atol=tol) - #assert np.sum(abs(beta_ref-beta)) / np.sum(abs(beta_ref)) < tol + # assert np.sum(abs(beta_ref-beta)) / np.sum(abs(beta_ref)) < tol + def test_Classo_R1_all_methods_match_no_C(): - matrices2, sol2 = random_data(m, d, d_nonzero, 0, sigma, zerosum = False, seed = 41) + matrices2, sol2 = random_data(m, d, d_nonzero, 0, sigma, zerosum=False, seed=41) lam = 0.05 pb = problem_R1(matrices2, "Path-Alg") @@ -67,42 +62,43 @@ def test_Classo_R1_all_methods_match_no_C(): pb = problem_R1(matrices2, "DR") beta = Classo_R1(pb, lam) - print( np.sum(abs(beta_ref-beta)) / np.sum(abs(beta_ref)) ) + print(np.sum(abs(beta_ref - beta)) / np.sum(abs(beta_ref))) assert_allclose(beta_ref, beta, rtol=tol, atol=tol) - #assert np.sum(abs(beta_ref-beta)) / np.sum(abs(beta_ref)) < tol + # assert np.sum(abs(beta_ref-beta)) / np.sum(abs(beta_ref)) < tol pb = problem_R1(matrices2, "P-PDS") beta = Classo_R1(pb, lam) - print( np.sum(abs(beta_ref-beta)) / np.sum(abs(beta_ref)) ) + print(np.sum(abs(beta_ref - beta)) / np.sum(abs(beta_ref))) assert_allclose(beta_ref, beta, rtol=tol, atol=tol) - #assert np.sum(abs(beta_ref-beta)) / np.sum(abs(beta_ref)) < tol + # assert np.sum(abs(beta_ref-beta)) / np.sum(abs(beta_ref)) < tol + def test_Classo_R2_all_methods_match(): rho = 1.345 lam = 0.1 - - + pb = problem_R2(matrices, "Path-Alg", rho) beta_ref = Classo_R2(pb, lam) pb = problem_R2(matrices, "DR", rho) beta = Classo_R2(pb, lam) - print( np.sum(abs(beta_ref-beta)) / np.sum(abs(beta_ref)) ) + print(np.sum(abs(beta_ref - beta)) / np.sum(abs(beta_ref))) assert_allclose(beta_ref, beta, rtol=tol, atol=tol) pb = problem_R2(matrices, "P-PDS", rho) beta = Classo_R2(pb, lam) - print( np.sum(abs(beta_ref-beta)) / np.sum(abs(beta_ref)) ) + print(np.sum(abs(beta_ref - beta)) / np.sum(abs(beta_ref))) assert_allclose(beta_ref, beta, rtol=tol, atol=tol) pb = problem_R2(matrices, "PF-PDS", rho) beta = Classo_R2(pb, lam) - print( np.sum(abs(beta_ref-beta)) / np.sum(abs(beta_ref)) ) + print(np.sum(abs(beta_ref - beta)) / np.sum(abs(beta_ref))) assert_allclose(beta_ref, beta, rtol=tol, atol=tol) + def test_Classo_R3_all_methods_match(): - + lam = 0.1 pb = problem_R3(matrices, "Path-Alg") beta_ref, s_ref = Classo_R3(pb, lam) @@ -110,10 +106,10 @@ def test_Classo_R3_all_methods_match(): pb = problem_R3(matrices, "DR") beta, s = Classo_R3(pb, lam) - - print( np.sum(abs(beta_ref-beta)) / np.sum(abs(beta_ref)) ) + print(np.sum(abs(beta_ref - beta)) / np.sum(abs(beta_ref))) assert_allclose(beta_ref, beta, rtol=tol, atol=tol) - #assert np.sum(abs(beta_ref-beta)) / np.sum(abs(beta_ref)) < tol + # assert np.sum(abs(beta_ref-beta)) / np.sum(abs(beta_ref)) < tol + def test_Classo_R4_all_methods_match(): @@ -125,16 +121,16 @@ def test_Classo_R4_all_methods_match(): pb = problem_R4(matrices, "DR", rho) beta, s = Classo_R4(pb, lam) - print( np.sum(abs(beta_ref-beta)) / np.sum(abs(beta_ref)) ) + print(np.sum(abs(beta_ref - beta)) / np.sum(abs(beta_ref))) assert_allclose(beta_ref, beta, rtol=tol, atol=tol) - #assert np.sum(abs(beta_ref-beta)) / np.sum(abs(beta_ref)) < tol + # assert np.sum(abs(beta_ref-beta)) / np.sum(abs(beta_ref)) < tol def test_Classo_lam_null(): lam = 0.0 rho = 1.345 m, d, d_nonzero, k, sigma = 10, 20, 4, 1, 0.5 - matrices, sol = random_data(m, d, d_nonzero, k, sigma, zerosum = True, seed = 42) + matrices, sol = random_data(m, d, d_nonzero, k, sigma, zerosum=True, seed=42) X, C, y = matrices pb1 = problem_R1(matrices, "Path-Alg") diff --git a/classo/tests/test_compact_func.py b/classo/tests/test_compact_func.py index 2cd9097..e9ba624 100644 --- a/classo/tests/test_compact_func.py +++ b/classo/tests/test_compact_func.py @@ -1,60 +1,62 @@ import numpy as np from numpy.testing import assert_allclose -from ..compact_func import ( - pathlasso, - Classo -) +from ..compact_func import pathlasso, Classo from ..misc_functions import random_data tol = 1e-2 m, d, d_nonzero, k, sigma = 30, 20, 5, 1, 0.5 -(X, C, y), sol = random_data(m, d, d_nonzero, k, sigma, zerosum = True, seed = 42) +(X, C, y), sol = random_data(m, d, d_nonzero, k, sigma, zerosum=True, seed=42) -d1 = d//2 -w = np.array( [0.9]*d1 + [1.1]*(d-d1)) +d1 = d // 2 +w = np.array([0.9] * d1 + [1.1] * (d - d1)) """ Test of Classo """ + def test_Classo_R2_is_R1_for_high_rho(): - beta1 = Classo((X, C, y), 0.1, typ = "R1", meth="Path-Alg", intercept=True) + beta1 = Classo((X, C, y), 0.1, typ="R1", meth="Path-Alg", intercept=True) - beta2 = Classo((X, C, y), 0.1, typ = "R2", meth="Path-Alg", intercept=True, rho=100) + beta2 = Classo((X, C, y), 0.1, typ="R2", meth="Path-Alg", intercept=True, rho=100) assert_allclose(beta1, beta2, rtol=tol, atol=tol) + def test_Classo_R2_is_R1_for_high_rho_w(): - _, beta1 = Classo((X, C, y), 0.1, typ = "R1", meth="Path-Alg", get_lambdamax=True, w=w) + _, beta1 = Classo( + (X, C, y), 0.1, typ="R1", meth="Path-Alg", get_lambdamax=True, w=w + ) - beta2 = Classo((X, C, y), 0.1, typ = "R2", meth="Path-Alg", w=w, rho=100) + beta2 = Classo((X, C, y), 0.1, typ="R2", meth="Path-Alg", w=w, rho=100) assert_allclose(beta1, beta2, rtol=tol, atol=tol) def test_Classo_R4_is_R3_for_high_rho(): - beta1 = Classo((X, C, y), 0.1, e=20, typ = "R3", meth="Path-Alg", return_sigm=False) + beta1 = Classo((X, C, y), 0.1, e=20, typ="R3", meth="Path-Alg", return_sigm=False) + + beta2 = Classo( + (X, C, y), 0.1, e=20, typ="R4", meth="Path-Alg", rho=100, return_sigm=False + ) - beta2 = Classo((X, C, y), 0.1, e=20, typ = "R4", meth="Path-Alg", rho=100, return_sigm=False) - assert_allclose(beta1, beta2, rtol=tol, atol=tol) def test_Classo_C2_is_C1_for_high_rho(): - - beta1 = Classo((X, C, np.sign(y)), 0.1, typ = "C1") - beta2 = Classo((X, C, np.sign(y)), 0.1, typ = "C2", rho=-100) + beta1 = Classo((X, C, np.sign(y)), 0.1, typ="C1") - assert_allclose(beta1, beta2, rtol=tol, atol=tol) + beta2 = Classo((X, C, np.sign(y)), 0.1, typ="C2", rho=-100) + assert_allclose(beta1, beta2, rtol=tol, atol=tol) """ @@ -64,22 +66,26 @@ def test_Classo_C2_is_C1_for_high_rho(): def test_pathlasso_R1(): for meth in ["Path-Alg", "DR", "P-PDS", "PF-PDS"]: - aux_test_pathlasso((X,C,y), "R1", meth) + aux_test_pathlasso((X, C, y), "R1", meth) def test_pathlasso_R2(): for meth in ["Path-Alg", "DR", "P-PDS", "PF-PDS"]: - aux_test_pathlasso((X,C,y), "R2", meth, tole=1e-1) + aux_test_pathlasso((X, C, y), "R2", meth, tole=1e-1) + def test_pathlasso_R3(): for meth in ["Path-Alg", "DR"]: - aux_test_pathlasso((X,C,y), "R3", meth, tole=1) + aux_test_pathlasso((X, C, y), "R3", meth, tole=1) + def test_pathlasso_R4(): - aux_test_pathlasso((X,C,y), "R4", "DR") + aux_test_pathlasso((X, C, y), "R4", "DR") + def test_pathlasso_C1(): - aux_test_pathlasso((X,C,np.sign(y)), "C1", "Path-Alg") + aux_test_pathlasso((X, C, np.sign(y)), "C1", "Path-Alg") + def test_pathlasso_C2(): aux_test_pathlasso((X, C, np.sign(y)), "C2", "Path-Alg") @@ -88,20 +94,22 @@ def test_pathlasso_C2(): def aux_test_pathlasso(matrix, typ, meth, tole=tol): betas, lambdas = pathlasso(matrix, typ=typ, meth=meth) - i = len(lambdas)//2 - lamb = lambdas[i] + i = len(lambdas) // 2 + lamb = lambdas[i] beta_1 = betas[i] beta_2 = Classo(matrix, lamb, typ=typ, true_lam=True, return_sigm=False, meth=meth) assert_allclose(beta_1, beta_2, rtol=tole, atol=tole) + def test_pathlasso_R1_nactive(): for meth in ["Path-Alg", "DR", "P-PDS", "PF-PDS"]: betas, lambdas = pathlasso((X, C, y), typ="R1", meth=meth, n_active=5) beta = betas[-1] assert sum(abs(beta) >= 1e-2) <= 6 + def test_pathlasso_R2_nactive(): for meth in ["Path-Alg", "DR", "P-PDS", "PF-PDS"]: betas, lambdas = pathlasso((X, C, y), typ="R2", meth=meth, n_active=5) beta = betas[-1] - assert sum(abs(beta) >= 1e-2) <= 6 \ No newline at end of file + assert sum(abs(beta) >= 1e-2) <= 6 diff --git a/classo/tests/test_cross_validation.py b/classo/tests/test_cross_validation.py index 2672416..7b3e8a2 100644 --- a/classo/tests/test_cross_validation.py +++ b/classo/tests/test_cross_validation.py @@ -10,7 +10,7 @@ average_test, accuracy_func, misclassification_rate, - CV + CV, ) @@ -18,19 +18,18 @@ def test_train_test_CV_non_divisible(): n = 16 k = 3 - result = train_test_CV(n,k) - + result = train_test_CV(n, k) assert len(result) == k assert len(result[0]) == 5 and len(result[1]) == 5 and len(result[2]) == 6 assert set(np.concatenate(result)) == set(range(n)) + def test_train_test_CV_divisible(): n = 15 k = 3 - result = train_test_CV(n,k) - + result = train_test_CV(n, k) assert len(result) == k assert len(result[0]) == 5 and len(result[1]) == 5 and len(result[2]) == 5 @@ -38,7 +37,7 @@ def test_train_test_CV_divisible(): def test_train_test_i(): - value = [[1,0],[3,4],[2,5]] + value = [[1, 0], [3, 4], [2, 5]] i = 1 result1, result2 = train_test_i(value, i) @@ -51,42 +50,42 @@ def test_train_test_i(): assert len(result2) == len(exp2) assert set(exp2) == set(result2) + def test_accuracy_func(): - A = np.eye(5)*2 - y = np.array([2, 5, 2, 4, -1])*2 + 20 - beta = np.array([20, 2, 5, 2, 4,-1]) + np.array([0, 1, 1, 1, 1, 1])/2 + A = np.eye(5) * 2 + y = np.array([2, 5, 2, 4, -1]) * 2 + 20 + beta = np.array([20, 2, 5, 2, 4, -1]) + np.array([0, 1, 1, 1, 1, 1]) / 2 - result = accuracy_func( - A, y, beta, typ="R1", intercept=True - ) + result = accuracy_func(A, y, beta, typ="R1", intercept=True) exp = 1 assert np.isclose(result, exp) + def test_misclassification_rate(): - A = np.diag([2, 5, -0.4, -2., -6., -1]) + A = np.diag([2, 5, -0.4, -2.0, -6.0, -1]) y = np.array([1, -1, 1, -1, 1, -1]) - beta = np.array([0.004, 2.4, -0.1, 1, -0.3, 15.]) + beta = np.array([0.004, 2.4, -0.1, 1, -0.3, 15.0]) result = misclassification_rate(A, y, beta) - exp = 1/6 + exp = 1 / 6 assert np.isclose(result, exp) def test_CV(): - - lambdas= np.array([1., 0.1, 0.01, 0.005, 0.0001]) + + lambdas = np.array([1.0, 0.1, 0.01, 0.005, 0.0001]) y_rand = np.random.randint(20, size=(10, 1)) - X = np.concatenate([np.eye(10)]*10 , axis=0) - y = np.concatenate([y_rand]*10, axis=0)[:, 0] + X = np.concatenate([np.eye(10)] * 10, axis=0) + y = np.concatenate([y_rand] * 10, axis=0)[:, 0] - matrices = (X, np.zeros((1,10)) , y) + matrices = (X, np.zeros((1, 10)), y) out, MSE, SE, i, i_1SE = CV(matrices, 3, intercept=False, lambdas=lambdas) print(y[:10]) print(out, MSE[i]) print(i, i_1SE) - assert i == len(lambdas)-1 + assert i == len(lambdas) - 1 diff --git a/classo/tests/test_misc_func.py b/classo/tests/test_misc_func.py index d043ab1..538230a 100644 --- a/classo/tests/test_misc_func.py +++ b/classo/tests/test_misc_func.py @@ -13,164 +13,191 @@ influence, check_size, random_data, - clr + clr, ) -from ..path_alg import ( - next_idr2, - next_idr1 -) +from ..path_alg import next_idr2, next_idr1 tol = 1e-5 def test_theoretical_lam(): - l1 = theoretical_lam(10,20) - l2 = theoretical_lam(40,100) - l3 = theoretical_lam(67,10) + l1 = theoretical_lam(10, 20) + l2 = theoretical_lam(40, 100) + l3 = theoretical_lam(67, 10) for l in [l1, l2, l3]: assert l <= 1 assert l > 0 + def test_min_LS_no_intercept(): np.random.seed(123) for i in range(5): np.random.rand - A = np.random.randint(10,size=(20, 50)) - C = np.ones((1,50)) - selected = np.random.rand(50)>0.5 - beta = np.random.randint(10,size=50) + A = np.random.randint(10, size=(20, 50)) + C = np.ones((1, 50)) + selected = np.random.rand(50) > 0.5 + beta = np.random.randint(10, size=50) beta = beta - np.mean(beta[selected]) y = A[:, selected].dot(beta[selected]) - result = min_LS((A,C,y), selected, intercept = False) + result = min_LS((A, C, y), selected, intercept=False) assert_allclose(A.dot(result), y, rtol=tol, atol=tol) + def test_min_LS_intercept(): np.random.seed(123) for i in range(5): - A = np.random.randint(10,size=(20, 50)) - C = np.ones((1,50)) - selected = np.random.rand(51)>0.5 - beta = np.random.randint(10,size=50) + A = np.random.randint(10, size=(20, 50)) + C = np.ones((1, 50)) + selected = np.random.rand(51) > 0.5 + beta = np.random.randint(10, size=50) beta = beta - np.mean(beta) - y = A[:, selected[1:]].dot(beta[selected[1:]]) + 10. + y = A[:, selected[1:]].dot(beta[selected[1:]]) + 10.0 - result = min_LS((A,C,y), selected, intercept = True) + result = min_LS((A, C, y), selected, intercept=True) + + assert_allclose(A.dot(result[1:]) + result[0], y, rtol=tol, atol=tol) - assert_allclose(A.dot(result[1:])+result[0], y, rtol=tol, atol=tol) def test_affichage(): nlam = 12 d = 20 - lBeta = np.random.randint(4, size=(nlam, d) ) - path = np.linspace(1., 0., nlam) - labels=['a'+str(i) for i in range(d)] + lBeta = np.random.randint(4, size=(nlam, d)) + path = np.linspace(1.0, 0.0, nlam) + labels = ["a" + str(i) for i in range(d)] affichage(lBeta, path, labels=labels) + def test_affichage_nolab(): nlam = 12 d = 20 - lBeta = np.random.randint(4, size=(nlam, d) ) - path = np.linspace(1., 0., nlam) - labels=['a'+str(i) for i in range(d)] + lBeta = np.random.randint(4, size=(nlam, d)) + path = np.linspace(1.0, 0.0, nlam) + labels = ["a" + str(i) for i in range(d)] affichage(lBeta, path, labels=False) def test_influence(): nlam = 13 - betas = np.array([[1, 0, 3 ,4, -3, 0, 0, 0, -6]]*nlam) + betas = np.array([[1, 0, 3, 4, -3, 0, 0, 0, -6]] * nlam) ntop = 4 exp = np.array([2, 3, 4, 8]) value = influence(betas, ntop) - assert np.all( value == exp ) + assert np.all(value == exp) + def test_check_size_all_is_good(): n, d, k = 30, 68, 5 - X = np.ones((n,d)) + X = np.ones((n, d)) y = np.ones(n) - C = np.ones((k,d)) + C = np.ones((k, d)) Xv, yv, Cv = check_size(X, y, C) - assert np.all( X == Xv ) - assert np.all( y == yv ) - assert np.all( C == Cv ) + assert np.all(X == Xv) + assert np.all(y == yv) + assert np.all(C == Cv) + def test_check_size_wrong(): n, d, k = 30, 68, 5 - X = np.ones((n,d)) + X = np.ones((n, d)) y = np.ones(n) - C = np.zeros((k,d)) - - Cwrong = C[:, :d-3] + C = np.zeros((k, d)) + + Cwrong = C[:, : d - 3] Xv, yv, Cv = check_size(X, y, Cwrong) - assert np.all( X == Xv ) - assert np.all( y == yv ) - assert np.all( C == Cv ) + assert np.all(X == Xv) + assert np.all(y == yv) + assert np.all(C == Cv) + def test_check_size_wrong2(): n, d, k = 45, 49, 2 - Xwrong = np.ones((n+4,d)) + Xwrong = np.ones((n + 4, d)) y = np.ones(n) - Cwrong = np.zeros((k,d+3)) - + Cwrong = np.zeros((k, d + 3)) + X = Xwrong[:n] C = Cwrong[:, :d] Xv, yv, Cv = check_size(Xwrong, y, Cwrong) - assert np.all( X == Xv ) - assert np.all( y == yv ) - assert np.all( C == Cv ) + assert np.all(X == Xv) + assert np.all(y == yv) + assert np.all(C == Cv) + def test_check_size_wrong3(): n, d = 45, 20 - X = np.ones((n,d)) - ywrong = np.ones(n+5) + X = np.ones((n, d)) + ywrong = np.ones(n + 5) C = np.ones((1, d)) y = ywrong[:n] Xv, yv, Cv = check_size(X, ywrong, None) - assert np.all( X == Xv ) - assert np.all( y == yv ) - assert np.all( C == Cv ) + assert np.all(X == Xv) + assert np.all(y == yv) + assert np.all(C == Cv) + def test_random_data_C(): n = 15 d = 20 d_nonzero = 7 k = 6 - sigma = 0. + sigma = 0.0 intercept = 0.2 - + for seed in range(10): - mat, sol = random_data(n, d, d_nonzero, k, sigma, zerosum=False, A=np.eye(d), seed=seed, intercept=intercept) + mat, sol = random_data( + n, + d, + d_nonzero, + k, + sigma, + zerosum=False, + A=np.eye(d), + seed=seed, + intercept=intercept, + ) X, C, y = mat - assert_allclose(C.dot(sol), 0., rtol=tol, atol=tol) + assert_allclose(C.dot(sol), 0.0, rtol=tol, atol=tol) assert_allclose(X.dot(sol) + intercept, y, rtol=tol, atol=tol) assert len(C) == k + def test_random_data_zerosum(): n = 65 d = 62 d_nonzero = 6 k = 0 - sigma = 1. - - mat, sol = random_data(n, d, d_nonzero, k, sigma, zerosum=True, exp=True, classification=True, seed=None) + sigma = 1.0 + + mat, sol = random_data( + n, + d, + d_nonzero, + k, + sigma, + zerosum=True, + exp=True, + classification=True, + seed=None, + ) X, C, y = mat assert set(y).issubset({1, -1}) - assert np.isclose(np.mean(sol), 0., atol=tol) + assert np.isclose(np.mean(sol), 0.0, atol=tol) def test_random_data_C_big_k(): @@ -179,24 +206,23 @@ def test_random_data_C_big_k(): d_nonzero = 10 k = 10 - mat, sol = random_data(n, d, d_nonzero, k, 0., zerosum=False, A=np.eye(d), seed=0) + mat, sol = random_data(n, d, d_nonzero, k, 0.0, zerosum=False, A=np.eye(d), seed=0) X, C, y = mat - assert_allclose(C.dot(sol), 0., rtol=tol, atol=tol) + assert_allclose(C.dot(sol), 0.0, rtol=tol, atol=tol) assert_allclose(X.dot(sol), y, rtol=tol, atol=tol) assert len(C) == k - def test_clr(): X = np.eye(10) Z = clr(X) - assert_allclose(np.mean(Z, axis=0), 0., rtol=tol, atol=tol) + assert_allclose(np.mean(Z, axis=0), 0.0, rtol=tol, atol=tol) def test_next_idr1(): mat = np.eye(5) - l = [True]*5 + l = [True] * 5 exp = 4 l[exp] = False value = next_idr1(l, mat) @@ -205,8 +231,8 @@ def test_next_idr1(): def test_next_idr2(): mat = np.eye(5) - l = [True]*5 + l = [True] * 5 exp = 2 - mat[exp, exp] = 0. - value = next_idr2(l,mat) - assert exp == value \ No newline at end of file + mat[exp, exp] = 0.0 + value = next_idr2(l, mat) + assert exp == value diff --git a/classo/tests/test_solver.py b/classo/tests/test_solver.py index 8a62607..225737e 100644 --- a/classo/tests/test_solver.py +++ b/classo/tests/test_solver.py @@ -9,11 +9,10 @@ # that the solver runs. without any error. - # we create a classo instance for tests. m, d, d_nonzero, k, sigma = 30, 20, 5, 1, 0.5 -(X, C, y), sol = random_data(m, d, d_nonzero, k, sigma, zerosum = True, seed = 42) +(X, C, y), sol = random_data(m, d, d_nonzero, k, sigma, zerosum=True, seed=42) y = y + 0.2 print(np.nonzero(sol)[0]) close = True @@ -22,9 +21,11 @@ def test_solve_PATH_StabSel_R1(): aux_test_solve_PATH_StabSel((False, False, False), close_window=close) + def test_solve_PATH_StabSel_C1(): aux_test_solve_PATH_StabSel((False, False, True), close_window=close) + def test_solve_PATH_StabSel_R3(): aux_test_solve_PATH_StabSel((True, False, False), close_window=close) @@ -32,12 +33,15 @@ def test_solve_PATH_StabSel_R3(): def test_solve_PATH_CV_R1(): aux_test_solve_PATH_CV((False, False, False), close_window=close) + def test_solve_PATH_CV_R2(): aux_test_solve_PATH_CV((False, True, False), close_window=close) + def test_solve_PATH_CV_R3(): aux_test_solve_PATH_CV((True, False, False), close_window=close) + def test_solve_PATH_CV_C2(): aux_test_solve_PATH_CV((False, True, True), close_window=close) @@ -45,28 +49,31 @@ def test_solve_PATH_CV_C2(): def test_solve_PATH_LAMfixed_R1(): aux_test_solve_PATH_LAMfixed((False, False, False), close_window=close) + def test_solve_PATH_LAMfixed_R3(): aux_test_solve_PATH_LAMfixed((True, False, False), close_window=close) + def test_solve_PATH_LAMfixed_R4(): aux_test_solve_PATH_LAMfixed((True, True, False), close_window=close) - def test_solve_CV_StabSel_R1(): aux_test_solve_CV_StabSel((False, False, False), close_window=close) + def test_solve_CV_StabSel_R3(): aux_test_solve_CV_StabSel((True, False, False), close_window=close) - def test_solve_CV_LAMfixed_R1(): aux_test_solve_CV_LAMfixed((False, False, False), close_window=close) + def test_solve_CV_LAMfixed_R4(): aux_test_solve_CV_LAMfixed((True, True, False), close_window=close) + def test_solve_CV_LAMfixed_C1(): aux_test_solve_CV_LAMfixed((False, False, True), close_window=close) @@ -74,23 +81,24 @@ def test_solve_CV_LAMfixed_C1(): def test_solve_StabSel_LAMfixed_R2(): aux_test_solve_StabSel_LAMfixed((False, False, False), close_window=close) + def test_solve_StabSel_LAMfixed_R2(): aux_test_solve_StabSel_LAMfixed((False, True, False), close_window=close) + def test_solve_StabSel_LAMfixed_C2(): aux_test_solve_StabSel_LAMfixed((False, True, True), close_window=close) - def aux_test_solve_PATH_StabSel(mode, close_window=True): if mode[2]: yy = np.sign(y) else: - yy = y - pb = classo_problem(X, yy, C= C, label=['label1', 'label2']) + yy = y + pb = classo_problem(X, yy, C=C, label=["label1", "label2"]) pb.formulation.intercept = False - pb.formulation.w = np.array([1.1]*(d//3)+ [1.]*(d-d//3)) - #pb.formulation.rho = 10. + pb.formulation.w = np.array([1.1] * (d // 3) + [1.0] * (d - d // 3)) + # pb.formulation.rho = 10. pb.formulation.concomitant = mode[0] pb.formulation.huber = mode[1] @@ -98,7 +106,7 @@ def aux_test_solve_PATH_StabSel(mode, close_window=True): pb.model_selection.PATH = True pb.model_selection.CV = False - pb.model_selection.StabSel = True + pb.model_selection.StabSel = True pb.model_selection.LAMfixed = False param = pb.model_selection.PATHparameters @@ -109,31 +117,32 @@ def aux_test_solve_PATH_StabSel(mode, close_window=True): param = pb.model_selection.StabSelparameters param.seed = 42 - param.method = 'first' + param.method = "first" param.B = 5 param.q = 5 param.percent_nS = 0.5 param.lamin = 0.01 param.hd = False - param.lam = 'theoretical' + param.lam = "theoretical" param.rescaled_lam = False - param.threshold = 0.7 + param.threshold = 0.7 param.threshold_label = 0.2 print(pb) pb.solve() print(pb, pb.solution) - + if close_window: plt.close("all") - + + def aux_test_solve_PATH_CV(mode, close_window=True): if mode[2]: yy = np.sign(y) else: - yy = y - pb = classo_problem(X, yy, C= C) + yy = y + pb = classo_problem(X, yy, C=C) pb.formulation.intercept = True pb.formulation.concomitant = mode[0] @@ -142,12 +151,12 @@ def aux_test_solve_PATH_CV(mode, close_window=True): pb.model_selection.PATH = True pb.model_selection.CV = True - pb.model_selection.StabSel = False + pb.model_selection.StabSel = False pb.model_selection.LAMfixed = False param = pb.model_selection.PATHparameters param.n_active = 0 - param.lambdas = np.linspace(10., 1e-1, 10) + param.lambdas = np.linspace(10.0, 1e-1, 10) param.rescaled_lam = False param = pb.model_selection.CVparameters @@ -165,12 +174,13 @@ def aux_test_solve_PATH_CV(mode, close_window=True): if close_window: plt.close("all") + def aux_test_solve_PATH_LAMfixed(mode, close_window=True): if mode[2]: yy = np.sign(y) else: - yy = y - pb = classo_problem(X, yy, C= C) + yy = y + pb = classo_problem(X, yy, C=C) pb.formulation.intercept = False pb.formulation.concomitant = mode[0] @@ -181,7 +191,7 @@ def aux_test_solve_PATH_LAMfixed(mode, close_window=True): pb.model_selection.PATH = True pb.model_selection.CV = False - pb.model_selection.StabSel = False + pb.model_selection.StabSel = False pb.model_selection.LAMfixed = True param = pb.model_selection.PATHparameters @@ -191,7 +201,7 @@ def aux_test_solve_PATH_LAMfixed(mode, close_window=True): param.logscale = False param = pb.model_selection.LAMfixedparameters - param.lam = 'theoretical' + param.lam = "theoretical" param.rescaled_lam = False print(pb) pb.solve() @@ -201,12 +211,13 @@ def aux_test_solve_PATH_LAMfixed(mode, close_window=True): if close_window: plt.close("all") + def aux_test_solve_CV_StabSel(mode, close_window=True): if mode[2]: yy = np.sign(y) else: - yy = y - pb = classo_problem(X, yy, C= C) + yy = y + pb = classo_problem(X, yy, C=C) pb.formulation.intercept = False pb.formulation.concomitant = mode[0] @@ -215,7 +226,7 @@ def aux_test_solve_CV_StabSel(mode, close_window=True): pb.model_selection.PATH = False pb.model_selection.CV = True - pb.model_selection.StabSel = True + pb.model_selection.StabSel = True pb.model_selection.LAMfixed = False param = pb.model_selection.CVparameters @@ -228,34 +239,35 @@ def aux_test_solve_CV_StabSel(mode, close_window=True): param = pb.model_selection.StabSelparameters param.seed = None - param.method = 'max' + param.method = "max" param.B = 70 param.q = 20 param.percent_nS = 0.8 param.lamin = 0.01 param.hd = False - param.lam = 'theoretical' + param.lam = "theoretical" param.rescaled_lam = False - param.threshold = 0.5 + param.threshold = 0.5 param.threshold_label = 0.2 print(pb) pb.solve() print(pb, pb.solution) - pb.solution.CV.graphic(se_max = 5) + pb.solution.CV.graphic(se_max=5) if close_window: plt.close("all") + def aux_test_solve_CV_LAMfixed(mode, close_window=True): if mode[2]: yy = np.sign(y) else: - yy = y + yy = y - pb = classo_problem(X, yy, C= C) + pb = classo_problem(X, yy, C=C) pb.formulation.intercept = True - pb.formulation.w = np.array([1.1]*20+ [1.]*(d-20)) + pb.formulation.w = np.array([1.1] * 20 + [1.0] * (d - 20)) pb.formulation.concomitant = mode[0] pb.formulation.huber = mode[1] @@ -263,14 +275,14 @@ def aux_test_solve_CV_LAMfixed(mode, close_window=True): pb.model_selection.PATH = False pb.model_selection.CV = True - pb.model_selection.StabSel = False + pb.model_selection.StabSel = False pb.model_selection.LAMfixed = True param = pb.model_selection.CVparameters param.seed = 2 param.Nsubset = 3 param.oneSE = True - param.lambdas = np.linspace(1., 1e-1, 20) + param.lambdas = np.linspace(1.0, 1e-1, 20) param = pb.model_selection.LAMfixedparameters param.lam = 0.1 @@ -283,13 +295,14 @@ def aux_test_solve_CV_LAMfixed(mode, close_window=True): if close_window: plt.close("all") + def aux_test_solve_StabSel_LAMfixed(mode, close_window=True): if mode[2]: yy = np.sign(y) else: - yy = y + yy = y - pb = classo_problem(X, yy, C= C) + pb = classo_problem(X, yy, C=C) pb.formulation.intercept = True pb.formulation.concomitant = mode[0] @@ -298,24 +311,24 @@ def aux_test_solve_StabSel_LAMfixed(mode, close_window=True): pb.model_selection.PATH = False pb.model_selection.CV = False - pb.model_selection.StabSel = True + pb.model_selection.StabSel = True pb.model_selection.LAMfixed = True param = pb.model_selection.StabSelparameters param.seed = None - param.method = 'lam' + param.method = "lam" param.B = 50 param.q = 1000 param.percent_nS = 0.4 - param.lamin = 1. + param.lamin = 1.0 param.hd = False - param.lam = 'theoretical' + param.lam = "theoretical" param.rescaled_lam = False param.threshold = 0.8 param.threshold_label = 0.2 param = pb.model_selection.LAMfixedparameters - param.lam = 0. + param.lam = 0.0 param.rescaled_lam = False print(pb) pb.solve() @@ -326,20 +339,19 @@ def aux_test_solve_StabSel_LAMfixed(mode, close_window=True): plt.close("all") - - def test_choose_numerical_method_R4DR(): formulation = Formulation() formulation.huber = True - for model in ['PATH', 'StabSel']: - value = choose_numerical_method('DR', model, formulation) - assert value == 'DR' + for model in ["PATH", "StabSel"]: + value = choose_numerical_method("DR", model, formulation) + assert value == "DR" + def test_choose_numerical_method_2(): formulation = Formulation() - + for meth in ["Path-Alg", "DR", "P-PDS", "PF-PDS"]: - for model in ['PATH', 'StabSel']: + for model in ["PATH", "StabSel"]: formulation.concomitant = False formulation.classification = False formulation.huber = False @@ -350,4 +362,4 @@ def test_choose_numerical_method_2(): assert value == meth formulation.classification = True value = choose_numerical_method(meth, model, formulation) - assert value == 'Path-Alg' \ No newline at end of file + assert value == "Path-Alg" diff --git a/classo/tests/test_stability_selection.py b/classo/tests/test_stability_selection.py index 2bfb931..16debd4 100644 --- a/classo/tests/test_stability_selection.py +++ b/classo/tests/test_stability_selection.py @@ -11,9 +11,10 @@ biggest_indexes, selected_param, build_submatrix, - build_subset + build_subset, ) + def test_biggest_indexes_empty(): value = np.array([], dtype=float) exp = [] @@ -23,6 +24,7 @@ def test_biggest_indexes_empty(): assert set(exp) == set(result) assert len(exp) == len(result) + def test_biggest_indexes_empty_big_q(): value = np.array([], dtype=float) exp = [] @@ -32,6 +34,7 @@ def test_biggest_indexes_empty_big_q(): assert set(exp) == set(result) assert len(exp) == len(result) + def test_biggest_indexes_len_less_than_q(): value = np.array([4, -1, 2], dtype=float) exp = [0, 1, 2] @@ -41,6 +44,7 @@ def test_biggest_indexes_len_less_than_q(): assert set(exp) == set(result) assert len(exp) == len(result) + def test_biggest_indexes_len_more_than_q(): value = np.array([4, -1, 2, 0, -6, 10], dtype=float) exp = [5, 4, 0] @@ -50,6 +54,7 @@ def test_biggest_indexes_len_more_than_q(): assert set(exp) == set(result) assert len(exp) == len(result) + def test_biggest_indexes_negative_q(): value = np.array([1, 2, 3], dtype=float) exp = [] @@ -59,6 +64,7 @@ def test_biggest_indexes_negative_q(): assert set(exp) == set(result) assert len(exp) == len(result) + def test_build_subset(): value_n = 20 value_nS = 5 @@ -69,21 +75,23 @@ def test_build_subset(): for i in result: assert i in range(value_n) + def test_build_submatrix(): - matrix = (np.array([[1,3,2],[5,6,-2]]), np.array([3,6,0]), np.array([1,2])) + matrix = (np.array([[1, 3, 2], [5, 6, -2]]), np.array([3, 6, 0]), np.array([1, 2])) subset = np.array([1]) result = build_submatrix(matrix, subset) - exp = (np.array([[5,6,-2]]), np.array([3,6,0]), np.array([2])) + exp = (np.array([[5, 6, -2]]), np.array([3, 6, 0]), np.array([2])) + + assert np.all(result[0] == exp[0]) + assert np.all(result[1] == exp[1]) + assert np.all(result[2] == exp[2]) - assert np.all(result[0] == exp[0] ) - assert np.all(result[1] == exp[1] ) - assert np.all(result[2] == exp[2] ) def test_selected_param(): - distribution = np.array([2., 0.1, 7., 14]) - threshold = 10. + distribution = np.array([2.0, 0.1, 7.0, 14]) + threshold = 10.0 threshold_label = 0.3 result1, result2 = selected_param(distribution, threshold, threshold_label) @@ -94,89 +102,98 @@ def test_selected_param(): assert np.all(result1 == exp1) assert np.all(result2 == exp2) + def test_stability_lam_R1_parameters_independance_and_seed_dependance(): - A = np.ones((10,30))+np.arange(-15,15)+np.arange(-5,5)[:, np.newaxis] - C = np.zeros((2,30)) - y = np.arange(10) + A = np.ones((10, 30)) + np.arange(-15, 15) + np.arange(-5, 5)[:, np.newaxis] + C = np.zeros((2, 30)) + y = np.arange(10) matrix = (A, C, y) - result1 = stability( matrix, - StabSelmethod = "lam", - q = 3, - B = 20, - lam = 0.01, - percent_nS = 0.2, - formulation = "R1", - seed = 14, - rho = 6.7, - rho_classification = -26.0, - true_lam = False, - e = 24.0) - - result2 = stability( matrix, - StabSelmethod = "lam", - q = 3, - B = 20, - lam = 0.01, - percent_nS = 0.2, - formulation = "R1", - seed = 14, - rho = 1.2345, - rho_classification = -3.0, - true_lam = False, - e = 3.0) - + result1 = stability( + matrix, + StabSelmethod="lam", + q=3, + B=20, + lam=0.01, + percent_nS=0.2, + formulation="R1", + seed=14, + rho=6.7, + rho_classification=-26.0, + true_lam=False, + e=24.0, + ) + + result2 = stability( + matrix, + StabSelmethod="lam", + q=3, + B=20, + lam=0.01, + percent_nS=0.2, + formulation="R1", + seed=14, + rho=1.2345, + rho_classification=-3.0, + true_lam=False, + e=3.0, + ) print(result1) print(result2) assert np.all(result1 == result2) + def test_stability_max_R2_between_0_and_1(): - A = np.ones((10,30))+np.arange(-15,15)+np.arange(-5,5)[:, np.newaxis] - C = np.zeros((2,30)) - y = np.arange(10) + A = np.ones((10, 30)) + np.arange(-15, 15) + np.arange(-5, 5)[:, np.newaxis] + C = np.zeros((2, 30)) + y = np.arange(10) matrix = (A, C, y) - result = stability( matrix, - StabSelmethod = "max", - numerical_method = "DR", - q = 3, - B = 20, - percent_nS = 0.2, - formulation = "R2", - seed = 24, - rho = 1.5, - rho_classification = -26.0, - true_lam = True, - e = 24.0) - + result = stability( + matrix, + StabSelmethod="max", + numerical_method="DR", + q=3, + B=20, + percent_nS=0.2, + formulation="R2", + seed=24, + rho=1.5, + rho_classification=-26.0, + true_lam=True, + e=24.0, + ) + + assert np.all(result <= 1.0) + assert np.all(result >= 0.0) - assert np.all(result <= 1.) - assert np.all(result >= 0.) def test_stability_first_C1_not_too_high_distribution(): - A = np.ones((25,30)) - A = A + np.arange(-15,15) - A = A + np.arange(-10,15)[:, np.newaxis] - C = np.zeros((2,30)) - y = np.array([1, -1, 1, 1, -1]*5) + A = np.ones((25, 30)) + A = A + np.arange(-15, 15) + A = A + np.arange(-10, 15)[:, np.newaxis] + C = np.zeros((2, 30)) + y = np.array([1, -1, 1, 1, -1] * 5) matrix = (A, C, y) q = 5 - result, _, _ = stability( matrix, - StabSelmethod = "first", - numerical_method = "P-PDS", - q = q, - B = 10, - percent_nS = 0.4, - formulation = "C1", - seed = 24, - rho = 6.7, - rho_classification = -26.0, - true_lam = True, - e = 24.0) - - assert np.sum(result) <= q \ No newline at end of file + result, _, _ = stability( + matrix, + StabSelmethod="first", + numerical_method="P-PDS", + q=q, + B=10, + percent_nS=0.4, + formulation="C1", + seed=24, + rho=6.7, + rho_classification=-26.0, + true_lam=True, + e=24.0, + ) + + assert np.sum(result) <= q diff --git a/docs/source/auto_examples/plot_CentralParkSoil.py b/docs/source/auto_examples/plot_CentralParkSoil.py index cd22295..8085ef7 100644 --- a/docs/source/auto_examples/plot_CentralParkSoil.py +++ b/docs/source/auto_examples/plot_CentralParkSoil.py @@ -19,13 +19,13 @@ # Load data # ^^^^^^^^^^^^^^^^^^^ -data = np.load('CentralParkSoil/cps.npz') +data = np.load("CentralParkSoil/cps.npz") x = data["x"] label = data["label"] y = data["y"] -A = np.load('CentralParkSoil/A.npy') +A = np.load("CentralParkSoil/A.npy") # %% # Preprocess: taxonomy aggregation @@ -34,34 +34,36 @@ label_short = np.array([l.split("::")[-1] for l in label]) pseudo_count = 1 -X = np.log(pseudo_count+x) -nleaves = np.sum(A,axis = 0) -logGeom = X.dot(A)/nleaves +X = np.log(pseudo_count + x) +nleaves = np.sum(A, axis=0) +logGeom = X.dot(A) / nleaves -n,d = logGeom.shape +n, d = logGeom.shape -tr = np.random.permutation(n)[:int(0.8*n)] +tr = np.random.permutation(n)[: int(0.8 * n)] # %% # Cross validation and Path Computation # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -problem = classo_problem(logGeom[tr], y[tr], label = label_short) +problem = classo_problem(logGeom[tr], y[tr], label=label_short) -problem.formulation.w = 1/nleaves -problem.formulation.intercept = True +problem.formulation.w = 1 / nleaves +problem.formulation.intercept = True problem.formulation.concomitant = False -problem.model_selection.StabSel = False -problem.model_selection.PATH = True -problem.model_selection.CV = True -problem.model_selection.CVparameters.seed = 6 # one could change logscale, Nsubset, oneSE +problem.model_selection.StabSel = False +problem.model_selection.PATH = True +problem.model_selection.CV = True +problem.model_selection.CVparameters.seed = ( + 6 # one could change logscale, Nsubset, oneSE +) print(problem) problem.solve() print(problem.solution) -selection = problem.solution.CV.selected_param[1:] # exclude the intercept +selection = problem.solution.CV.selected_param[1:] # exclude the intercept print(label[selection]) # %% @@ -70,34 +72,34 @@ te = np.array([i for i in range(len(y)) if not i in tr]) alpha = problem.solution.CV.refit -yhat = logGeom[te].dot(alpha[1:])+alpha[0] +yhat = logGeom[te].dot(alpha[1:]) + alpha[0] M1, M2 = max(y[te]), min(y[te]) -plt.plot(yhat, y[te], 'bo', label = 'sample of the testing set') -plt.plot([M1, M2], [M1, M2], 'k-', label = "identity") -plt.xlabel('predictor yhat'), plt.ylabel('real y'), plt.legend() +plt.plot(yhat, y[te], "bo", label="sample of the testing set") +plt.plot([M1, M2], [M1, M2], "k-", label="identity") +plt.xlabel("predictor yhat"), plt.ylabel("real y"), plt.legend() plt.tight_layout() # %% # Stability selection # ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -problem = classo_problem(logGeom[tr], y[tr], label = label_short) +problem = classo_problem(logGeom[tr], y[tr], label=label_short) -problem.formulation.w = 1/nleaves -problem.formulation.intercept = True +problem.formulation.w = 1 / nleaves +problem.formulation.intercept = True problem.formulation.concomitant = False -problem.model_selection.PATH = False -problem.model_selection.CV = False +problem.model_selection.PATH = False +problem.model_selection.CV = False # can change q, B, nS, method, threshold etc in problem.model_selection.StabSelparameters problem.solve() print(problem, problem.solution) -selection = problem.solution.StabSel.selected_param[1:] # exclude the intercept +selection = problem.solution.StabSel.selected_param[1:] # exclude the intercept print(label[selection]) # %% @@ -106,10 +108,10 @@ te = np.array([i for i in range(len(y)) if not i in tr]) alpha = problem.solution.StabSel.refit -yhat = logGeom[te].dot(alpha[1:])+alpha[0] +yhat = logGeom[te].dot(alpha[1:]) + alpha[0] M1, M2 = max(y[te]), min(y[te]) -plt.plot(yhat, y[te], 'bo', label = 'sample of the testing set') -plt.plot([M1, M2],[M1, M2], 'k-', label = "identity") -plt.xlabel('predictor yhat'), plt.ylabel('real y'), plt.legend() -plt.tight_layout() \ No newline at end of file +plt.plot(yhat, y[te], "bo", label="sample of the testing set") +plt.plot([M1, M2], [M1, M2], "k-", label="identity") +plt.xlabel("predictor yhat"), plt.ylabel("real y"), plt.legend() +plt.tight_layout() diff --git a/docs/source/auto_examples/plot_Tara_example.py b/docs/source/auto_examples/plot_Tara_example.py index 77b30fb..7d930ad 100644 --- a/docs/source/auto_examples/plot_Tara_example.py +++ b/docs/source/auto_examples/plot_Tara_example.py @@ -35,14 +35,14 @@ # Load data # ^^^^^^^^^^^^^^^^^^^ -data = np.load('Tara/tara.npz') +data = np.load("Tara/tara.npz") x = data["x"] label = data["label"] y = data["y"] tr = data["tr"] -A = np.load('Tara/A.npy') +A = np.load("Tara/A.npy") # %% # Preprocess: taxonomy aggregation @@ -51,33 +51,34 @@ label_short = np.array([l.split("::")[-1] for l in label]) pseudo_count = 1 -X = np.log(pseudo_count+x) -nleaves = np.sum(A,axis = 0) -logGeom = X.dot(A)/nleaves - +X = np.log(pseudo_count + x) +nleaves = np.sum(A, axis=0) +logGeom = X.dot(A) / nleaves # %% # Cross validation and Path Computation # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -problem = classo_problem(logGeom[tr], y[tr], label = label_short) +problem = classo_problem(logGeom[tr], y[tr], label=label_short) -problem.formulation.w = 1/nleaves -problem.formulation.intercept = True +problem.formulation.w = 1 / nleaves +problem.formulation.intercept = True problem.formulation.concomitant = False -problem.model_selection.StabSel = False -problem.model_selection.PATH = True -problem.model_selection.CV = True -problem.model_selection.CVparameters.seed = 6 # one could change logscale, Nsubset, oneSE +problem.model_selection.StabSel = False +problem.model_selection.PATH = True +problem.model_selection.CV = True +problem.model_selection.CVparameters.seed = ( + 6 # one could change logscale, Nsubset, oneSE +) print(problem) problem.solve() print(problem.solution) -selection = problem.solution.CV.selected_param[1:] # exclude the intercept +selection = problem.solution.CV.selected_param[1:] # exclude the intercept print(label[selection]) # %% @@ -86,34 +87,34 @@ te = np.array([i for i in range(len(y)) if not i in tr]) alpha = problem.solution.CV.refit -yhat = logGeom[te].dot(alpha[1:])+alpha[0] +yhat = logGeom[te].dot(alpha[1:]) + alpha[0] M1, M2 = max(y[te]), min(y[te]) -plt.plot(yhat, y[te], 'bo', label = 'sample of the testing set') -plt.plot([M1, M2], [M1, M2], 'k-', label = "identity") -plt.xlabel('predictor yhat'), plt.ylabel('real y'), plt.legend() +plt.plot(yhat, y[te], "bo", label="sample of the testing set") +plt.plot([M1, M2], [M1, M2], "k-", label="identity") +plt.xlabel("predictor yhat"), plt.ylabel("real y"), plt.legend() plt.tight_layout() # %% # Stability selection # ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -problem = classo_problem(logGeom[tr], y[tr], label = label_short) +problem = classo_problem(logGeom[tr], y[tr], label=label_short) -problem.formulation.w = 1/nleaves -problem.formulation.intercept = True +problem.formulation.w = 1 / nleaves +problem.formulation.intercept = True problem.formulation.concomitant = False -problem.model_selection.PATH = False -problem.model_selection.CV = False +problem.model_selection.PATH = False +problem.model_selection.CV = False # can change q, B, nS, method, threshold etc in problem.model_selection.StabSelparameters problem.solve() print(problem, problem.solution) -selection = problem.solution.StabSel.selected_param[1:] # exclude the intercept +selection = problem.solution.StabSel.selected_param[1:] # exclude the intercept print(label[selection]) # %% @@ -122,10 +123,10 @@ te = np.array([i for i in range(len(y)) if not i in tr]) alpha = problem.solution.StabSel.refit -yhat = logGeom[te].dot(alpha[1:])+alpha[0] +yhat = logGeom[te].dot(alpha[1:]) + alpha[0] M1, M2 = max(y[te]), min(y[te]) -plt.plot(yhat, y[te], 'bo', label = 'sample of the testing set') -plt.plot([M1, M2],[M1, M2], 'k-', label = "identity") -plt.xlabel('predictor yhat'), plt.ylabel('real y'), plt.legend() -plt.tight_layout() \ No newline at end of file +plt.plot(yhat, y[te], "bo", label="sample of the testing set") +plt.plot([M1, M2], [M1, M2], "k-", label="identity") +plt.xlabel("predictor yhat"), plt.ylabel("real y"), plt.legend() +plt.tight_layout() diff --git a/docs/source/auto_examples/plot_advanced_example.py b/docs/source/auto_examples/plot_advanced_example.py index 87ebdc2..379a4e0 100644 --- a/docs/source/auto_examples/plot_advanced_example.py +++ b/docs/source/auto_examples/plot_advanced_example.py @@ -13,12 +13,12 @@ # %% # Generate the data # ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# +# # This code snippet generates a problem instance with sparse ß in dimension # d=100 (sparsity d_nonzero=5). The design matrix X comprises n=100 samples generated from an i.i.d standard normal -# distribution. The dimension of the constraint matrix C is d x k matrix. The noise level is σ=0.5. +# distribution. The dimension of the constraint matrix C is d x k matrix. The noise level is σ=0.5. # The input `zerosum=True` implies that C is the all-ones vector and Cß=0. The n-dimensional outcome vector y -# and the regression vector ß is then generated to satisfy the given constraints. +# and the regression vector ß is then generated to satisfy the given constraints. # One can then see the parameters that should be selected. m, d, d_nonzero, k, sigma = 100, 200, 5, 1, 0.5 @@ -28,31 +28,31 @@ # %% # Define the classo instance # ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# +# # Next we can define a default c-lasso problem instance with the generated data: -problem = classo_problem(X, y, C) +problem = classo_problem(X, y, C) # %% # Change the parameters # ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# +# # Let's see some example of change in the parameters -problem.formulation.huber = True -problem.formulation.concomitant = False -problem.model_selection.CV = True -problem.model_selection.LAMfixed = True -problem.model_selection.PATH = True -problem.model_selection.StabSelparameters.method = 'max' +problem.formulation.huber = True +problem.formulation.concomitant = False +problem.model_selection.CV = True +problem.model_selection.LAMfixed = True +problem.model_selection.PATH = True +problem.model_selection.StabSelparameters.method = "max" problem.model_selection.CVparameters.seed = 1 problem.model_selection.LAMfixedparameters.rescaled_lam = True -problem.model_selection.LAMfixedparameters.lam = .1 +problem.model_selection.LAMfixedparameters.lam = 0.1 # %% # Check parameters # ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# +# # You can look at the generated problem instance by typing: print(problem) @@ -60,9 +60,9 @@ # %% # Solve optimization problems # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# -# We only use stability selection as default model selection strategy. -# The command also allows you to inspect the computed stability profile for all variables +# +# We only use stability selection as default model selection strategy. +# The command also allows you to inspect the computed stability profile for all variables # at the theoretical λ problem.solve() @@ -70,8 +70,8 @@ # %% # Visualisation # ^^^^^^^^^^^^^^^ -# -# After completion, the results of the optimization and model selection routines +# +# After completion, the results of the optimization and model selection routines # can be visualized using -print(problem.solution) \ No newline at end of file +print(problem.solution) diff --git a/docs/source/auto_examples/plot_basic_example.py b/docs/source/auto_examples/plot_basic_example.py index a6ac38d..0fd48c9 100644 --- a/docs/source/auto_examples/plot_basic_example.py +++ b/docs/source/auto_examples/plot_basic_example.py @@ -12,12 +12,12 @@ # %% # Generate the data # ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# +# # This code snippet generates a problem instance with sparse ß in dimension # d=100 (sparsity d_nonzero=5). The design matrix X comprises n=100 samples generated from an i.i.d standard normal -# distribution. The dimension of the constraint matrix C is d x k matrix. The noise level is σ=0.5. +# distribution. The dimension of the constraint matrix C is d x k matrix. The noise level is σ=0.5. # The input `zerosum=True` implies that C is the all-ones vector and Cß=0. The n-dimensional outcome vector y -# and the regression vector ß is then generated to satisfy the given constraints. +# and the regression vector ß is then generated to satisfy the given constraints. m, d, d_nonzero, k, sigma = 100, 200, 5, 1, 0.5 (X, C, y), sol = random_data(m, d, d_nonzero, k, sigma, zerosum=True, seed=1) @@ -30,15 +30,15 @@ # %% # Define the classo instance # ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# +# # Next we can define a default c-lasso problem instance with the generated data: -problem = classo_problem(X, y, C) +problem = classo_problem(X, y, C) # %% # Check parameters # ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# +# # You can look at the generated problem instance by typing: print(problem) @@ -46,9 +46,9 @@ # %% # Solve optimization problems # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# -# We only use stability selection as default model selection strategy. -# The command also allows you to inspect the computed stability profile for all variables +# +# We only use stability selection as default model selection strategy. +# The command also allows you to inspect the computed stability profile for all variables # at the theoretical λ problem.solve() @@ -56,8 +56,8 @@ # %% # Visualisation # ^^^^^^^^^^^^^^^ -# -# After completion, the results of the optimization and model selection routines +# +# After completion, the results of the optimization and model selection routines # can be visualized using -print(problem.solution) \ No newline at end of file +print(problem.solution) diff --git a/docs/source/auto_examples/plot_combo_example.py b/docs/source/auto_examples/plot_combo_example.py index c876312..6512c4d 100644 --- a/docs/source/auto_examples/plot_combo_example.py +++ b/docs/source/auto_examples/plot_combo_example.py @@ -14,46 +14,46 @@ # Load microbiome and covariate data X # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -X0 = csv_to_np('COMBO_data/complete_data/GeneraCounts.csv', begin = 0).astype(float) -X_C = csv_to_np('COMBO_data/CaloriData.csv', begin = 0).astype(float) -X_F = csv_to_np('COMBO_data/FatData.csv', begin = 0).astype(float) +X0 = csv_to_np("COMBO_data/complete_data/GeneraCounts.csv", begin=0).astype(float) +X_C = csv_to_np("COMBO_data/CaloriData.csv", begin=0).astype(float) +X_F = csv_to_np("COMBO_data/FatData.csv", begin=0).astype(float) # %% # Load BMI measurements y # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -y = csv_to_np('COMBO_data/BMI.csv', begin = 0).astype(float)[:, 0] -labels = csv_to_np('COMBO_data/complete_data/GeneraPhylo.csv').astype(str)[:, -1] +y = csv_to_np("COMBO_data/BMI.csv", begin=0).astype(float)[:, 0] +labels = csv_to_np("COMBO_data/complete_data/GeneraPhylo.csv").astype(str)[:, -1] # %% # Normalize/transform data # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -y = y - np.mean(y) #BMI data (n = 96) -X_C = X_C - np.mean(X_C, axis = 0) #Covariate data (Calorie) -X_F = X_F - np.mean(X_F, axis = 0) #Covariate data (Fat) +y = y - np.mean(y) # BMI data (n = 96) +X_C = X_C - np.mean(X_C, axis=0) # Covariate data (Calorie) +X_F = X_F - np.mean(X_F, axis=0) # Covariate data (Fat) X0 = clr(X0, 1 / 2).T # %% # Set up design matrix and zero-sum constraints for 45 genera # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -X = np.concatenate((X0, X_C, X_F, np.ones((len(X0), 1))), axis = 1) # Joint microbiome and covariate data and offset -label = np.concatenate([labels, np.array(['Calorie', 'Fat', 'Bias'])]) +X = np.concatenate( + (X0, X_C, X_F, np.ones((len(X0), 1))), axis=1 +) # Joint microbiome and covariate data and offset +label = np.concatenate([labels, np.array(["Calorie", "Fat", "Bias"])]) C = np.ones((1, len(X[0]))) -C[0, -1], C[0, -2], C[0, -3] = 0., 0., 0. - - +C[0, -1], C[0, -2], C[0, -3] = 0.0, 0.0, 0.0 # %% # Set up c-lassso problem # ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -problem = classo_problem(X, y, C, label = label) +problem = classo_problem(X, y, C, label=label) # %% # Use stability selection with theoretical lambda [Combettes & Müller, 2020b] -problem.model_selection.StabSelparameters.method = 'lam' +problem.model_selection.StabSelparameters.method = "lam" problem.model_selection.StabSelparameters.threshold_label = 0.5 # %% diff --git a/docs/source/conf.py b/docs/source/conf.py index ac0cdb3..318c183 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -14,7 +14,7 @@ # import sys # sys.path.insert(0, os.path.abspath('.')) -import sphinx_gallery +import sphinx_gallery import sys import warnings from classo import __version__ as version @@ -22,9 +22,9 @@ # -- Project information ----------------------------------------------------- -project = 'classo' -copyright = '2020, Leo Simpson' -author = 'Leo Simpson' +project = "classo" +copyright = "2020, Leo Simpson" +author = "Leo Simpson" # The full version, including alpha/beta/rc tags release = version @@ -35,17 +35,20 @@ # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. -extensions = ['sphinx.ext.autodoc', - 'sphinx.ext.intersphinx', - 'sphinx.ext.doctest', - 'sphinx.ext.coverage', - 'sphinx.ext.mathjax', - 'sphinx.ext.napoleon', - 'sphinx.ext.autosummary', - 'sphinx_gallery.gen_gallery',] +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.intersphinx", + "sphinx.ext.doctest", + "sphinx.ext.coverage", + "sphinx.ext.mathjax", + "sphinx.ext.napoleon", + "sphinx.ext.autosummary", + "sphinx_gallery.gen_gallery", +] autosummary_generate = True + def skip(app, what, name, obj, would_skip, options): if name == "__init__" or name == "__new__": return False @@ -55,49 +58,50 @@ def skip(app, what, name, obj, would_skip, options): def setup(app): app.connect("autodoc-skip-member", skip) + # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ['_build', '.DS_Store'] +exclude_patterns = ["_build", ".DS_Store"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # source_suffix = ['.rst', '.md'] -source_suffix = '.rst' +source_suffix = ".rst" # The encoding of source files. -#source_encoding = 'utf-8-sig' +# source_encoding = 'utf-8-sig' # The master toctree document. -master_doc = 'index' +master_doc = "index" # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'sphinx_rtd_theme' +html_theme = "sphinx_rtd_theme" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # Example configuration for intersphinx: refer to the Python standard library. intersphinx_mapping = { - 'python': ('https://docs.python.org/{.major}'.format(sys.version_info), None), - 'numpy': ('https://docs.scipy.org/doc/numpy/', None), - 'scipy': ('https://docs.scipy.org/doc/scipy/reference', None), - 'matplotlib': ('https://matplotlib.org/', None) + "python": ("https://docs.python.org/{.major}".format(sys.version_info), None), + "numpy": ("https://docs.scipy.org/doc/numpy/", None), + "scipy": ("https://docs.scipy.org/doc/scipy/reference", None), + "matplotlib": ("https://matplotlib.org/", None), } sphinx_gallery_conf = { - 'backreferences_dir': 'backreferences', - 'doc_module': ('classo', 'numpy', 'py2r', 'matplotlib'), - 'examples_dirs': '../../examples', - 'ignore_pattern': r'/example_', - 'gallery_dirs': 'auto_examples', - 'plot_gallery': 'False', + "backreferences_dir": "backreferences", + "doc_module": ("classo", "numpy", "py2r", "matplotlib"), + "examples_dirs": "../../examples", + "ignore_pattern": r"/example_", + "gallery_dirs": "auto_examples", + "plot_gallery": "False", } diff --git a/examples/example_COMBO.py b/examples/example_COMBO.py index 5d62d14..a86a03b 100755 --- a/examples/example_COMBO.py +++ b/examples/example_COMBO.py @@ -3,7 +3,8 @@ import pandas as pd import numpy as np -def csv_to_np(file, begin = 1, header = None): + +def csv_to_np(file, begin=1, header=None): """Function to read a csv file and to create an ndarray with this Args: @@ -14,7 +15,7 @@ def csv_to_np(file, begin = 1, header = None): Returns: ndarray : matrix of the csv file """ - tab1 = pd.read_csv(file, header = header) + tab1 = pd.read_csv(file, header=header) return np.array(tab1)[:, begin:] @@ -36,7 +37,7 @@ def csv_to_np(file, begin = 1, header = None): C = np.ones((1, len(X[0]))) C[0, -1], C[0, -2], C[0, -3] = 0.0, 0.0, 0.0 -problem = classo_problem(X, y, C, label = label) +problem = classo_problem(X, y, C, label=label) # Solve the problem for a fixed lambda (by default, it will use the theoritical lambda) problem.model_selection.LAMfixed = True diff --git a/examples/example_gneiss.py b/examples/example_gneiss.py index 0f706ce..cdf6823 100755 --- a/examples/example_gneiss.py +++ b/examples/example_gneiss.py @@ -23,14 +23,15 @@ import pandas as pd - # %% # Load data # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -t = pd.read_csv('pH_data/qiime2/news/table.csv', index_col=0) -metadata = pd.read_table('pH_data/qiime2/originals/88soils_modified_metadata.txt', index_col=0) +t = pd.read_csv("pH_data/qiime2/news/table.csv", index_col=0) +metadata = pd.read_table( + "pH_data/qiime2/originals/88soils_modified_metadata.txt", index_col=0 +) y_uncent = metadata["ph"].values @@ -38,8 +39,6 @@ label = t.columns - - # second option to load the data # import scipy.io as sio # pH = sio.loadmat("pH_data/matlab/pHData.mat") @@ -55,9 +54,9 @@ # Set up c-lassso problem # ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -problem = classo_problem(X, y, label = label) +problem = classo_problem(X, y, label=label) -problem.model_selection.StabSelparameters.method = 'lam' +problem.model_selection.StabSelparameters.method = "lam" problem.model_selection.PATH = True problem.model_selection.LAMfixed = True problem.model_selection.PATHparameters.n_active = X.shape[1] + 1 @@ -89,16 +88,14 @@ # %% # Solve for R4 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# Remark : we reset the numerical method here, +# Remark : we reset the numerical method here, # because it has been automatically set to '¨Path-Alg' # for previous computations, but for R4, "DR" is much better # as explained in the documentation, R4 "Path-Alg" is a method for fixed lambda -# but is (paradoxically) bad to compute the lambda-path +# but is (paradoxically) bad to compute the lambda-path # because of the absence of possible warm-start in this method problem.model_selection.PATHparameters.numerical_method = "DR" problem.formulation.huber = True problem.solve() print(problem, problem.solution) - - diff --git a/examples/plot_CentralParkSoil.py b/examples/plot_CentralParkSoil.py index cd22295..8085ef7 100644 --- a/examples/plot_CentralParkSoil.py +++ b/examples/plot_CentralParkSoil.py @@ -19,13 +19,13 @@ # Load data # ^^^^^^^^^^^^^^^^^^^ -data = np.load('CentralParkSoil/cps.npz') +data = np.load("CentralParkSoil/cps.npz") x = data["x"] label = data["label"] y = data["y"] -A = np.load('CentralParkSoil/A.npy') +A = np.load("CentralParkSoil/A.npy") # %% # Preprocess: taxonomy aggregation @@ -34,34 +34,36 @@ label_short = np.array([l.split("::")[-1] for l in label]) pseudo_count = 1 -X = np.log(pseudo_count+x) -nleaves = np.sum(A,axis = 0) -logGeom = X.dot(A)/nleaves +X = np.log(pseudo_count + x) +nleaves = np.sum(A, axis=0) +logGeom = X.dot(A) / nleaves -n,d = logGeom.shape +n, d = logGeom.shape -tr = np.random.permutation(n)[:int(0.8*n)] +tr = np.random.permutation(n)[: int(0.8 * n)] # %% # Cross validation and Path Computation # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -problem = classo_problem(logGeom[tr], y[tr], label = label_short) +problem = classo_problem(logGeom[tr], y[tr], label=label_short) -problem.formulation.w = 1/nleaves -problem.formulation.intercept = True +problem.formulation.w = 1 / nleaves +problem.formulation.intercept = True problem.formulation.concomitant = False -problem.model_selection.StabSel = False -problem.model_selection.PATH = True -problem.model_selection.CV = True -problem.model_selection.CVparameters.seed = 6 # one could change logscale, Nsubset, oneSE +problem.model_selection.StabSel = False +problem.model_selection.PATH = True +problem.model_selection.CV = True +problem.model_selection.CVparameters.seed = ( + 6 # one could change logscale, Nsubset, oneSE +) print(problem) problem.solve() print(problem.solution) -selection = problem.solution.CV.selected_param[1:] # exclude the intercept +selection = problem.solution.CV.selected_param[1:] # exclude the intercept print(label[selection]) # %% @@ -70,34 +72,34 @@ te = np.array([i for i in range(len(y)) if not i in tr]) alpha = problem.solution.CV.refit -yhat = logGeom[te].dot(alpha[1:])+alpha[0] +yhat = logGeom[te].dot(alpha[1:]) + alpha[0] M1, M2 = max(y[te]), min(y[te]) -plt.plot(yhat, y[te], 'bo', label = 'sample of the testing set') -plt.plot([M1, M2], [M1, M2], 'k-', label = "identity") -plt.xlabel('predictor yhat'), plt.ylabel('real y'), plt.legend() +plt.plot(yhat, y[te], "bo", label="sample of the testing set") +plt.plot([M1, M2], [M1, M2], "k-", label="identity") +plt.xlabel("predictor yhat"), plt.ylabel("real y"), plt.legend() plt.tight_layout() # %% # Stability selection # ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -problem = classo_problem(logGeom[tr], y[tr], label = label_short) +problem = classo_problem(logGeom[tr], y[tr], label=label_short) -problem.formulation.w = 1/nleaves -problem.formulation.intercept = True +problem.formulation.w = 1 / nleaves +problem.formulation.intercept = True problem.formulation.concomitant = False -problem.model_selection.PATH = False -problem.model_selection.CV = False +problem.model_selection.PATH = False +problem.model_selection.CV = False # can change q, B, nS, method, threshold etc in problem.model_selection.StabSelparameters problem.solve() print(problem, problem.solution) -selection = problem.solution.StabSel.selected_param[1:] # exclude the intercept +selection = problem.solution.StabSel.selected_param[1:] # exclude the intercept print(label[selection]) # %% @@ -106,10 +108,10 @@ te = np.array([i for i in range(len(y)) if not i in tr]) alpha = problem.solution.StabSel.refit -yhat = logGeom[te].dot(alpha[1:])+alpha[0] +yhat = logGeom[te].dot(alpha[1:]) + alpha[0] M1, M2 = max(y[te]), min(y[te]) -plt.plot(yhat, y[te], 'bo', label = 'sample of the testing set') -plt.plot([M1, M2],[M1, M2], 'k-', label = "identity") -plt.xlabel('predictor yhat'), plt.ylabel('real y'), plt.legend() -plt.tight_layout() \ No newline at end of file +plt.plot(yhat, y[te], "bo", label="sample of the testing set") +plt.plot([M1, M2], [M1, M2], "k-", label="identity") +plt.xlabel("predictor yhat"), plt.ylabel("real y"), plt.legend() +plt.tight_layout() diff --git a/examples/plot_Tara_example.py b/examples/plot_Tara_example.py index 77b30fb..7d930ad 100755 --- a/examples/plot_Tara_example.py +++ b/examples/plot_Tara_example.py @@ -35,14 +35,14 @@ # Load data # ^^^^^^^^^^^^^^^^^^^ -data = np.load('Tara/tara.npz') +data = np.load("Tara/tara.npz") x = data["x"] label = data["label"] y = data["y"] tr = data["tr"] -A = np.load('Tara/A.npy') +A = np.load("Tara/A.npy") # %% # Preprocess: taxonomy aggregation @@ -51,33 +51,34 @@ label_short = np.array([l.split("::")[-1] for l in label]) pseudo_count = 1 -X = np.log(pseudo_count+x) -nleaves = np.sum(A,axis = 0) -logGeom = X.dot(A)/nleaves - +X = np.log(pseudo_count + x) +nleaves = np.sum(A, axis=0) +logGeom = X.dot(A) / nleaves # %% # Cross validation and Path Computation # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -problem = classo_problem(logGeom[tr], y[tr], label = label_short) +problem = classo_problem(logGeom[tr], y[tr], label=label_short) -problem.formulation.w = 1/nleaves -problem.formulation.intercept = True +problem.formulation.w = 1 / nleaves +problem.formulation.intercept = True problem.formulation.concomitant = False -problem.model_selection.StabSel = False -problem.model_selection.PATH = True -problem.model_selection.CV = True -problem.model_selection.CVparameters.seed = 6 # one could change logscale, Nsubset, oneSE +problem.model_selection.StabSel = False +problem.model_selection.PATH = True +problem.model_selection.CV = True +problem.model_selection.CVparameters.seed = ( + 6 # one could change logscale, Nsubset, oneSE +) print(problem) problem.solve() print(problem.solution) -selection = problem.solution.CV.selected_param[1:] # exclude the intercept +selection = problem.solution.CV.selected_param[1:] # exclude the intercept print(label[selection]) # %% @@ -86,34 +87,34 @@ te = np.array([i for i in range(len(y)) if not i in tr]) alpha = problem.solution.CV.refit -yhat = logGeom[te].dot(alpha[1:])+alpha[0] +yhat = logGeom[te].dot(alpha[1:]) + alpha[0] M1, M2 = max(y[te]), min(y[te]) -plt.plot(yhat, y[te], 'bo', label = 'sample of the testing set') -plt.plot([M1, M2], [M1, M2], 'k-', label = "identity") -plt.xlabel('predictor yhat'), plt.ylabel('real y'), plt.legend() +plt.plot(yhat, y[te], "bo", label="sample of the testing set") +plt.plot([M1, M2], [M1, M2], "k-", label="identity") +plt.xlabel("predictor yhat"), plt.ylabel("real y"), plt.legend() plt.tight_layout() # %% # Stability selection # ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -problem = classo_problem(logGeom[tr], y[tr], label = label_short) +problem = classo_problem(logGeom[tr], y[tr], label=label_short) -problem.formulation.w = 1/nleaves -problem.formulation.intercept = True +problem.formulation.w = 1 / nleaves +problem.formulation.intercept = True problem.formulation.concomitant = False -problem.model_selection.PATH = False -problem.model_selection.CV = False +problem.model_selection.PATH = False +problem.model_selection.CV = False # can change q, B, nS, method, threshold etc in problem.model_selection.StabSelparameters problem.solve() print(problem, problem.solution) -selection = problem.solution.StabSel.selected_param[1:] # exclude the intercept +selection = problem.solution.StabSel.selected_param[1:] # exclude the intercept print(label[selection]) # %% @@ -122,10 +123,10 @@ te = np.array([i for i in range(len(y)) if not i in tr]) alpha = problem.solution.StabSel.refit -yhat = logGeom[te].dot(alpha[1:])+alpha[0] +yhat = logGeom[te].dot(alpha[1:]) + alpha[0] M1, M2 = max(y[te]), min(y[te]) -plt.plot(yhat, y[te], 'bo', label = 'sample of the testing set') -plt.plot([M1, M2],[M1, M2], 'k-', label = "identity") -plt.xlabel('predictor yhat'), plt.ylabel('real y'), plt.legend() -plt.tight_layout() \ No newline at end of file +plt.plot(yhat, y[te], "bo", label="sample of the testing set") +plt.plot([M1, M2], [M1, M2], "k-", label="identity") +plt.xlabel("predictor yhat"), plt.ylabel("real y"), plt.legend() +plt.tight_layout() diff --git a/examples/plot_advanced_example.py b/examples/plot_advanced_example.py index 237a3f1..7de8e54 100755 --- a/examples/plot_advanced_example.py +++ b/examples/plot_advanced_example.py @@ -13,47 +13,49 @@ # %% # Generate the data # ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# +# # This code snippet generates a problem instance with sparse ß in dimension # d=100 (sparsity d_nonzero=5). The design matrix X comprises n=100 samples generated from an i.i.d standard normal -# distribution. The dimension of the constraint matrix C is d x k matrix. The noise level is σ=0.5. +# distribution. The dimension of the constraint matrix C is d x k matrix. The noise level is σ=0.5. # The input `zerosum=True` implies that C is the all-ones vector and Cß=0. The n-dimensional outcome vector y -# and the regression vector ß is then generated to satisfy the given constraints. +# and the regression vector ß is then generated to satisfy the given constraints. # One can then see the parameters that should be selected. m, d, d_nonzero, k, sigma = 100, 200, 5, 1, 0.5 -(X, C, y), sol = random_data(m, d, d_nonzero, k, sigma, zerosum=True, seed=1, intercept=1.) +(X, C, y), sol = random_data( + m, d, d_nonzero, k, sigma, zerosum=True, seed=1, intercept=1.0 +) print(np.nonzero(sol)) # %% # Define the classo instance # ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# +# # Next we can define a default c-lasso problem instance with the generated data: -problem = classo_problem(X, y, C) +problem = classo_problem(X, y, C) # %% # Change the parameters # ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# +# # Let's see some example of change in the parameters -problem.formulation.huber = True -problem.formulation.concomitant = False -problem.formulation.intercept = True -problem.model_selection.CV = True -problem.model_selection.LAMfixed = True -problem.model_selection.PATH = True -problem.model_selection.StabSelparameters.method = 'max' +problem.formulation.huber = True +problem.formulation.concomitant = False +problem.formulation.intercept = True +problem.model_selection.CV = True +problem.model_selection.LAMfixed = True +problem.model_selection.PATH = True +problem.model_selection.StabSelparameters.method = "max" problem.model_selection.CVparameters.seed = 1 problem.model_selection.LAMfixedparameters.rescaled_lam = True -problem.model_selection.LAMfixedparameters.lam = .1 +problem.model_selection.LAMfixedparameters.lam = 0.1 # %% # Check parameters # ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# +# # You can look at the generated problem instance by typing: print(problem) @@ -61,9 +63,9 @@ # %% # Solve optimization problems # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# -# We only use stability selection as default model selection strategy. -# The command also allows you to inspect the computed stability profile for all variables +# +# We only use stability selection as default model selection strategy. +# The command also allows you to inspect the computed stability profile for all variables # at the theoretical λ problem.solve() @@ -71,8 +73,8 @@ # %% # Visualisation # ^^^^^^^^^^^^^^^ -# -# After completion, the results of the optimization and model selection routines +# +# After completion, the results of the optimization and model selection routines # can be visualized using -print(problem.solution) \ No newline at end of file +print(problem.solution) diff --git a/examples/plot_basic_example.py b/examples/plot_basic_example.py index a6ac38d..0fd48c9 100755 --- a/examples/plot_basic_example.py +++ b/examples/plot_basic_example.py @@ -12,12 +12,12 @@ # %% # Generate the data # ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# +# # This code snippet generates a problem instance with sparse ß in dimension # d=100 (sparsity d_nonzero=5). The design matrix X comprises n=100 samples generated from an i.i.d standard normal -# distribution. The dimension of the constraint matrix C is d x k matrix. The noise level is σ=0.5. +# distribution. The dimension of the constraint matrix C is d x k matrix. The noise level is σ=0.5. # The input `zerosum=True` implies that C is the all-ones vector and Cß=0. The n-dimensional outcome vector y -# and the regression vector ß is then generated to satisfy the given constraints. +# and the regression vector ß is then generated to satisfy the given constraints. m, d, d_nonzero, k, sigma = 100, 200, 5, 1, 0.5 (X, C, y), sol = random_data(m, d, d_nonzero, k, sigma, zerosum=True, seed=1) @@ -30,15 +30,15 @@ # %% # Define the classo instance # ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# +# # Next we can define a default c-lasso problem instance with the generated data: -problem = classo_problem(X, y, C) +problem = classo_problem(X, y, C) # %% # Check parameters # ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# +# # You can look at the generated problem instance by typing: print(problem) @@ -46,9 +46,9 @@ # %% # Solve optimization problems # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# -# We only use stability selection as default model selection strategy. -# The command also allows you to inspect the computed stability profile for all variables +# +# We only use stability selection as default model selection strategy. +# The command also allows you to inspect the computed stability profile for all variables # at the theoretical λ problem.solve() @@ -56,8 +56,8 @@ # %% # Visualisation # ^^^^^^^^^^^^^^^ -# -# After completion, the results of the optimization and model selection routines +# +# After completion, the results of the optimization and model selection routines # can be visualized using -print(problem.solution) \ No newline at end of file +print(problem.solution) diff --git a/examples/plot_combo_example.py b/examples/plot_combo_example.py index 746aebd..35dd940 100755 --- a/examples/plot_combo_example.py +++ b/examples/plot_combo_example.py @@ -15,7 +15,8 @@ # Define how to read csv # ^^^^^^^^^^^^^^^^^^^^^^^^ -def csv_to_np(file, begin = 1, header = None): + +def csv_to_np(file, begin=1, header=None): """Function to read a csv file and to create an ndarray with this Args: @@ -26,7 +27,7 @@ def csv_to_np(file, begin = 1, header = None): Returns: ndarray : matrix of the csv file """ - tab1 = pd.read_csv(file, header = header) + tab1 = pd.read_csv(file, header=header) return np.array(tab1)[:, begin:] @@ -34,46 +35,46 @@ def csv_to_np(file, begin = 1, header = None): # Load microbiome and covariate data X # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -X0 = csv_to_np('COMBO_data/complete_data/GeneraCounts.csv', begin = 0).astype(float) -X_C = csv_to_np('COMBO_data/CaloriData.csv', begin = 0).astype(float) -X_F = csv_to_np('COMBO_data/FatData.csv', begin = 0).astype(float) +X0 = csv_to_np("COMBO_data/complete_data/GeneraCounts.csv", begin=0).astype(float) +X_C = csv_to_np("COMBO_data/CaloriData.csv", begin=0).astype(float) +X_F = csv_to_np("COMBO_data/FatData.csv", begin=0).astype(float) # %% # Load BMI measurements y # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -y = csv_to_np('COMBO_data/BMI.csv', begin = 0).astype(float)[:, 0] -labels = csv_to_np('COMBO_data/complete_data/GeneraPhylo.csv').astype(str)[:, -1] +y = csv_to_np("COMBO_data/BMI.csv", begin=0).astype(float)[:, 0] +labels = csv_to_np("COMBO_data/complete_data/GeneraPhylo.csv").astype(str)[:, -1] # %% # Normalize/transform data # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -y = y - np.mean(y) #BMI data (n = 96) -X_C = X_C - np.mean(X_C, axis = 0) #Covariate data (Calorie) -X_F = X_F - np.mean(X_F, axis = 0) #Covariate data (Fat) +y = y - np.mean(y) # BMI data (n = 96) +X_C = X_C - np.mean(X_C, axis=0) # Covariate data (Calorie) +X_F = X_F - np.mean(X_F, axis=0) # Covariate data (Fat) X0 = clr(X0, 1 / 2).T # %% # Set up design matrix and zero-sum constraints for 45 genera # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -X = np.concatenate((X0, X_C, X_F, axis = 1) # Joint microbiome and covariate data and offset -label = np.concatenate([labels, np.array(['Calorie', 'Fat'])]) +X = np.concatenate( + (X0, X_C, X_F), axis=1 +) # Joint microbiome and covariate data and offset +label = np.concatenate([labels, np.array(["Calorie", "Fat"])]) C = np.ones((1, len(X[0]))) -C[0, -1], C[0, -2] = 0., 0., 0. - - +C[0, -1], C[0, -2] = 0.0, 0.0, 0.0 # %% # Set up c-lassso problem # ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -problem = classo_problem(X, y, C, label = label) +problem = classo_problem(X, y, C, label=label) problem.formulation.intercept = True # %% # Use stability selection with theoretical lambda [Combettes & Müller, 2020b] -problem.model_selection.StabSelparameters.method = 'lam' +problem.model_selection.StabSelparameters.method = "lam" problem.model_selection.StabSelparameters.threshold_label = 0.5 # %% diff --git a/setup.py b/setup.py index 17700b3..243bdde 100755 --- a/setup.py +++ b/setup.py @@ -1,44 +1,41 @@ from setuptools import setup, find_packages import versioneer -setup(name='c-lasso', - version=versioneer.get_version(), - cmdclass=versioneer.get_cmdclass(), - license='MIT', - author='Leo Simpson', - url='https://github.com/Leo-Simpson/CLasso', - author_email='leo.bill.simpson@gmail.com', - description='Algorithms for constrained Lasso problems', - packages=['classo'], - install_requires = [ - 'numpy', - 'h5py', - 'scipy' - ], - extras_require = { - 'tests': [ - 'pytest', - 'pytest-cov'], - 'docs': [ - 'sphinx', - 'sphinx-gallery', - 'sphinx_rtd_theme', - 'numpydoc', - 'matplotlib', - 'pandas' - ] - }, - classifiers=['Intended Audience :: Science/Research', - 'Operating System :: Microsoft :: Windows', - 'Operating System :: POSIX', - 'Operating System :: Unix', - 'Operating System :: MacOS', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - ], - long_description_content_type = 'text/markdown', - long_description=open('README.md').read(), - zip_safe=False) +setup( + name="c-lasso", + version=versioneer.get_version(), + cmdclass=versioneer.get_cmdclass(), + license="MIT", + author="Leo Simpson", + url="https://github.com/Leo-Simpson/CLasso", + author_email="leo.bill.simpson@gmail.com", + description="Algorithms for constrained Lasso problems", + packages=["classo"], + install_requires=["numpy", "h5py", "scipy"], + extras_require={ + "tests": ["pytest", "pytest-cov"], + "docs": [ + "sphinx", + "sphinx-gallery", + "sphinx_rtd_theme", + "numpydoc", + "matplotlib", + "pandas", + ], + }, + classifiers=[ + "Intended Audience :: Science/Research", + "Operating System :: Microsoft :: Windows", + "Operating System :: POSIX", + "Operating System :: Unix", + "Operating System :: MacOS", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + ], + long_description_content_type="text/markdown", + long_description=open("README.md").read(), + zip_safe=False, +) diff --git a/versioneer.py b/versioneer.py index 1040c21..d5e0eb5 100644 --- a/versioneer.py +++ b/versioneer.py @@ -1,4 +1,3 @@ - # Version: 0.19 """The Versioneer - like a rocketeer, but for versions. @@ -301,11 +300,13 @@ def get_root(): setup_py = os.path.join(root, "setup.py") versioneer_py = os.path.join(root, "versioneer.py") if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): - err = ("Versioneer was unable to run the project root directory. " - "Versioneer requires setup.py to be executed from " - "its immediate directory (like 'python setup.py COMMAND'), " - "or in a way that lets it use sys.argv[0] to find the root " - "(like 'python path/to/setup.py COMMAND').") + err = ( + "Versioneer was unable to run the project root directory. " + "Versioneer requires setup.py to be executed from " + "its immediate directory (like 'python setup.py COMMAND'), " + "or in a way that lets it use sys.argv[0] to find the root " + "(like 'python path/to/setup.py COMMAND')." + ) raise VersioneerBadRootError(err) try: # Certain runtime workflows (setup.py install/develop in a setuptools @@ -318,8 +319,10 @@ def get_root(): me_dir = os.path.normcase(os.path.splitext(me)[0]) vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0]) if me_dir != vsr_dir: - print("Warning: build in %s is using versioneer.py from %s" - % (os.path.dirname(me), versioneer_py)) + print( + "Warning: build in %s is using versioneer.py from %s" + % (os.path.dirname(me), versioneer_py) + ) except NameError: pass return root @@ -341,6 +344,7 @@ def get(parser, name): if parser.has_option("versioneer", name): return parser.get("versioneer", name) return None + cfg = VersioneerConfig() cfg.VCS = VCS cfg.style = get(parser, "style") or "" @@ -365,17 +369,18 @@ class NotThisMethod(Exception): def register_vcs_handler(vcs, method): # decorator """Create decorator to mark a method as the handler of a VCS.""" + def decorate(f): """Store f in HANDLERS[vcs][method].""" if vcs not in HANDLERS: HANDLERS[vcs] = {} HANDLERS[vcs][method] = f return f + return decorate -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, - env=None): +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): """Call the given command(s).""" assert isinstance(commands, list) p = None @@ -383,10 +388,13 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, try: dispcmd = str([c] + args) # remember shell=False, so use git.cmd on windows, not just git - p = subprocess.Popen([c] + args, cwd=cwd, env=env, - stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr - else None)) + p = subprocess.Popen( + [c] + args, + cwd=cwd, + env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr else None), + ) break except EnvironmentError: e = sys.exc_info()[1] @@ -409,7 +417,9 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, return stdout, p.returncode -LONG_VERSION_PY['git'] = r''' +LONG_VERSION_PY[ + "git" +] = r''' # This file helps to compute a version number in source trees obtained from # git-archive tarball (such as those provided by githubs download-from-tag # feature). Distribution tarballs (built by setup.py sdist) and build @@ -993,7 +1003,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " - tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + tags = set([r[len(TAG) :] for r in refs if r.startswith(TAG)]) if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %d @@ -1002,7 +1012,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". - tags = set([r for r in refs if re.search(r'\d', r)]) + tags = set([r for r in refs if re.search(r"\d", r)]) if verbose: print("discarding '%s', no digits" % ",".join(refs - tags)) if verbose: @@ -1010,19 +1020,26 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): for ref in sorted(tags): # sorting will prefer e.g. "2.0" over "2.0rc1" if ref.startswith(tag_prefix): - r = ref[len(tag_prefix):] + r = ref[len(tag_prefix) :] if verbose: print("picking %s" % r) - return {"version": r, - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": None, - "date": date} + return { + "version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": None, + "date": date, + } # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") - return {"version": "0+unknown", - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": "no suitable tags", "date": None} + return { + "version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": "no suitable tags", + "date": None, + } @register_vcs_handler("git", "pieces_from_vcs") @@ -1037,8 +1054,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] - out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, - hide_stderr=True) + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) if rc != 0: if verbose: print("Directory %s not under git control" % root) @@ -1046,10 +1062,19 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", - "--always", "--long", - "--match", "%s*" % tag_prefix], - cwd=root) + describe_out, rc = run_command( + GITS, + [ + "describe", + "--tags", + "--dirty", + "--always", + "--long", + "--match", + "%s*" % tag_prefix, + ], + cwd=root, + ) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") @@ -1072,17 +1097,16 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: - git_describe = git_describe[:git_describe.rindex("-dirty")] + git_describe = git_describe[: git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX - mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) if not mo: # unparseable. Maybe git-describe is misbehaving? - pieces["error"] = ("unable to parse git-describe output: '%s'" - % describe_out) + pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out return pieces # tag @@ -1091,10 +1115,12 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if verbose: fmt = "tag '%s' doesn't start with prefix '%s'" print(fmt % (full_tag, tag_prefix)) - pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" - % (full_tag, tag_prefix)) + pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( + full_tag, + tag_prefix, + ) return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix):] + pieces["closest-tag"] = full_tag[len(tag_prefix) :] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) @@ -1105,13 +1131,13 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): else: # HEX: no tags pieces["closest-tag"] = None - count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], - cwd=root) + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) pieces["distance"] = int(count_out) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() - date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], - cwd=root)[0].strip() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[ + 0 + ].strip() # Use only the last line. Previous lines may contain GPG signature # information. date = date.splitlines()[-1] @@ -1170,16 +1196,22 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): for i in range(3): dirname = os.path.basename(root) if dirname.startswith(parentdir_prefix): - return {"version": dirname[len(parentdir_prefix):], - "full-revisionid": None, - "dirty": False, "error": None, "date": None} + return { + "version": dirname[len(parentdir_prefix) :], + "full-revisionid": None, + "dirty": False, + "error": None, + "date": None, + } else: rootdirs.append(root) root = os.path.dirname(root) # up a level if verbose: - print("Tried directories %s but none started with prefix %s" % - (str(rootdirs), parentdir_prefix)) + print( + "Tried directories %s but none started with prefix %s" + % (str(rootdirs), parentdir_prefix) + ) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") @@ -1208,11 +1240,13 @@ def versions_from_file(filename): contents = f.read() except EnvironmentError: raise NotThisMethod("unable to read _version.py") - mo = re.search(r"version_json = '''\n(.*)''' # END VERSION_JSON", - contents, re.M | re.S) + mo = re.search( + r"version_json = '''\n(.*)''' # END VERSION_JSON", contents, re.M | re.S + ) if not mo: - mo = re.search(r"version_json = '''\r\n(.*)''' # END VERSION_JSON", - contents, re.M | re.S) + mo = re.search( + r"version_json = '''\r\n(.*)''' # END VERSION_JSON", contents, re.M | re.S + ) if not mo: raise NotThisMethod("no version_json in _version.py") return json.loads(mo.group(1)) @@ -1221,8 +1255,7 @@ def versions_from_file(filename): def write_to_version_file(filename, versions): """Write the given version number to the given _version.py file.""" os.unlink(filename) - contents = json.dumps(versions, sort_keys=True, - indent=1, separators=(",", ": ")) + contents = json.dumps(versions, sort_keys=True, indent=1, separators=(",", ": ")) with open(filename, "w") as f: f.write(SHORT_VERSION_PY % contents) @@ -1254,8 +1287,7 @@ def render_pep440(pieces): rendered += ".dirty" else: # exception #1 - rendered = "0+untagged.%d.g%s" % (pieces["distance"], - pieces["short"]) + rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered @@ -1369,11 +1401,13 @@ def render_git_describe_long(pieces): def render(pieces, style): """Render the given version pieces into the requested style.""" if pieces["error"]: - return {"version": "unknown", - "full-revisionid": pieces.get("long"), - "dirty": None, - "error": pieces["error"], - "date": None} + return { + "version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None, + } if not style or style == "default": style = "pep440" # the default @@ -1393,9 +1427,13 @@ def render(pieces, style): else: raise ValueError("unknown style '%s'" % style) - return {"version": rendered, "full-revisionid": pieces["long"], - "dirty": pieces["dirty"], "error": None, - "date": pieces.get("date")} + return { + "version": rendered, + "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], + "error": None, + "date": pieces.get("date"), + } class VersioneerBadRootError(Exception): @@ -1418,8 +1456,9 @@ def get_versions(verbose=False): handlers = HANDLERS.get(cfg.VCS) assert handlers, "unrecognized VCS '%s'" % cfg.VCS verbose = verbose or cfg.verbose - assert cfg.versionfile_source is not None, \ - "please set versioneer.versionfile_source" + assert ( + cfg.versionfile_source is not None + ), "please set versioneer.versionfile_source" assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" versionfile_abs = os.path.join(root, cfg.versionfile_source) @@ -1473,9 +1512,13 @@ def get_versions(verbose=False): if verbose: print("unable to compute version") - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, "error": "unable to compute version", - "date": None} + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", + "date": None, + } def get_version(): @@ -1528,6 +1571,7 @@ def run(self): print(" date: %s" % vers.get("date")) if vers["error"]: print(" error: %s" % vers["error"]) + cmds["version"] = cmd_version # we override "build_py" in both distutils and setuptools @@ -1546,8 +1590,8 @@ def run(self): # setup.py egg_info -> ? # we override different "build_py" commands for both environments - if 'build_py' in cmds: - _build_py = cmds['build_py'] + if "build_py" in cmds: + _build_py = cmds["build_py"] elif "setuptools" in sys.modules: from setuptools.command.build_py import build_py as _build_py else: @@ -1562,10 +1606,10 @@ def run(self): # now locate _version.py in the new build/ directory and replace # it with an updated value if cfg.versionfile_build: - target_versionfile = os.path.join(self.build_lib, - cfg.versionfile_build) + target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build) print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, versions) + cmds["build_py"] = cmd_build_py if "setuptools" in sys.modules: @@ -1587,14 +1631,15 @@ def run(self): return # now locate _version.py in the new build/ directory and replace # it with an updated value - target_versionfile = os.path.join(self.build_lib, - cfg.versionfile_source) + target_versionfile = os.path.join(self.build_lib, cfg.versionfile_source) print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, versions) + cmds["build_ext"] = cmd_build_ext if "cx_Freeze" in sys.modules: # cx_freeze enabled? from cx_Freeze.dist import build_exe as _build_exe + # nczeczulin reports that py2exe won't like the pep440-style string # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g. # setup(console=[{ @@ -1615,17 +1660,21 @@ def run(self): os.unlink(target_versionfile) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] - f.write(LONG % - {"DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - }) + f.write( + LONG + % { + "DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + } + ) + cmds["build_exe"] = cmd_build_exe del cmds["build_py"] - if 'py2exe' in sys.modules: # py2exe enabled? + if "py2exe" in sys.modules: # py2exe enabled? from py2exe.distutils_buildexe import py2exe as _py2exe class cmd_py2exe(_py2exe): @@ -1641,18 +1690,22 @@ def run(self): os.unlink(target_versionfile) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] - f.write(LONG % - {"DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - }) + f.write( + LONG + % { + "DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + } + ) + cmds["py2exe"] = cmd_py2exe # we override different "sdist" commands for both environments - if 'sdist' in cmds: - _sdist = cmds['sdist'] + if "sdist" in cmds: + _sdist = cmds["sdist"] elif "setuptools" in sys.modules: from setuptools.command.sdist import sdist as _sdist else: @@ -1676,8 +1729,10 @@ def make_release_tree(self, base_dir, files): # updated value target_versionfile = os.path.join(base_dir, cfg.versionfile_source) print("UPDATING %s" % target_versionfile) - write_to_version_file(target_versionfile, - self._versioneer_generated_versions) + write_to_version_file( + target_versionfile, self._versioneer_generated_versions + ) + cmds["sdist"] = cmd_sdist return cmds @@ -1732,11 +1787,13 @@ def do_setup(): root = get_root() try: cfg = get_config_from_root(root) - except (EnvironmentError, configparser.NoSectionError, - configparser.NoOptionError) as e: + except ( + EnvironmentError, + configparser.NoSectionError, + configparser.NoOptionError, + ) as e: if isinstance(e, (EnvironmentError, configparser.NoSectionError)): - print("Adding sample versioneer config to setup.cfg", - file=sys.stderr) + print("Adding sample versioneer config to setup.cfg", file=sys.stderr) with open(os.path.join(root, "setup.cfg"), "a") as f: f.write(SAMPLE_CONFIG) print(CONFIG_ERROR, file=sys.stderr) @@ -1745,15 +1802,18 @@ def do_setup(): print(" creating %s" % cfg.versionfile_source) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] - f.write(LONG % {"DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - }) - - ipy = os.path.join(os.path.dirname(cfg.versionfile_source), - "__init__.py") + f.write( + LONG + % { + "DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + } + ) + + ipy = os.path.join(os.path.dirname(cfg.versionfile_source), "__init__.py") if os.path.exists(ipy): try: with open(ipy, "r") as f: @@ -1795,8 +1855,10 @@ def do_setup(): else: print(" 'versioneer.py' already in MANIFEST.in") if cfg.versionfile_source not in simple_includes: - print(" appending versionfile_source ('%s') to MANIFEST.in" % - cfg.versionfile_source) + print( + " appending versionfile_source ('%s') to MANIFEST.in" + % cfg.versionfile_source + ) with open(manifest_in, "a") as f: f.write("include %s\n" % cfg.versionfile_source) else: