From c1342d4d0fc2ed5e35854a9bccd4bc8a6206cc7b Mon Sep 17 00:00:00 2001 From: Zi-Ning Choo Date: Fri, 30 Apr 2021 16:00:10 -0400 Subject: [PATCH 01/35] cleanup --- .gitignore | 1 + src/Makevars | 3 - src/Makevars.win | 7 - src/Rcplex2.c | 375 --------------------------------- src/Rcplex2.h | 28 --- src/Rcplex_params.c | 261 ----------------------- src/Rcplex_utils.c | 102 --------- srcs/src.blank/Makevars | 3 - srcs/src.blank/Makevars.in | 3 - srcs/src.blank/Makevars.win | 7 - srcs/src.blank/Rcplex2.c | 23 -- srcs/src.blank/Rcplex2.h | 6 - srcs/src.rcplex/Rcplex2.c | 2 + srcs/src.rcplex/Rcplex2.h | 10 +- srcs/src.rcplex/Rcplex_utils.c | 5 + 15 files changed, 13 insertions(+), 823 deletions(-) delete mode 100644 src/Makevars delete mode 100644 src/Makevars.win delete mode 100644 src/Rcplex2.c delete mode 100644 src/Rcplex2.h delete mode 100644 src/Rcplex_params.c delete mode 100644 src/Rcplex_utils.c delete mode 100644 srcs/src.blank/Makevars delete mode 100644 srcs/src.blank/Makevars.in delete mode 100644 srcs/src.blank/Makevars.win delete mode 100644 srcs/src.blank/Rcplex2.c delete mode 100644 srcs/src.blank/Rcplex2.h diff --git a/.gitignore b/.gitignore index eb8b3afc..05fc351c 100644 --- a/.gitignore +++ b/.gitignore @@ -71,6 +71,7 @@ docs/tutorial_cache/html/unnamed-chunk-3_a81070afea2cd322305d016d8dbace33.RData docs/tutorial_cache/html/unnamed-chunk-3_a81070afea2cd322305d016d8dbace33.rdb docs/debug.html docs/tutorial_2_files/figure-html/duplications and pyrgo-1.png +src/ tests/testthat/circos.pdf tests/testthat/test.json tests/testthat/test.pdf diff --git a/src/Makevars b/src/Makevars deleted file mode 100644 index 9d19401f..00000000 --- a/src/Makevars +++ /dev/null @@ -1,3 +0,0 @@ -PKG_CFLAGS=-m64 -fPIC -fno-strict-aliasing -PKG_CPPFLAGS=-I${CPLEX_DIR}/cplex/include -PKG_LIBS=-L${CPLEX_DIR}/cplex/lib/x86-64_linux/static_pic -lcplex -lm -lpthread diff --git a/src/Makevars.win b/src/Makevars.win deleted file mode 100644 index 5c9fb6d9..00000000 --- a/src/Makevars.win +++ /dev/null @@ -1,7 +0,0 @@ -ifeq "$(WIN)" "64" -PKG_LIBS = -L"${CPLEX_DIR}/bin/x64_win64" -lcplex1263 -lm -PKG_CPPFLAGS = -D_LP64 -I"${CPLEX_DIR}/include" -else -PKG_LIBS = -L"${CPLEX_DIR}/bin/x86_win32" -lcplex1263 -lm -PKG_CPPFLAGS = -I"${CPLEX_DIR}/include" -endif diff --git a/src/Rcplex2.c b/src/Rcplex2.c deleted file mode 100644 index 83c922b3..00000000 --- a/src/Rcplex2.c +++ /dev/null @@ -1,375 +0,0 @@ -// The actual solving procedure is called using the function Rcplex -#include "Rcplex2.h" - -int max_numcalls; - -SEXP Rcplex2(SEXP numcols_p, - SEXP numrows_p, - SEXP objsen_p, - SEXP cvec, - SEXP bvec, - SEXP Amat, - SEXP Qmat, - SEXP lb_p, - SEXP ub_p, - SEXP Rsense, - SEXP Rvtype, - SEXP isQP_p, - SEXP isMIP_p, - SEXP num_poplim, - SEXP control, - SEXP maxcalls, - SEXP tuning) -{ - char *probname = "Rcplex"; - int numcols = INTEGER(numcols_p)[0]; - int numrows = INTEGER(numrows_p)[0]; - int objsen = INTEGER(objsen_p)[0]; - double *lb = REAL(lb_p); - double *ub = REAL(ub_p); - - char sense[numrows]; - char vtype[numcols]; - // double dj[numrows]; - int isQP = INTEGER(isQP_p)[0]; - int isMIP = INTEGER(isMIP_p)[0]; - int trace = INTEGER(getListElement(control,"trace"))[0]; - - SEXP res = NULL; /* set to avoid uninitialized warning */ - SEXP xopt; - SEXP epgap; - SEXP obj; - SEXP solstat; - SEXP extra; - SEXP lambda; - SEXP slack; - SEXP nodecnt; - - int status; - int i, j; - int cur_numrows, cur_numcols; - - int tstat; - /* - * solution pools not supprted until cplex 11.0 - */ -#if CPX_VERSION >= 1100 - int num_sol = 1; - SEXP tmp; -#endif - - /* set maxnumcalls before init */ - max_numcalls = INTEGER(maxcalls)[0]; - /* initialize CPLEX environment */ - Rcplex_init(); - - if(trace) Rprintf("Rcplex: num variables=%d num constraints=%d\n",numcols,numrows); - - /* - * solution pools not supported until cplex 11.0 - */ -#if CPX_VERSION < 1100 - if (INTEGER(num_poplim)[0] > 1) { - warning("Multiple solutions not supported in CPLEX version"); - INTEGER(num_poplim)[0] = 1; - } -#endif - - /* lb and ub */ - for (j = 0; j < numcols; ++j) { - lb[j] = R_finite(lb[j]) ? lb[j] : -CPX_INFBOUND; - ub[j] = R_finite(ub[j]) ? ub[j] : CPX_INFBOUND; - } - - /* set constraint inequality directions */ - for (i = 0; i < numrows; ++i) { - sense[i] = CHAR(STRING_ELT(Rsense, i))[0]; - } - - /* set variable types */ - if (isMIP) { - for (i = 0; i < numcols; ++i) { - vtype[i] = CHAR(STRING_ELT(Rvtype, i))[0]; - } - } - - /* set parameters given in control list */ - setparams(env, control, isQP, isMIP); - lp = CPXcreateprob(env, &status, probname); - - - /* check memory problems */ - if (lp == NULL) { - my_error(("Failed to create LP.\n")); - } - - /* copy problem data into lp */ - status = CPXcopylp(env, lp, numcols, numrows, objsen,REAL(cvec), - REAL(bvec), sense, - INTEGER(VECTOR_ELT(Amat, 0)), - INTEGER(VECTOR_ELT(Amat, 1)), - INTEGER(VECTOR_ELT(Amat, 2)), - REAL(VECTOR_ELT(Amat, 3)), - lb, ub, NULL); - if (status) { - my_error(("Failed to copy problem data.\n")); - } - - if (isQP) { - status = CPXcopyquad(env, lp, - INTEGER(VECTOR_ELT(Qmat, 0)), - INTEGER(VECTOR_ELT(Qmat, 1)), - INTEGER(VECTOR_ELT(Qmat, 2)), - REAL(VECTOR_ELT(Qmat, 3))); - if (status) { - my_error(("Failed to copy quadratic term of problem data.\n")); - } - } - - if (isMIP) { - status = CPXcopyctype(env, lp, vtype); - if (status) { - my_error(("Failed to copy vtype.\n")); - } - } - - /* solve problem */ - if(isMIP) { - - /* MARCIN ADDED set starts */ - setstarts(env, lp, control, isMIP); - - int do_tuning = asLogical(tuning); - if (do_tuning){ - /* set the tuning total tilim to hard-setted value 600s */ - SEXP tuning_control = setListElement(control, "tilim", 600); - /* temporarily change the tilim*/ - setparams(env, tuning_control, isQP, isMIP); - - /* tune the hidden parameters */ - status = CPXtuneparam(env, lp, - 0, NULL, NULL, // zero fixed int par - 0, NULL, NULL, - 0, NULL, NULL, &tstat); - if (status) { - my_error(("Failed to tune params.")); - } - /* Write the optimized parameters to file */ - status = CPXwriteparam (env, "tuned.prm"); - - setparams(env, control, isQP, isMIP); - /* MARCIN ADDED set starts */ - setstarts(env, lp, control, isMIP); - } else { - /* Write the optimized parameters to file */ - status = CPXwriteparam (env, "raw.prm"); - } - - status = CPXmipopt(env, lp); - - /* - * solutions pool not supported for versions of cplex < 11 - * e.g., CPX_PARAM_POPULATELIM is not defined - */ - -#if CPX_VERSION >= 1100 - if(INTEGER(num_poplim)[0] > 1){ - /* in MIPs it is possible to have more than 1 solution. If - num_poplim > 1 we now try to find these solutions. - For populating the solution pool a couple of parameters are - relevant: - 1. the 'absolute gap' solution pool parameter - CPX_PARAM_SOLNPOOLAGAP (see Rcplex C control parameters), - 2. the 'solution pool intensity' parameter - CPX_PARAM_SOLNPOOLINTENSITY (see Rcplex C control parameters), - 3. the 'limit on number of solutions generated' for solution - pool CPX_PARAM_POPULATELIM */ - status = CPXsetintparam(env, CPX_PARAM_POPULATELIM, - INTEGER(num_poplim)[0] - 1); - if (status) { - my_error(("Failed to set 'populate limit' parameter.\n")); - } - /* now populate the solutions pool */ - status = CPXpopulate(env, lp); - } -#endif - } - else if (isQP) { - status = CPXqpopt(env, lp); - } - else { - status = CPXlpopt(env, lp); - } - - /* a status value of zero does not necessarily mean that an optimal - solution exists. Examples of an exit status of non-zero here - include CPXERR_NO_PROBLEM, CPXERR_NO_MEMORY, ... */ - if (status) { - my_error(("Failed to optimize problem.")); - } - - PROTECT(solstat = allocVector(INTSXP, 1)); - - /* retrieve status of optimization */ - *INTEGER(solstat) = CPXgetstat(env, lp); - - if (isMIP) { - /* MIP optimization */ - /* in MIPs it is possible to have more than 1 solution - FIXME: use solution pool by default? */ - - cur_numrows = CPXgetnumrows(env, lp); - cur_numcols = CPXgetnumcols(env, lp); - - /* if the 'n' parameter is set in R we always return a list with - a number of elements equal to the number of solutions */ - if(INTEGER(num_poplim)[0] > 1){ - /* - * solution pools not supported until cplex 11.0 - */ -#if CPX_VERSION >= 1100 - num_sol = CPXgetsolnpoolnumsolns(env, lp); - - /* MIP optimization results -> more than 1 solution */ - PROTECT(res = allocVector(VECSXP, num_sol)); - - /* now retrieve multiple solutions if any */ - for( i = 0; i < num_sol; i++){ - PROTECT(tmp = allocVector(VECSXP, 4)); - PROTECT(xopt = allocVector(REALSXP, cur_numcols)); - status = CPXgetsolnpoolx(env, lp, i, REAL(xopt), 0, cur_numcols - 1); - - SET_VECTOR_ELT(tmp, 0, xopt); - - PROTECT(obj = allocVector(REALSXP, 1)); - status = CPXgetsolnpoolobjval(env, lp, i, REAL(obj)); - /* if no integer solution exists, return NA */ - if (status) { - *REAL(obj) = NA_REAL; - } - SET_VECTOR_ELT(tmp, 1, obj); - - /* extra info */ - PROTECT(slack = allocVector(REALSXP, cur_numrows)); - status = CPXgetmipslack(env, lp, REAL(slack), 0, cur_numrows - 1); - PROTECT(nodecnt = allocVector(INTSXP, 1)); - *INTEGER(nodecnt) = CPXgetnodecnt(env, lp); - - SET_VECTOR_ELT(tmp, 2, solstat); - - PROTECT(extra = allocVector(VECSXP, 2)); - SET_VECTOR_ELT(extra, 0, nodecnt); - SET_VECTOR_ELT(extra, 1, slack); - SET_VECTOR_ELT(tmp, 3, extra); - - /* add solution to return vector */ - SET_VECTOR_ELT(res, i, tmp); - - UNPROTECT(6); - } /* END FOR */ -#endif /* end #if CPX_VERSION >= 1100 */ - - } /* END multiple solutions */ - else { - /* MIP optimization 1 solution */ - PROTECT(res = allocVector(VECSXP, 5)); - PROTECT(obj = allocVector(REALSXP, 1)); - PROTECT(xopt = allocVector(REALSXP, numcols)); - PROTECT(epgap = allocVector(REALSXP, 1)); /* added by Marcin */ - PROTECT(extra = allocVector(VECSXP, 2)); - PROTECT(slack = allocVector(REALSXP, numrows)); - - status = CPXgetmipobjval(env, lp, REAL(obj)); - /* if no integer solution exists, return NA */ - if (status) { - *REAL(obj) = NA_REAL; - } - - status = CPXgetmipx(env, lp, REAL(xopt), 0, cur_numcols - 1); - if (status) { - for(i = 0; i < cur_numcols; i++) - REAL(xopt)[i] = NA_REAL; - } - - status = CPXgetmipslack(env, lp, REAL(slack), 0, cur_numrows - 1); - if (status) { - for(i = 0; i < cur_numrows; i++) - REAL(slack)[i] = NA_REAL; - } - - status = CPXgetmiprelgap(env, lp, REAL(epgap)); /* added by Marcin */ - - /* Provide some little extra information */ - PROTECT(nodecnt = allocVector(INTSXP, 1)); - *INTEGER(nodecnt) = CPXgetnodecnt(env, lp); - SET_VECTOR_ELT(extra, 0, nodecnt); - SET_VECTOR_ELT(extra, 1, slack); - - /* Create return vector for MIP 1 solution*/ - SET_VECTOR_ELT(res, 0, xopt); - SET_VECTOR_ELT(res, 1, obj); - SET_VECTOR_ELT(res, 2, solstat); - SET_VECTOR_ELT(res, 3, extra); - SET_VECTOR_ELT(res, 4, epgap); /* added by Marcin */ - - UNPROTECT(6); - } /* END MIP optimization 1 solution */ - } /* END MIP optimization */ - else { - /* continuous optimization */ - PROTECT(obj = allocVector(REALSXP, 1)); - PROTECT(xopt = allocVector(REALSXP, numcols)); - PROTECT(extra = allocVector(VECSXP, 2)); - PROTECT(slack = allocVector(REALSXP, numrows)); - - status = CPXgetobjval(env, lp, REAL(obj)); - if (status) { - *REAL(obj) = NA_REAL; - } - - cur_numrows = CPXgetnumrows(env, lp); - cur_numcols = CPXgetnumcols(env, lp); - status = CPXgetx(env, lp, REAL(xopt), 0, cur_numcols - 1); - if (status) { - for(i = 0; i < cur_numcols; i++) - REAL(xopt)[i] = NA_REAL; - } - - status = CPXgetslack(env, lp, REAL(slack), 0, cur_numrows - 1); - if (status) { - for(i = 0; i < cur_numrows; i++) - REAL(slack)[i] = NA_REAL; - } - - /* Provide some little extra information */ - PROTECT(lambda = allocVector(REALSXP, numrows)); - status = CPXgetpi(env, lp, REAL(lambda), 0, cur_numrows - 1); - if (status) { - for(i = 0; i < cur_numrows; i++) - REAL(lambda)[i] = NA_REAL; - } - SET_VECTOR_ELT(extra, 0, lambda); - SET_VECTOR_ELT(extra, 1, slack); - - /* Create return vector for continuous solution */ - PROTECT(res = allocVector(VECSXP,4)); - SET_VECTOR_ELT(res, 0, xopt); - SET_VECTOR_ELT(res, 1, obj); - SET_VECTOR_ELT(res, 2, solstat); - SET_VECTOR_ELT(res, 3, extra); - - UNPROTECT(5); - } /* END IF continuous optimization */ - - - UNPROTECT(2); /* unprotect return-vector res and solstat */ - - /* Reset all CPLEX parameters to default values */ - status = CPXsetdefaults(env); - if (status) { - Rprintf("Status: %d", status); - my_error(("Failed to set parameters to default.\n")); - } - return(res); -} - diff --git a/src/Rcplex2.h b/src/Rcplex2.h deleted file mode 100644 index f155a60d..00000000 --- a/src/Rcplex2.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef _RCPLEX_H -#define _RCPLEX_H - -#include -#include -#include -#include -#include - -#define my_error(x) { forceCplxClose = 1; error x; } - -/* Global Variables */ -extern CPXENVptr env; -extern CPXLPptr lp; -extern int numcalls; -extern int max_numcalls; -extern int forceCplxClose; - -/* Function definitions */ -SEXP getListElement( SEXP, char* ); -SEXP setListElement( SEXP, char*, double); -void setparams( CPXENVptr, SEXP, int, int ); -void setstarts( CPXENVptr, CPXLPptr, SEXP, int ); -void Rcplex_init( void ); -void Rcplex_close( void ); -void Rcplex_free( void ); - -#endif diff --git a/src/Rcplex_params.c b/src/Rcplex_params.c deleted file mode 100644 index 45874ed6..00000000 --- a/src/Rcplex_params.c +++ /dev/null @@ -1,261 +0,0 @@ -#include "Rcplex2.h" - -void setparams(CPXENVptr env, SEXP control, int isQP, int isMIP) { - int i, status, value; - const char *cur_parm; - SEXP names; - - /* get list names */ - PROTECT(names = getAttrib(control, R_NamesSymbol)); - - status = 1; /* avoid warning */ - - /* for each element in 'control' try to set the corresponding - parameter */ - for (i = 0; i < length(control); i++) { - - cur_parm = CHAR(STRING_ELT(names, i)); - - /* trace - CPX_PARAM_SCRIND */ - if(strcmp(cur_parm, "trace") == 0) { - status = CPXsetintparam(env, CPX_PARAM_SCRIND, - *INTEGER(VECTOR_ELT(control, i)) ? CPX_ON : CPX_OFF); - } - /* method */ - else if(strcmp(cur_parm, "method") == 0) { - switch((value = *INTEGER(VECTOR_ELT(control, i)))) { - case 0: - value = CPX_ALG_AUTOMATIC; - break; - case 1: - value = CPX_ALG_PRIMAL; - break; - case 2: - value = CPX_ALG_DUAL; - break; - case 3: - value = CPX_ALG_NET; - break; - case 4: - value = CPX_ALG_BARRIER; - break; - default: - warning("Unknown optimization method %d, using default\n", value); - value = CPX_ALG_AUTOMATIC; - } - /* Do we have a QP or a LP?*/ - if (isQP) - status = CPXsetintparam(env, CPX_PARAM_QPMETHOD, value); - else - status = CPXsetintparam(env, CPX_PARAM_LPMETHOD, value); - } - else if(strcmp(cur_parm, "preind") == 0) { - status = CPXsetintparam(env, CPX_PARAM_PREIND, - *INTEGER(VECTOR_ELT(control, i)) ? CPX_ON : CPX_OFF); - } - else if(strcmp(cur_parm, "aggind") == 0) { - status = CPXsetintparam(env, CPX_PARAM_AGGIND, - *INTEGER(VECTOR_ELT(control, i))); - } - else if(strcmp(cur_parm, "itlim") == 0) { - status = CPXsetintparam(env, CPX_PARAM_ITLIM, - *INTEGER(VECTOR_ELT(control, i))); - } - else if(strcmp(cur_parm, "epgap") == 0) { - status = CPXsetdblparam(env, CPX_PARAM_EPGAP, - *REAL(VECTOR_ELT(control, i))); - } - else if(strcmp(cur_parm, "epagap") == 0) { - status = CPXsetdblparam(env, CPX_PARAM_EPAGAP, - *REAL(VECTOR_ELT(control, i))); - } - else if(strcmp(cur_parm,"tilim") == 0) { - status = CPXsetdblparam(env, CPX_PARAM_TILIM, - *REAL(VECTOR_ELT(control, i))); - } - else if(strcmp(cur_parm, "mipemphasis") == 0) { - switch((value = *INTEGER(VECTOR_ELT(control, i)))) { - case 0: - value = CPX_MIPEMPHASIS_BALANCED; - break; - case 1: - value = CPX_MIPEMPHASIS_FEASIBILITY; - break; - case 2: - value = CPX_MIPEMPHASIS_OPTIMALITY; - break; - case 3: - value = CPX_MIPEMPHASIS_BESTBOUND; - break; - case 4: - value = CPX_MIPEMPHASIS_HIDDENFEAS; - break; - default: - warning("Unknown mip emphasis setting %d, using default\n", value); - value = CPX_MIPEMPHASIS_BALANCED; - } - status = CPXsetintparam(env, CPX_PARAM_MIPEMPHASIS, value); - } - else if(strcmp(cur_parm, "disjcuts") == 0) { - status = CPXsetintparam(env, CPX_PARAM_DISJCUTS, - *INTEGER(VECTOR_ELT(control,i))); - } - else if(strcmp(cur_parm, "cliques") == 0) { - status = CPXsetintparam(env, CPX_PARAM_CLIQUES, - *INTEGER(VECTOR_ELT(control, i))); - } - else if(strcmp(cur_parm,"nodesel") == 0) { - switch((value = *INTEGER(VECTOR_ELT(control, i)))) { - case 0: - value = CPX_NODESEL_DFS; - break; - case 1: - value = CPX_NODESEL_BESTBOUND; - break; - case 2: - value = CPX_NODESEL_BESTEST; - break; - case 3: - value = CPX_NODESEL_BESTEST_ALT; - break; - default: - warning("Unknown node selection strategy %d, using default\n", value); - value = CPX_NODESEL_BESTBOUND; - } - status = CPXsetintparam(env, CPX_PARAM_NODESEL, value); - } - else if(strcmp(cur_parm, "probe") == 0) { - status = CPXsetintparam(env, CPX_PARAM_PROBE, - *INTEGER(VECTOR_ELT(control, i))); - } - else if(strcmp(cur_parm, "varsel") == 0) { - switch((value = *INTEGER(VECTOR_ELT(control, i)))) { - case -1: - value = CPX_VARSEL_MININFEAS; - break; - case 0: - value = CPX_VARSEL_DEFAULT; - break; - case 1: - value = CPX_VARSEL_MAXINFEAS; - break; - case 2: - value = CPX_VARSEL_PSEUDO; - break; - case 3: - value = CPX_VARSEL_STRONG; - break; - case 4: - value = CPX_VARSEL_PSEUDOREDUCED; - break; - default: - warning("Unknown variable selection strategy %d, using default\n", - value); - value = CPX_VARSEL_DEFAULT; - } - status = CPXsetintparam(env, CPX_PARAM_VARSEL, value); - } - else if(strcmp(cur_parm, "flowcovers") == 0) { - status = CPXsetintparam(env, CPX_PARAM_FLOWCOVERS, - *INTEGER(VECTOR_ELT(control, i))); - } - else if(strcmp(cur_parm, "mipstart") == 0) { - } - else if(strcmp(cur_parm, "solnpoolagap") == 0){ - /* solution pool parameters */ - #if CPX_VERSION >= 1100 - status = CPXsetdblparam(env, CPX_PARAM_SOLNPOOLAGAP, - *REAL(VECTOR_ELT(control, i))); - #endif - } - else if(strcmp(cur_parm, "solnpoolgap") == 0){ - /* solution pool parameters */ - #if CPX_VERSION >= 1100 - status = CPXsetdblparam(env, CPX_PARAM_SOLNPOOLGAP, - *REAL(VECTOR_ELT(control, i))); - #endif - } - else if(strcmp(cur_parm, "solnpoolintensity") == 0){ - #if CPX_VERSION >= 1100 - status = CPXsetintparam(env, CPX_PARAM_SOLNPOOLINTENSITY, - *INTEGER(VECTOR_ELT(control, i))); - #endif - } - else if (strcmp(cur_parm, "breaksymmetry") == 0){ /*ADDED BY XT*/ - #if CPX_VERSION >= 1100 - status = CPXsetintparam(env, CPX_PARAM_SYMMETRY, - *INTEGER(VECTOR_ELT(control, i))); - #endif - } else if (strcmp(cur_parm, "tuning.display") == 0){ - #if CPX_VERSION >= 1100 - status = CPXsetintparam(env, CPX_PARAM_TUNINGDISPLAY, - *INTEGER(VECTOR_ELT(control, i))); - #endif - } else if (strcmp(cur_parm, "tuning.rep") == 0){ - #if CPX_VERSION >= 1100 - status = CPXsetintparam(env, CPX_PARAM_TUNINGREPEAT, - *INTEGER(VECTOR_ELT(control, i))); - #endif - } else if (strcmp(cur_parm, "tuning.tilim") == 0){ - #if CPX_VERSION >= 1100 - status = CPXsetdblparam(env, CPX_PARAM_TUNINGTILIM, - *REAL(VECTOR_ELT(control, i))); - #endif - } - else { - /* If parameter not known print a warning */ - warning("Unknown CPLEX parameter %s. Ignoring it.\n", cur_parm); - } - - if (status) - my_error(("Failure to set parameter %s, error %d.\n", cur_parm, status)); - } - - UNPROTECT(1); -} - -/* ADDED BY MARCIN!! - * - */ -void setstarts(CPXENVptr env, CPXLPptr lp, SEXP control, int isMIP) { - int i, status, value; - const char *cur_parm; - SEXP names; - - /* get list names */ - PROTECT(names = getAttrib(control, R_NamesSymbol)); - - status = 1; /* avoid warning */ - - /* for each element in 'control' try to set the corresponding - parameter */ - if (isMIP) { - for (i = 0; i < length(control); i++) { - - cur_parm = CHAR(STRING_ELT(names, i)); - - if(strcmp(cur_parm, "mipstart") == 0) { - SEXP values_vec = VECTOR_ELT(control, i); - double* values = REAL(values_vec); - int mcnt = 1; - int beg = 0; - int nzcnt = LENGTH(values_vec); - SEXP tmp = allocVector(INTSXP, nzcnt); - int *varindices = INTEGER(tmp); - int effortlevel = CPX_MIPSTART_AUTO; - char *mipstartname = "start"; - - for (int i = 0; i < nzcnt; i++) { - varindices[i] = i; - } - - status = CPXaddmipstarts(env, lp, mcnt, nzcnt, &beg, varindices, values, &effortlevel, &mipstartname); - - if (status) - my_error(("Failure to set parameter %s in setstarts, error %d.\n", cur_parm, status)); - } - } - } - - UNPROTECT(1); -} diff --git a/src/Rcplex_utils.c b/src/Rcplex_utils.c deleted file mode 100644 index b71f1a91..00000000 --- a/src/Rcplex_utils.c +++ /dev/null @@ -1,102 +0,0 @@ -// Utility functions like CPLEX initialization/closing routines and -// list accessors - -#include "Rcplex2.h" -#include - -CPXENVptr env; -CPXLPptr lp; -int numcalls; -int forceCplxClose; - -void Rcplex_wait (int seconds) { - clock_t endwait; - endwait = clock() + seconds * CLOCKS_PER_SEC; - while (clock() < endwait) {} -} - -void Rcplex_init(void) { - int numtries = 10; - int status; - char errmsg[1024]; - - /* Initialize CPLEX environment */ - if (env == NULL) { - env = CPXopenCPLEX (&status); - while(env == NULL && numtries > 0) { - Rcplex_wait(30); - numtries--; - } - if (env == NULL) { - CPXgeterrorstring(env, status, errmsg); - error("Could not open CPLEX environment.\n%s\n",errmsg); - } - REprintf("CPLEX environment opened\n"); - numcalls = max_numcalls; - } - else { - numcalls--; - } - forceCplxClose = 0; -} - -void Rcplex_close(void) { - forceCplxClose = 1; - Rcplex_free(); -} - -void Rcplex_free(void) { - int status1, status2; - char errmsg[1024]; - - status1 = status2 = 0; - if (lp != NULL) { - status1 = CPXfreeprob(env,&lp); - /* REprintf("Freed CPLEX problem\n");*/ - lp = NULL; - } - - if (env != NULL && (numcalls == 0 || forceCplxClose)) { - status2 = CPXcloseCPLEX(&env); - REprintf("Closed CPLEX environment\n"); - env = NULL; - } - - if (status1 || status2) { - status2 ? strcpy(errmsg,"env close ok") : CPXgeterrorstring(env,status2,errmsg); - error("Rcplex_free failed: free problem code: %d\nClose environment msg: %s\n",status1,errmsg); - } - return; -} - -SEXP getListElement(SEXP list, char *str) { - SEXP element = R_NilValue, names = getAttrib(list, R_NamesSymbol); - int i; - - for (i=0; i < length(list); i++) { - if (strcmp(CHAR(STRING_ELT(names,i)), str) == 0) { - element = VECTOR_ELT(list,i); - break; - } - } - return element; -} - -SEXP setListElement(SEXP list, char* str, double value) { - /* SEXP element = R_NilValue, */ - SEXP out = allocVector(VECSXP, length(list)); - SEXP names = getAttrib(list, R_NamesSymbol); - - setAttrib(out, R_NamesSymbol, names); - int i; - - for (i=0; i < length(list); i++) { - if (strcmp(CHAR(STRING_ELT(names,i)), str) == 0) { - /* element = VECTOR_ELT(list,i); */ - SET_VECTOR_ELT(out, i, ScalarReal(value)); - } else { - SET_VECTOR_ELT(out, i, VECTOR_ELT(list, i)); - } - } - return out; -} diff --git a/srcs/src.blank/Makevars b/srcs/src.blank/Makevars deleted file mode 100644 index 912a1707..00000000 --- a/srcs/src.blank/Makevars +++ /dev/null @@ -1,3 +0,0 @@ -PKG_CFLAGS=-m64 -fPIC -fno-strict-aliasing -PKG_CPPFLAGS=-I${CPLEX_DIR}/cplex/include -PKG_LIBS=-lm -lpthread diff --git a/srcs/src.blank/Makevars.in b/srcs/src.blank/Makevars.in deleted file mode 100644 index f03b4d39..00000000 --- a/srcs/src.blank/Makevars.in +++ /dev/null @@ -1,3 +0,0 @@ -PKG_CFLAGS=@PKG_CFLAGS@ -PKG_CPPFLAGS=@PKG_CPPFLAGS@ -PKG_LIBS=@PKG_LIBS@ diff --git a/srcs/src.blank/Makevars.win b/srcs/src.blank/Makevars.win deleted file mode 100644 index 5c9fb6d9..00000000 --- a/srcs/src.blank/Makevars.win +++ /dev/null @@ -1,7 +0,0 @@ -ifeq "$(WIN)" "64" -PKG_LIBS = -L"${CPLEX_DIR}/bin/x64_win64" -lcplex1263 -lm -PKG_CPPFLAGS = -D_LP64 -I"${CPLEX_DIR}/include" -else -PKG_LIBS = -L"${CPLEX_DIR}/bin/x86_win32" -lcplex1263 -lm -PKG_CPPFLAGS = -I"${CPLEX_DIR}/include" -endif diff --git a/srcs/src.blank/Rcplex2.c b/srcs/src.blank/Rcplex2.c deleted file mode 100644 index f649d71b..00000000 --- a/srcs/src.blank/Rcplex2.c +++ /dev/null @@ -1,23 +0,0 @@ -// The actual solving procedure is called using the function Rcplex -#include "Rcplex2.h" - -SEXP Rcplex2(SEXP numcols_p, - SEXP numrows_p, - SEXP objsen_p, - SEXP cvec, - SEXP bvec, - SEXP Amat, - SEXP Qmat, - SEXP lb_p, - SEXP ub_p, - SEXP Rsense, - SEXP Rvtype, - SEXP isQP_p, - SEXP isMIP_p, - SEXP num_poplim, - SEXP control, - SEXP maxcalls) -{ - return(numcols_p); -} - diff --git a/srcs/src.blank/Rcplex2.h b/srcs/src.blank/Rcplex2.h deleted file mode 100644 index 2062eb2e..00000000 --- a/srcs/src.blank/Rcplex2.h +++ /dev/null @@ -1,6 +0,0 @@ -#include -#include -#include -#include - - diff --git a/srcs/src.rcplex/Rcplex2.c b/srcs/src.rcplex/Rcplex2.c index 0862d38a..83c922b3 100644 --- a/srcs/src.rcplex/Rcplex2.c +++ b/srcs/src.rcplex/Rcplex2.c @@ -1,6 +1,8 @@ // The actual solving procedure is called using the function Rcplex #include "Rcplex2.h" +int max_numcalls; + SEXP Rcplex2(SEXP numcols_p, SEXP numrows_p, SEXP objsen_p, diff --git a/srcs/src.rcplex/Rcplex2.h b/srcs/src.rcplex/Rcplex2.h index 33bf5aa6..f155a60d 100644 --- a/srcs/src.rcplex/Rcplex2.h +++ b/srcs/src.rcplex/Rcplex2.h @@ -10,11 +10,11 @@ #define my_error(x) { forceCplxClose = 1; error x; } /* Global Variables */ -CPXENVptr env; -CPXLPptr lp; -int numcalls; -int max_numcalls; -int forceCplxClose; +extern CPXENVptr env; +extern CPXLPptr lp; +extern int numcalls; +extern int max_numcalls; +extern int forceCplxClose; /* Function definitions */ SEXP getListElement( SEXP, char* ); diff --git a/srcs/src.rcplex/Rcplex_utils.c b/srcs/src.rcplex/Rcplex_utils.c index 4f15ed4c..b71f1a91 100644 --- a/srcs/src.rcplex/Rcplex_utils.c +++ b/srcs/src.rcplex/Rcplex_utils.c @@ -4,6 +4,11 @@ #include "Rcplex2.h" #include +CPXENVptr env; +CPXLPptr lp; +int numcalls; +int forceCplxClose; + void Rcplex_wait (int seconds) { clock_t endwait; endwait = clock() + seconds * CLOCKS_PER_SEC; From c7744ee994026007300580d7851ae0c8b4083832 Mon Sep 17 00:00:00 2001 From: Zi-Ning Choo Date: Fri, 30 Apr 2021 16:09:21 -0400 Subject: [PATCH 02/35] get rid of unused parameters --- R/apps.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/R/apps.R b/R/apps.R index 96d8fda8..1530b528 100644 --- a/R/apps.R +++ b/R/apps.R @@ -60,7 +60,6 @@ #' @param verbose (integer)scalar specifying whether to do verbose output, value 2 will spit out MIP (1) #' @param tilim (numeric) time limit on MIP in seconds (10) #' @param epgap (numeric) relative optimality gap threshhold between 0 and 1 (default 1e-3) -#' @param nsol (integer) number of solutions (default 1) #' @param debug (logical) returns list with names gg and sol. sol contains full RCPLEX solution. (default FALSE) #' #' @return balanced gGraph maximally resembling input gg in CN while minimizing loose end penalty lambda. @@ -82,7 +81,6 @@ balance = function(gg, verbose = 1, tilim = 10, epgap = 1e-3, - nsol = 1, debug = FALSE) { if (verbose) { From a25e97ebdec661e8c0f88453fdc4f5326b06a3c5 Mon Sep 17 00:00:00 2001 From: Zi-Ning Choo Date: Sat, 1 May 2021 19:08:18 -0400 Subject: [PATCH 03/35] initial commit apps.R --- R/apps.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/apps.R b/R/apps.R index 1530b528..7d7daac7 100644 --- a/R/apps.R +++ b/R/apps.R @@ -61,6 +61,7 @@ #' @param tilim (numeric) time limit on MIP in seconds (10) #' @param epgap (numeric) relative optimality gap threshhold between 0 and 1 (default 1e-3) #' @param debug (logical) returns list with names gg and sol. sol contains full RCPLEX solution. (default FALSE) +#' @param gurobi (logical) use gurobi if TRUE uses gurobi else CPLEX default FALSE #' #' @return balanced gGraph maximally resembling input gg in CN while minimizing loose end penalty lambda. #' @author Marcin Imielinski @@ -81,7 +82,8 @@ balance = function(gg, verbose = 1, tilim = 10, epgap = 1e-3, - debug = FALSE) + debug = FALSE, + gurobi = FALSE) { if (verbose) { message("creating copy of input gGraph") From f582cd40f3ff9af5f4f7d639e80629c1398a573f Mon Sep 17 00:00:00 2001 From: Zi-Ning Choo Date: Sun, 2 May 2021 18:51:03 -0400 Subject: [PATCH 04/35] initial commit gurobi wrapper --- R/utils.R | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/R/utils.R b/R/utils.R index 8d63ab00..0d824287 100755 --- a/R/utils.R +++ b/R/utils.R @@ -1,6 +1,68 @@ ## appease R CMD check vs data.table sid=side1=side2=side_1=side_2=silent=snid=splice.variant=splicevar=str1=str2=strand_1=strand_2=subject.id=suffix=tag=threep.cc=threep.coord=threep.exon=threep.frame=threep.pc=threep.sc=threep.sc.frame=to=transcript.id.x=transcript.id.y=transcript_associated=transcript_id=twidth=tx.cc=tx.ec=tx_strand=tx_strand.x=tx_strand.y=txid=type=uid=uids=val=walk.id=walk.iid=walk.iid.x=walk.iid.y=wkid=NULL +#' @name run.gurobi +#' @title run.gurobi +#' +#' @description +#' wrapper to run gurobi with CPLEX-like function call for easy switching bw optimizers +#' +#' @param cvec (numeric) +#' @param Amat (sparse matrix) +#' @param bvec (numeric) +#' @param Qmat (sparse matrix) +#' @param lb (numeric) +#' @param ub (numeric) +#' @param sense (character) +#' @param vtype (variable type) +#' @param objsense +#' @param control +#' +#' @return sol - list with names $x, $epgap, $status +run.gurobi = function(cvec = NULL, + Amat = NULL, + bvec = NULL + Qmat = NULL, + lb = NULL, + ub = NULL, + sense = NULL, + vtype = NULL, + objsense = NULL, + control = NULL) { + + ## build model + model = list( + obj = cvec, + A = Amat, + rhs = bvec, + Q = Qmat, + lb = lb, + ub = ub, + vtype = vtype, + sense = c("E"="=", "G"=">", "L"="<")[sense] ## inequalities are leq, geq (e.g. not strict) + modelsense = objsense) + + ## params + params = list() + if (!is.null(control$epgap)) { + params$MIPGap = epgap + } + if (!is.null(control$tilim)) { + params$TimeLimit = tilim + } + + ## TODO: set up env list for running on compute cluster + + ## run gurobi + sol = gurobi::gurobi(model = model, params = params) + + ## make solution consistent with Rcplex output + ## but return all the things + sol$epgap = sol$mipgap + + return(sol) +} + #' @name duplicated.matrix #' @title R-3.5.1 version of duplicated.matrix From c1360765a1a102012296eeb232d974093435b333 Mon Sep 17 00:00:00 2001 From: Zi-Ning Choo Date: Sun, 2 May 2021 21:30:03 -0400 Subject: [PATCH 05/35] expose threads parameter --- R/utils.R | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/R/utils.R b/R/utils.R index 0d824287..75e7cc0b 100755 --- a/R/utils.R +++ b/R/utils.R @@ -15,20 +15,24 @@ sid=side1=side2=side_1=side_2=silent=snid=splice.variant=splicevar=str1=str2=str #' @param ub (numeric) #' @param sense (character) #' @param vtype (variable type) -#' @param objsense -#' @param control +#' @param objsense (character) default min +#' @param control (list) should have epgap, tilim, trace, ideally +#' @param threads (numeric) #' #' @return sol - list with names $x, $epgap, $status run.gurobi = function(cvec = NULL, Amat = NULL, - bvec = NULL + bvec = NULL, Qmat = NULL, lb = NULL, ub = NULL, sense = NULL, vtype = NULL, - objsense = NULL, - control = NULL) { + objsense = 'min', + control = list(epgap = 1e-2, tilim = 360, trace = 2), + threads = 32) { + + browser() ## build model model = list( @@ -39,7 +43,7 @@ run.gurobi = function(cvec = NULL, lb = lb, ub = ub, vtype = vtype, - sense = c("E"="=", "G"=">", "L"="<")[sense] ## inequalities are leq, geq (e.g. not strict) + sense = c("E"="=", "G"=">", "L"="<")[sense], ## inequalities are leq, geq (e.g. not strict) modelsense = objsense) ## params @@ -50,6 +54,9 @@ run.gurobi = function(cvec = NULL, if (!is.null(control$tilim)) { params$TimeLimit = tilim } + if (!is.null(control$trace)) { + params$LogToConsole = ifelse(control$trace > 0, 1, 0) + } ## TODO: set up env list for running on compute cluster From a9313a53fc3c4aebf69c0c9c143e413ae6bd3b28 Mon Sep 17 00:00:00 2001 From: Zi-Ning Choo Date: Sun, 2 May 2021 21:45:47 -0400 Subject: [PATCH 06/35] add gurobi option to balance --- R/apps.R | 39 +++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/R/apps.R b/R/apps.R index 7d7daac7..08aca630 100644 --- a/R/apps.R +++ b/R/apps.R @@ -1302,18 +1302,33 @@ balance = function(gg, control = list(trace = ifelse(verbose>=2, 1, 0), tilim = tilim, epgap = epgap, round = 1) ## sol = Rcplex::Rcplex(cvec = cvec, Amat = Amat, bvec = bvec, Qmat = Qmat, lb = lb, ub = ub, sense = sense, vtype = vars$vtype, objsense = 'min', control = control) - ## call our wrapper for CPLEX - sol = Rcplex2(cvec, - Amat, - bvec, - Qmat = Qmat, - lb = lb, - ub = ub, - sense = sense, - vtype = vars$vtype, - objsense = "min", - control = control, - tuning = FALSE) + if (gurobi) { + sol = run.gurobi( + cvec = cvec, + Amat = Amat, + bvec = bvec, + Qmat = Qmat, + lb = lb, + ub = ub, + sense = sense, + vtype = vars$vtype, + objsense = 'min', + control = control, + threads = 32) + } else { + ## call our wrapper for CPLEX + sol = Rcplex2(cvec, + Amat, + bvec, + Qmat = Qmat, + lb = lb, + ub = ub, + sense = sense, + vtype = vars$vtype, + objsense = "min", + control = control, + tuning = FALSE) + } vars$cvec = cvec vars$x = sol$x From ce35a7d32473bfda96f83fc686c9732ba33c37ae Mon Sep 17 00:00:00 2001 From: Zi-Ning Choo Date: Sun, 2 May 2021 21:46:00 -0400 Subject: [PATCH 07/35] bug fix in setting up parameters --- R/utils.R | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/R/utils.R b/R/utils.R index 75e7cc0b..96da4ce4 100755 --- a/R/utils.R +++ b/R/utils.R @@ -32,8 +32,6 @@ run.gurobi = function(cvec = NULL, control = list(epgap = 1e-2, tilim = 360, trace = 2), threads = 32) { - browser() - ## build model model = list( obj = cvec, @@ -49,10 +47,10 @@ run.gurobi = function(cvec = NULL, ## params params = list() if (!is.null(control$epgap)) { - params$MIPGap = epgap + params$MIPGap = control$epgap } if (!is.null(control$tilim)) { - params$TimeLimit = tilim + params$TimeLimit = control$tilim } if (!is.null(control$trace)) { params$LogToConsole = ifelse(control$trace > 0, 1, 0) From c3351550a3e22e312f57e5150de6d1cd1ab3aee2 Mon Sep 17 00:00:00 2001 From: Zi-Ning Choo Date: Mon, 3 May 2021 14:54:09 -0400 Subject: [PATCH 08/35] UDKJF --- .gitignore | 1 - src/Makevars | 3 + src/Makevars.win | 7 + src/Rcplex2.c | 375 ++++++++++++++++++++++++++++++++++++++++++++ src/Rcplex2.h | 28 ++++ src/Rcplex_params.c | 261 ++++++++++++++++++++++++++++++ src/Rcplex_utils.c | 102 ++++++++++++ 7 files changed, 776 insertions(+), 1 deletion(-) create mode 100644 src/Makevars create mode 100644 src/Makevars.win create mode 100644 src/Rcplex2.c create mode 100644 src/Rcplex2.h create mode 100644 src/Rcplex_params.c create mode 100644 src/Rcplex_utils.c diff --git a/.gitignore b/.gitignore index 05fc351c..eb8b3afc 100644 --- a/.gitignore +++ b/.gitignore @@ -71,7 +71,6 @@ docs/tutorial_cache/html/unnamed-chunk-3_a81070afea2cd322305d016d8dbace33.RData docs/tutorial_cache/html/unnamed-chunk-3_a81070afea2cd322305d016d8dbace33.rdb docs/debug.html docs/tutorial_2_files/figure-html/duplications and pyrgo-1.png -src/ tests/testthat/circos.pdf tests/testthat/test.json tests/testthat/test.pdf diff --git a/src/Makevars b/src/Makevars new file mode 100644 index 00000000..9d19401f --- /dev/null +++ b/src/Makevars @@ -0,0 +1,3 @@ +PKG_CFLAGS=-m64 -fPIC -fno-strict-aliasing +PKG_CPPFLAGS=-I${CPLEX_DIR}/cplex/include +PKG_LIBS=-L${CPLEX_DIR}/cplex/lib/x86-64_linux/static_pic -lcplex -lm -lpthread diff --git a/src/Makevars.win b/src/Makevars.win new file mode 100644 index 00000000..5c9fb6d9 --- /dev/null +++ b/src/Makevars.win @@ -0,0 +1,7 @@ +ifeq "$(WIN)" "64" +PKG_LIBS = -L"${CPLEX_DIR}/bin/x64_win64" -lcplex1263 -lm +PKG_CPPFLAGS = -D_LP64 -I"${CPLEX_DIR}/include" +else +PKG_LIBS = -L"${CPLEX_DIR}/bin/x86_win32" -lcplex1263 -lm +PKG_CPPFLAGS = -I"${CPLEX_DIR}/include" +endif diff --git a/src/Rcplex2.c b/src/Rcplex2.c new file mode 100644 index 00000000..83c922b3 --- /dev/null +++ b/src/Rcplex2.c @@ -0,0 +1,375 @@ +// The actual solving procedure is called using the function Rcplex +#include "Rcplex2.h" + +int max_numcalls; + +SEXP Rcplex2(SEXP numcols_p, + SEXP numrows_p, + SEXP objsen_p, + SEXP cvec, + SEXP bvec, + SEXP Amat, + SEXP Qmat, + SEXP lb_p, + SEXP ub_p, + SEXP Rsense, + SEXP Rvtype, + SEXP isQP_p, + SEXP isMIP_p, + SEXP num_poplim, + SEXP control, + SEXP maxcalls, + SEXP tuning) +{ + char *probname = "Rcplex"; + int numcols = INTEGER(numcols_p)[0]; + int numrows = INTEGER(numrows_p)[0]; + int objsen = INTEGER(objsen_p)[0]; + double *lb = REAL(lb_p); + double *ub = REAL(ub_p); + + char sense[numrows]; + char vtype[numcols]; + // double dj[numrows]; + int isQP = INTEGER(isQP_p)[0]; + int isMIP = INTEGER(isMIP_p)[0]; + int trace = INTEGER(getListElement(control,"trace"))[0]; + + SEXP res = NULL; /* set to avoid uninitialized warning */ + SEXP xopt; + SEXP epgap; + SEXP obj; + SEXP solstat; + SEXP extra; + SEXP lambda; + SEXP slack; + SEXP nodecnt; + + int status; + int i, j; + int cur_numrows, cur_numcols; + + int tstat; + /* + * solution pools not supprted until cplex 11.0 + */ +#if CPX_VERSION >= 1100 + int num_sol = 1; + SEXP tmp; +#endif + + /* set maxnumcalls before init */ + max_numcalls = INTEGER(maxcalls)[0]; + /* initialize CPLEX environment */ + Rcplex_init(); + + if(trace) Rprintf("Rcplex: num variables=%d num constraints=%d\n",numcols,numrows); + + /* + * solution pools not supported until cplex 11.0 + */ +#if CPX_VERSION < 1100 + if (INTEGER(num_poplim)[0] > 1) { + warning("Multiple solutions not supported in CPLEX version"); + INTEGER(num_poplim)[0] = 1; + } +#endif + + /* lb and ub */ + for (j = 0; j < numcols; ++j) { + lb[j] = R_finite(lb[j]) ? lb[j] : -CPX_INFBOUND; + ub[j] = R_finite(ub[j]) ? ub[j] : CPX_INFBOUND; + } + + /* set constraint inequality directions */ + for (i = 0; i < numrows; ++i) { + sense[i] = CHAR(STRING_ELT(Rsense, i))[0]; + } + + /* set variable types */ + if (isMIP) { + for (i = 0; i < numcols; ++i) { + vtype[i] = CHAR(STRING_ELT(Rvtype, i))[0]; + } + } + + /* set parameters given in control list */ + setparams(env, control, isQP, isMIP); + lp = CPXcreateprob(env, &status, probname); + + + /* check memory problems */ + if (lp == NULL) { + my_error(("Failed to create LP.\n")); + } + + /* copy problem data into lp */ + status = CPXcopylp(env, lp, numcols, numrows, objsen,REAL(cvec), + REAL(bvec), sense, + INTEGER(VECTOR_ELT(Amat, 0)), + INTEGER(VECTOR_ELT(Amat, 1)), + INTEGER(VECTOR_ELT(Amat, 2)), + REAL(VECTOR_ELT(Amat, 3)), + lb, ub, NULL); + if (status) { + my_error(("Failed to copy problem data.\n")); + } + + if (isQP) { + status = CPXcopyquad(env, lp, + INTEGER(VECTOR_ELT(Qmat, 0)), + INTEGER(VECTOR_ELT(Qmat, 1)), + INTEGER(VECTOR_ELT(Qmat, 2)), + REAL(VECTOR_ELT(Qmat, 3))); + if (status) { + my_error(("Failed to copy quadratic term of problem data.\n")); + } + } + + if (isMIP) { + status = CPXcopyctype(env, lp, vtype); + if (status) { + my_error(("Failed to copy vtype.\n")); + } + } + + /* solve problem */ + if(isMIP) { + + /* MARCIN ADDED set starts */ + setstarts(env, lp, control, isMIP); + + int do_tuning = asLogical(tuning); + if (do_tuning){ + /* set the tuning total tilim to hard-setted value 600s */ + SEXP tuning_control = setListElement(control, "tilim", 600); + /* temporarily change the tilim*/ + setparams(env, tuning_control, isQP, isMIP); + + /* tune the hidden parameters */ + status = CPXtuneparam(env, lp, + 0, NULL, NULL, // zero fixed int par + 0, NULL, NULL, + 0, NULL, NULL, &tstat); + if (status) { + my_error(("Failed to tune params.")); + } + /* Write the optimized parameters to file */ + status = CPXwriteparam (env, "tuned.prm"); + + setparams(env, control, isQP, isMIP); + /* MARCIN ADDED set starts */ + setstarts(env, lp, control, isMIP); + } else { + /* Write the optimized parameters to file */ + status = CPXwriteparam (env, "raw.prm"); + } + + status = CPXmipopt(env, lp); + + /* + * solutions pool not supported for versions of cplex < 11 + * e.g., CPX_PARAM_POPULATELIM is not defined + */ + +#if CPX_VERSION >= 1100 + if(INTEGER(num_poplim)[0] > 1){ + /* in MIPs it is possible to have more than 1 solution. If + num_poplim > 1 we now try to find these solutions. + For populating the solution pool a couple of parameters are + relevant: + 1. the 'absolute gap' solution pool parameter + CPX_PARAM_SOLNPOOLAGAP (see Rcplex C control parameters), + 2. the 'solution pool intensity' parameter + CPX_PARAM_SOLNPOOLINTENSITY (see Rcplex C control parameters), + 3. the 'limit on number of solutions generated' for solution + pool CPX_PARAM_POPULATELIM */ + status = CPXsetintparam(env, CPX_PARAM_POPULATELIM, + INTEGER(num_poplim)[0] - 1); + if (status) { + my_error(("Failed to set 'populate limit' parameter.\n")); + } + /* now populate the solutions pool */ + status = CPXpopulate(env, lp); + } +#endif + } + else if (isQP) { + status = CPXqpopt(env, lp); + } + else { + status = CPXlpopt(env, lp); + } + + /* a status value of zero does not necessarily mean that an optimal + solution exists. Examples of an exit status of non-zero here + include CPXERR_NO_PROBLEM, CPXERR_NO_MEMORY, ... */ + if (status) { + my_error(("Failed to optimize problem.")); + } + + PROTECT(solstat = allocVector(INTSXP, 1)); + + /* retrieve status of optimization */ + *INTEGER(solstat) = CPXgetstat(env, lp); + + if (isMIP) { + /* MIP optimization */ + /* in MIPs it is possible to have more than 1 solution + FIXME: use solution pool by default? */ + + cur_numrows = CPXgetnumrows(env, lp); + cur_numcols = CPXgetnumcols(env, lp); + + /* if the 'n' parameter is set in R we always return a list with + a number of elements equal to the number of solutions */ + if(INTEGER(num_poplim)[0] > 1){ + /* + * solution pools not supported until cplex 11.0 + */ +#if CPX_VERSION >= 1100 + num_sol = CPXgetsolnpoolnumsolns(env, lp); + + /* MIP optimization results -> more than 1 solution */ + PROTECT(res = allocVector(VECSXP, num_sol)); + + /* now retrieve multiple solutions if any */ + for( i = 0; i < num_sol; i++){ + PROTECT(tmp = allocVector(VECSXP, 4)); + PROTECT(xopt = allocVector(REALSXP, cur_numcols)); + status = CPXgetsolnpoolx(env, lp, i, REAL(xopt), 0, cur_numcols - 1); + + SET_VECTOR_ELT(tmp, 0, xopt); + + PROTECT(obj = allocVector(REALSXP, 1)); + status = CPXgetsolnpoolobjval(env, lp, i, REAL(obj)); + /* if no integer solution exists, return NA */ + if (status) { + *REAL(obj) = NA_REAL; + } + SET_VECTOR_ELT(tmp, 1, obj); + + /* extra info */ + PROTECT(slack = allocVector(REALSXP, cur_numrows)); + status = CPXgetmipslack(env, lp, REAL(slack), 0, cur_numrows - 1); + PROTECT(nodecnt = allocVector(INTSXP, 1)); + *INTEGER(nodecnt) = CPXgetnodecnt(env, lp); + + SET_VECTOR_ELT(tmp, 2, solstat); + + PROTECT(extra = allocVector(VECSXP, 2)); + SET_VECTOR_ELT(extra, 0, nodecnt); + SET_VECTOR_ELT(extra, 1, slack); + SET_VECTOR_ELT(tmp, 3, extra); + + /* add solution to return vector */ + SET_VECTOR_ELT(res, i, tmp); + + UNPROTECT(6); + } /* END FOR */ +#endif /* end #if CPX_VERSION >= 1100 */ + + } /* END multiple solutions */ + else { + /* MIP optimization 1 solution */ + PROTECT(res = allocVector(VECSXP, 5)); + PROTECT(obj = allocVector(REALSXP, 1)); + PROTECT(xopt = allocVector(REALSXP, numcols)); + PROTECT(epgap = allocVector(REALSXP, 1)); /* added by Marcin */ + PROTECT(extra = allocVector(VECSXP, 2)); + PROTECT(slack = allocVector(REALSXP, numrows)); + + status = CPXgetmipobjval(env, lp, REAL(obj)); + /* if no integer solution exists, return NA */ + if (status) { + *REAL(obj) = NA_REAL; + } + + status = CPXgetmipx(env, lp, REAL(xopt), 0, cur_numcols - 1); + if (status) { + for(i = 0; i < cur_numcols; i++) + REAL(xopt)[i] = NA_REAL; + } + + status = CPXgetmipslack(env, lp, REAL(slack), 0, cur_numrows - 1); + if (status) { + for(i = 0; i < cur_numrows; i++) + REAL(slack)[i] = NA_REAL; + } + + status = CPXgetmiprelgap(env, lp, REAL(epgap)); /* added by Marcin */ + + /* Provide some little extra information */ + PROTECT(nodecnt = allocVector(INTSXP, 1)); + *INTEGER(nodecnt) = CPXgetnodecnt(env, lp); + SET_VECTOR_ELT(extra, 0, nodecnt); + SET_VECTOR_ELT(extra, 1, slack); + + /* Create return vector for MIP 1 solution*/ + SET_VECTOR_ELT(res, 0, xopt); + SET_VECTOR_ELT(res, 1, obj); + SET_VECTOR_ELT(res, 2, solstat); + SET_VECTOR_ELT(res, 3, extra); + SET_VECTOR_ELT(res, 4, epgap); /* added by Marcin */ + + UNPROTECT(6); + } /* END MIP optimization 1 solution */ + } /* END MIP optimization */ + else { + /* continuous optimization */ + PROTECT(obj = allocVector(REALSXP, 1)); + PROTECT(xopt = allocVector(REALSXP, numcols)); + PROTECT(extra = allocVector(VECSXP, 2)); + PROTECT(slack = allocVector(REALSXP, numrows)); + + status = CPXgetobjval(env, lp, REAL(obj)); + if (status) { + *REAL(obj) = NA_REAL; + } + + cur_numrows = CPXgetnumrows(env, lp); + cur_numcols = CPXgetnumcols(env, lp); + status = CPXgetx(env, lp, REAL(xopt), 0, cur_numcols - 1); + if (status) { + for(i = 0; i < cur_numcols; i++) + REAL(xopt)[i] = NA_REAL; + } + + status = CPXgetslack(env, lp, REAL(slack), 0, cur_numrows - 1); + if (status) { + for(i = 0; i < cur_numrows; i++) + REAL(slack)[i] = NA_REAL; + } + + /* Provide some little extra information */ + PROTECT(lambda = allocVector(REALSXP, numrows)); + status = CPXgetpi(env, lp, REAL(lambda), 0, cur_numrows - 1); + if (status) { + for(i = 0; i < cur_numrows; i++) + REAL(lambda)[i] = NA_REAL; + } + SET_VECTOR_ELT(extra, 0, lambda); + SET_VECTOR_ELT(extra, 1, slack); + + /* Create return vector for continuous solution */ + PROTECT(res = allocVector(VECSXP,4)); + SET_VECTOR_ELT(res, 0, xopt); + SET_VECTOR_ELT(res, 1, obj); + SET_VECTOR_ELT(res, 2, solstat); + SET_VECTOR_ELT(res, 3, extra); + + UNPROTECT(5); + } /* END IF continuous optimization */ + + + UNPROTECT(2); /* unprotect return-vector res and solstat */ + + /* Reset all CPLEX parameters to default values */ + status = CPXsetdefaults(env); + if (status) { + Rprintf("Status: %d", status); + my_error(("Failed to set parameters to default.\n")); + } + return(res); +} + diff --git a/src/Rcplex2.h b/src/Rcplex2.h new file mode 100644 index 00000000..f155a60d --- /dev/null +++ b/src/Rcplex2.h @@ -0,0 +1,28 @@ +#ifndef _RCPLEX_H +#define _RCPLEX_H + +#include +#include +#include +#include +#include + +#define my_error(x) { forceCplxClose = 1; error x; } + +/* Global Variables */ +extern CPXENVptr env; +extern CPXLPptr lp; +extern int numcalls; +extern int max_numcalls; +extern int forceCplxClose; + +/* Function definitions */ +SEXP getListElement( SEXP, char* ); +SEXP setListElement( SEXP, char*, double); +void setparams( CPXENVptr, SEXP, int, int ); +void setstarts( CPXENVptr, CPXLPptr, SEXP, int ); +void Rcplex_init( void ); +void Rcplex_close( void ); +void Rcplex_free( void ); + +#endif diff --git a/src/Rcplex_params.c b/src/Rcplex_params.c new file mode 100644 index 00000000..45874ed6 --- /dev/null +++ b/src/Rcplex_params.c @@ -0,0 +1,261 @@ +#include "Rcplex2.h" + +void setparams(CPXENVptr env, SEXP control, int isQP, int isMIP) { + int i, status, value; + const char *cur_parm; + SEXP names; + + /* get list names */ + PROTECT(names = getAttrib(control, R_NamesSymbol)); + + status = 1; /* avoid warning */ + + /* for each element in 'control' try to set the corresponding + parameter */ + for (i = 0; i < length(control); i++) { + + cur_parm = CHAR(STRING_ELT(names, i)); + + /* trace - CPX_PARAM_SCRIND */ + if(strcmp(cur_parm, "trace") == 0) { + status = CPXsetintparam(env, CPX_PARAM_SCRIND, + *INTEGER(VECTOR_ELT(control, i)) ? CPX_ON : CPX_OFF); + } + /* method */ + else if(strcmp(cur_parm, "method") == 0) { + switch((value = *INTEGER(VECTOR_ELT(control, i)))) { + case 0: + value = CPX_ALG_AUTOMATIC; + break; + case 1: + value = CPX_ALG_PRIMAL; + break; + case 2: + value = CPX_ALG_DUAL; + break; + case 3: + value = CPX_ALG_NET; + break; + case 4: + value = CPX_ALG_BARRIER; + break; + default: + warning("Unknown optimization method %d, using default\n", value); + value = CPX_ALG_AUTOMATIC; + } + /* Do we have a QP or a LP?*/ + if (isQP) + status = CPXsetintparam(env, CPX_PARAM_QPMETHOD, value); + else + status = CPXsetintparam(env, CPX_PARAM_LPMETHOD, value); + } + else if(strcmp(cur_parm, "preind") == 0) { + status = CPXsetintparam(env, CPX_PARAM_PREIND, + *INTEGER(VECTOR_ELT(control, i)) ? CPX_ON : CPX_OFF); + } + else if(strcmp(cur_parm, "aggind") == 0) { + status = CPXsetintparam(env, CPX_PARAM_AGGIND, + *INTEGER(VECTOR_ELT(control, i))); + } + else if(strcmp(cur_parm, "itlim") == 0) { + status = CPXsetintparam(env, CPX_PARAM_ITLIM, + *INTEGER(VECTOR_ELT(control, i))); + } + else if(strcmp(cur_parm, "epgap") == 0) { + status = CPXsetdblparam(env, CPX_PARAM_EPGAP, + *REAL(VECTOR_ELT(control, i))); + } + else if(strcmp(cur_parm, "epagap") == 0) { + status = CPXsetdblparam(env, CPX_PARAM_EPAGAP, + *REAL(VECTOR_ELT(control, i))); + } + else if(strcmp(cur_parm,"tilim") == 0) { + status = CPXsetdblparam(env, CPX_PARAM_TILIM, + *REAL(VECTOR_ELT(control, i))); + } + else if(strcmp(cur_parm, "mipemphasis") == 0) { + switch((value = *INTEGER(VECTOR_ELT(control, i)))) { + case 0: + value = CPX_MIPEMPHASIS_BALANCED; + break; + case 1: + value = CPX_MIPEMPHASIS_FEASIBILITY; + break; + case 2: + value = CPX_MIPEMPHASIS_OPTIMALITY; + break; + case 3: + value = CPX_MIPEMPHASIS_BESTBOUND; + break; + case 4: + value = CPX_MIPEMPHASIS_HIDDENFEAS; + break; + default: + warning("Unknown mip emphasis setting %d, using default\n", value); + value = CPX_MIPEMPHASIS_BALANCED; + } + status = CPXsetintparam(env, CPX_PARAM_MIPEMPHASIS, value); + } + else if(strcmp(cur_parm, "disjcuts") == 0) { + status = CPXsetintparam(env, CPX_PARAM_DISJCUTS, + *INTEGER(VECTOR_ELT(control,i))); + } + else if(strcmp(cur_parm, "cliques") == 0) { + status = CPXsetintparam(env, CPX_PARAM_CLIQUES, + *INTEGER(VECTOR_ELT(control, i))); + } + else if(strcmp(cur_parm,"nodesel") == 0) { + switch((value = *INTEGER(VECTOR_ELT(control, i)))) { + case 0: + value = CPX_NODESEL_DFS; + break; + case 1: + value = CPX_NODESEL_BESTBOUND; + break; + case 2: + value = CPX_NODESEL_BESTEST; + break; + case 3: + value = CPX_NODESEL_BESTEST_ALT; + break; + default: + warning("Unknown node selection strategy %d, using default\n", value); + value = CPX_NODESEL_BESTBOUND; + } + status = CPXsetintparam(env, CPX_PARAM_NODESEL, value); + } + else if(strcmp(cur_parm, "probe") == 0) { + status = CPXsetintparam(env, CPX_PARAM_PROBE, + *INTEGER(VECTOR_ELT(control, i))); + } + else if(strcmp(cur_parm, "varsel") == 0) { + switch((value = *INTEGER(VECTOR_ELT(control, i)))) { + case -1: + value = CPX_VARSEL_MININFEAS; + break; + case 0: + value = CPX_VARSEL_DEFAULT; + break; + case 1: + value = CPX_VARSEL_MAXINFEAS; + break; + case 2: + value = CPX_VARSEL_PSEUDO; + break; + case 3: + value = CPX_VARSEL_STRONG; + break; + case 4: + value = CPX_VARSEL_PSEUDOREDUCED; + break; + default: + warning("Unknown variable selection strategy %d, using default\n", + value); + value = CPX_VARSEL_DEFAULT; + } + status = CPXsetintparam(env, CPX_PARAM_VARSEL, value); + } + else if(strcmp(cur_parm, "flowcovers") == 0) { + status = CPXsetintparam(env, CPX_PARAM_FLOWCOVERS, + *INTEGER(VECTOR_ELT(control, i))); + } + else if(strcmp(cur_parm, "mipstart") == 0) { + } + else if(strcmp(cur_parm, "solnpoolagap") == 0){ + /* solution pool parameters */ + #if CPX_VERSION >= 1100 + status = CPXsetdblparam(env, CPX_PARAM_SOLNPOOLAGAP, + *REAL(VECTOR_ELT(control, i))); + #endif + } + else if(strcmp(cur_parm, "solnpoolgap") == 0){ + /* solution pool parameters */ + #if CPX_VERSION >= 1100 + status = CPXsetdblparam(env, CPX_PARAM_SOLNPOOLGAP, + *REAL(VECTOR_ELT(control, i))); + #endif + } + else if(strcmp(cur_parm, "solnpoolintensity") == 0){ + #if CPX_VERSION >= 1100 + status = CPXsetintparam(env, CPX_PARAM_SOLNPOOLINTENSITY, + *INTEGER(VECTOR_ELT(control, i))); + #endif + } + else if (strcmp(cur_parm, "breaksymmetry") == 0){ /*ADDED BY XT*/ + #if CPX_VERSION >= 1100 + status = CPXsetintparam(env, CPX_PARAM_SYMMETRY, + *INTEGER(VECTOR_ELT(control, i))); + #endif + } else if (strcmp(cur_parm, "tuning.display") == 0){ + #if CPX_VERSION >= 1100 + status = CPXsetintparam(env, CPX_PARAM_TUNINGDISPLAY, + *INTEGER(VECTOR_ELT(control, i))); + #endif + } else if (strcmp(cur_parm, "tuning.rep") == 0){ + #if CPX_VERSION >= 1100 + status = CPXsetintparam(env, CPX_PARAM_TUNINGREPEAT, + *INTEGER(VECTOR_ELT(control, i))); + #endif + } else if (strcmp(cur_parm, "tuning.tilim") == 0){ + #if CPX_VERSION >= 1100 + status = CPXsetdblparam(env, CPX_PARAM_TUNINGTILIM, + *REAL(VECTOR_ELT(control, i))); + #endif + } + else { + /* If parameter not known print a warning */ + warning("Unknown CPLEX parameter %s. Ignoring it.\n", cur_parm); + } + + if (status) + my_error(("Failure to set parameter %s, error %d.\n", cur_parm, status)); + } + + UNPROTECT(1); +} + +/* ADDED BY MARCIN!! + * + */ +void setstarts(CPXENVptr env, CPXLPptr lp, SEXP control, int isMIP) { + int i, status, value; + const char *cur_parm; + SEXP names; + + /* get list names */ + PROTECT(names = getAttrib(control, R_NamesSymbol)); + + status = 1; /* avoid warning */ + + /* for each element in 'control' try to set the corresponding + parameter */ + if (isMIP) { + for (i = 0; i < length(control); i++) { + + cur_parm = CHAR(STRING_ELT(names, i)); + + if(strcmp(cur_parm, "mipstart") == 0) { + SEXP values_vec = VECTOR_ELT(control, i); + double* values = REAL(values_vec); + int mcnt = 1; + int beg = 0; + int nzcnt = LENGTH(values_vec); + SEXP tmp = allocVector(INTSXP, nzcnt); + int *varindices = INTEGER(tmp); + int effortlevel = CPX_MIPSTART_AUTO; + char *mipstartname = "start"; + + for (int i = 0; i < nzcnt; i++) { + varindices[i] = i; + } + + status = CPXaddmipstarts(env, lp, mcnt, nzcnt, &beg, varindices, values, &effortlevel, &mipstartname); + + if (status) + my_error(("Failure to set parameter %s in setstarts, error %d.\n", cur_parm, status)); + } + } + } + + UNPROTECT(1); +} diff --git a/src/Rcplex_utils.c b/src/Rcplex_utils.c new file mode 100644 index 00000000..b71f1a91 --- /dev/null +++ b/src/Rcplex_utils.c @@ -0,0 +1,102 @@ +// Utility functions like CPLEX initialization/closing routines and +// list accessors + +#include "Rcplex2.h" +#include + +CPXENVptr env; +CPXLPptr lp; +int numcalls; +int forceCplxClose; + +void Rcplex_wait (int seconds) { + clock_t endwait; + endwait = clock() + seconds * CLOCKS_PER_SEC; + while (clock() < endwait) {} +} + +void Rcplex_init(void) { + int numtries = 10; + int status; + char errmsg[1024]; + + /* Initialize CPLEX environment */ + if (env == NULL) { + env = CPXopenCPLEX (&status); + while(env == NULL && numtries > 0) { + Rcplex_wait(30); + numtries--; + } + if (env == NULL) { + CPXgeterrorstring(env, status, errmsg); + error("Could not open CPLEX environment.\n%s\n",errmsg); + } + REprintf("CPLEX environment opened\n"); + numcalls = max_numcalls; + } + else { + numcalls--; + } + forceCplxClose = 0; +} + +void Rcplex_close(void) { + forceCplxClose = 1; + Rcplex_free(); +} + +void Rcplex_free(void) { + int status1, status2; + char errmsg[1024]; + + status1 = status2 = 0; + if (lp != NULL) { + status1 = CPXfreeprob(env,&lp); + /* REprintf("Freed CPLEX problem\n");*/ + lp = NULL; + } + + if (env != NULL && (numcalls == 0 || forceCplxClose)) { + status2 = CPXcloseCPLEX(&env); + REprintf("Closed CPLEX environment\n"); + env = NULL; + } + + if (status1 || status2) { + status2 ? strcpy(errmsg,"env close ok") : CPXgeterrorstring(env,status2,errmsg); + error("Rcplex_free failed: free problem code: %d\nClose environment msg: %s\n",status1,errmsg); + } + return; +} + +SEXP getListElement(SEXP list, char *str) { + SEXP element = R_NilValue, names = getAttrib(list, R_NamesSymbol); + int i; + + for (i=0; i < length(list); i++) { + if (strcmp(CHAR(STRING_ELT(names,i)), str) == 0) { + element = VECTOR_ELT(list,i); + break; + } + } + return element; +} + +SEXP setListElement(SEXP list, char* str, double value) { + /* SEXP element = R_NilValue, */ + SEXP out = allocVector(VECSXP, length(list)); + SEXP names = getAttrib(list, R_NamesSymbol); + + setAttrib(out, R_NamesSymbol, names); + int i; + + for (i=0; i < length(list); i++) { + if (strcmp(CHAR(STRING_ELT(names,i)), str) == 0) { + /* element = VECTOR_ELT(list,i); */ + SET_VECTOR_ELT(out, i, ScalarReal(value)); + } else { + SET_VECTOR_ELT(out, i, VECTOR_ELT(list, i)); + } + } + return out; +} From 403831289f85c295f37878c0ab256e2323d317e0 Mon Sep 17 00:00:00 2001 From: Zi-Ning Choo Date: Mon, 3 May 2021 19:17:00 -0400 Subject: [PATCH 09/35] remove Rcplex dependency --- R/apps.R | 38 +- R/apps.txt | 1856 ---------------------------------------------------- R/gGnome.R | 37 +- 3 files changed, 36 insertions(+), 1895 deletions(-) delete mode 100644 R/apps.txt diff --git a/R/apps.R b/R/apps.R index 08aca630..526415d2 100644 --- a/R/apps.R +++ b/R/apps.R @@ -1300,7 +1300,6 @@ balance = function(gg, ub = vars$ub control = list(trace = ifelse(verbose>=2, 1, 0), tilim = tilim, epgap = epgap, round = 1) - ## sol = Rcplex::Rcplex(cvec = cvec, Amat = Amat, bvec = bvec, Qmat = Qmat, lb = lb, ub = ub, sense = sense, vtype = vars$vtype, objsense = 'min', control = control) if (gurobi) { sol = run.gurobi( @@ -2846,21 +2845,20 @@ fitcn = function (gw, cn.field = "cn", trim = TRUE, weight = NULL, obs.mat = NUL ## ub = rep(ub, length.out = pmax(len(lb), len(ub))) ## } ## TODO: implement lb and ub of walk CNs - sol = Rcplex::Rcplex( - cvec = c, - Amat = A, - bvec = b, - sense = sense, - Qmat = NULL, - lb = lb, - ub = ub, - n = n.sol, - objsense = "min", - vtype = vtype, - control = list( - trace = ifelse(verbose >= 1, 1, 0), - tilim = 100, - epgap = 1)) + sol = Rcplex2(cvec = c, + Amat = A, + bvec = b, + sense = sense, + Qmat = NULL, + lb = lb, + ub = ub, + n = n.sol, + objsense = "min", + vtype = vtype, + control = list( + trace = ifelse(verbose >= 1, 1, 0), + tilim = 100, + epgap = 1)) if (!is.null(sol$xopt)) { sol = list(sol) @@ -2878,10 +2876,10 @@ fitcn = function (gw, cn.field = "cn", trim = TRUE, weight = NULL, obs.mat = NUL Ahat = rbind(A, P) bhat = c(b, p) sensehat = c(sense, rep("L", length(p))) - sol.new = Rcplex::Rcplex(cvec = c, Amat = Ahat, bvec = bhat, - sense = sensehat, Qmat = NULL, lb = lb, ub = ub, - n = n.sol, objsense = "min", vtype = vtype, control = list(trace = ifelse(verbose >= - 1, 1, 0), tilim = 100, epgap = 1)) + sol.new = Rcplex2(cvec = c, Amat = Ahat, bvec = bhat, + sense = sensehat, Qmat = NULL, lb = lb, ub = ub, + n = n.sol, objsense = "min", vtype = vtype, control = list(trace = ifelse(verbose >= + 1, 1, 0), tilim = 100, epgap = 1)) if (length(sol.new) == 0) { rerun = F } diff --git a/R/apps.txt b/R/apps.txt deleted file mode 100644 index 21c7f517..00000000 --- a/R/apps.txt +++ /dev/null @@ -1,1856 +0,0 @@ -#' applications of gGraph - -#' @name balance -#' @title balance gGnome graphs -#' @description -#' -#' Here we analyze gGraphs with "cn" (copy number) field to enforce integer -#' cn and junction balance, ie sum of incoming (or outgoing) edge -#' cn should be equal to node copy cn. -#' -#' The goal is to find a balaned assignment of "cn" to the nodes and edges of the gGraph -#' that maximally resemble the input weights while minimizing the loose end penalty. -#' The similarity / distance function can be weighted by optional node / edge -#' metadata field $weight (when weighted = TRUE). -#' -#' To output this gGraph, we design a MIP with -#' (1) objective function that minimizes (weighted sum of square) distance of fit node and junction copy number to provided values in -#' $cn field -#' (2) and lambda* the sum of copy number at non terminal loose ends subject to -#' (3) junction balance constraint -#' (4) fixing copy number of a subset of nodes and junctions -#' -#' Objective weight can be modulated at nodes and edges with $weight metadata -#' field (default node weight is node width, and edge weight is 1). -#' These fields will then set the penalty incurred to a fit of x to that node / edge -#' with copy number c and weight w as (x-c)^2/w. -#' -#' Lambda can be modulated at nodes with $lambda node metadata field (default 1) -#' -#' For "haplographs" ie graphs that have more than one node overlapping a given location, it may -#' be important to constrain total copy number using a haploid read depth signal. -#' The marginal parameter enables this through a GRanges annotated with $cn and optionally $weight -#' field that provides a target total copy number that the optimization will attempt to satisfy. -#' This provided copy number c and weight w (default 1) will be evaluated against the -#' sum s of the fit copy numbers of all nodes overlapping that location by adding a penalty -#' of (c-s)^2/w to the corresponding solution. marginal can also have an optional logical field -#' $fix that will actually constrain the marginal copy number to be equal to the provided value -#' (note: that the optimization may be infeasible, and function will error out) -#' -#' Additional controls can be inputted by changing the graph metadata - e.g. adding fields -#' $lb and $ub to nodes and edges will constrain their fit copy number to those bounds. -#' Adding $reward field to edges will add a reward for each copy of that edge in the solution. -#' -#' -#' @param gg gGraph with field $cn, can be NA for some nodes and edges, optional field $weight which will adjust the quadratic penalty on the fit to x as (x-$cn)^2/weight -#' @param lambda positive number specifying default loose end penalty (100), note if gg$node metadata contain $lambda field then this lambda will be multiplied by the node level lambda -#' @param marginal GRanges with field $cn and optional $weight field will be used to fit the summed values at each base of the genome to optimally fit the marginal value, optional field $fix will actually constrain the marginal to be the provided value -#' @param tight indices or epxression on node metadata specifying at which nodes to disallow loose ensd -#' @param nfix indices or expression on node metadata specifying which node cn to fix -#' @param efix indices or expression on edge metadata specifying which edge cn to fix -#' @param nrelax indices or expression on node metadata specifying which nodes cn to relax -#' @param erelax indices or expression on edge metadata specifying which edges cn to relax -#' @param L0 flag whether to apply loose end penalty as L1 (TRUE) -#' @param loose.collapse (parameter only relevant if L0 = TRUE) will count all unique (by coordinate) instances of loose ends in the graph as the loose end penalty, rather than each instance alone ... useful for fitting a metagenome graph (FALSE) -#' @param phased (bool) indicates whether to run phased/unphased. default = FALSE -#' @param M big M constraint for L0 norm loose end penalty, should be >1000 -#' @param verbose integer scalar specifying whether to do verbose output, value 2 will spit out MIP (1) -#' @param tilim time limit on MIP in seconds (10) -#' @param epgap relative optimality gap threshhold between 0 and 1 (0.01) -#' @return balanced gGraph maximally resembling input gg in CN while minimizing loose end penalty lambda. -#' @author Marcin Imielinski -#' @export -balance = function(gg, - lambda = 0.1, - marginal = NULL, - tight = NULL, - nfix = NULL, efix = NULL, nrelax = NULL, erelax = NULL, - L0 = TRUE, - loose.collapse = FALSE, - M = 1e2, - phased = FALSE, - verbose = 1, - tilim = 10, - epgap = 0.01) -{ - if (!('cn' %in% names(gg$nodes$dt))) - { - warning('cn field not defined on nodes, setting to NA') - gg$nodes$mark(cn = NA_real_) - } - - if (!('cn' %in% names(gg$edges$dt))) - { - warning('cn not defined on edges, providing NA') - gg$edges$mark(cn = NA_real_) - } - - gg = gg$copy - - ## default local lambda lambda is node width - if (!('lambda' %in% names(gg$nodes$dt))) - gg$nodes$mark(lambda = 1) -# gg$nodes$mark(lambda = width(gg$nodes$gr)) - - ## default node weight is its width - if (!('weight' %in% names(gg$nodes$dt))) - { - gg$nodes$mark(weight = width(gg$nodes$gr)) - } - - ## default edge weight is its width - if (!('weight' %in% names(gg$edges$dt))) - { - gg$edges$mark(weight = 1) - } - - ## default reward is 0 - if (!('reward' %in% names(gg$edges$dt))) - { - gg$edges$mark(reward = 0) - } - - ## handle parsing of efix, nfix, nrelax, erelax - if (!any(deparse(substitute(nfix)) == "NULL")) ## R voodo to allow "with" style evaluation - nfix = tryCatch(eval(eval(parse(text = substitute(deparse(substitute(nfix)))), parent.frame()), gg$nodes$dt, parent.frame(2)), error = function(e) NULL) - - if (!any(deparse(substitute(nrelax)) == "NULL")) ## R voodo to allow "with" style evaluation - nrelax = tryCatch(eval(eval(parse(text = substitute(deparse(substitute(nrelax)))), parent.frame()), gg$nodes$dt, parent.frame(2)), error = function(e) NULL) - - if (!any(deparse(substitute(efix)) == "NULL")) ## R voodo to allow "with" style evaluation - efix = tryCatch(eval(eval(parse(text = substitute(deparse(substitute(efix)))), parent.frame()), gg$edges$dt, parent.frame(2)), error = function(e) NULL) - - if (!any(deparse(substitute(erelax)) == "NULL")) ## R voodo to allow "with" style evaluation - erelax = tryCatch(eval(eval(parse(text = substitute(deparse(substitute(erelax)))), parent.frame()), gg$edges$dt, parent.frame(2)), error = function(e) NULL) - - if (!any(deparse(substitute(tight)) == "NULL")) ## R voodo to allow "with" style evaluation - tight = tryCatch(eval(eval(parse(text = substitute(deparse(substitute(tight)))), parent.frame()), gg$nodes$dt, parent.frame(2)), error = function(e) NULL) - - - if (is.logical(nfix)) - nfix = which(nfix) - - if (is.logical(efix)) - efix = which(efix) - - if (is.logical(nrelax)) - nrelax = which(nrelax) - - if (is.logical(erelax)) - erelax = which(erelax) - - if (length(nfix) & verbose) - message('Fixing ', length(nfix), ' nodes') - - if (length(efix) & verbose) - message('Fixing ', length(efix), ' edges') - - if (length(nrelax) & verbose) - message('Relaxing ', length(nrelax), ' nodes') - - gg$nodes[nrelax]$mark(weight = 0) - - if (length(erelax) & verbose) - message('Relaxing ', length(erelax), ' edges') - - gg$nodes[erelax]$mark(weight = 0) - - if (!is.logical(tight)) - tight = 1:length(gg$nodes) %in% tight - - if (any(tight) & verbose) - message('Leaving ', sum(tight), ' nodes tight') - - gg$nodes$mark(tight = tight) - - if (is.null(gg$nodes$dt$lb)) - gg$nodes$mark(lb = 0) - - if (is.null(gg$nodes$dt$ub)) - gg$nodes$mark(ub = Inf) - - if (is.null(gg$edges$dt$lb)) - gg$edges$mark(lb = 0) - - if (is.null(gg$edges$dt$ub)) - gg$edges$mark(ub = Inf) - - if (loose.collapse) - { - if (verbose) - message('Collapsing loose ends') - - uleft = unique(gr.start(gg$nodes$gr)) - uright = unique(gr.end(gg$nodes$gr)) - - gg$nodes$mark(loose.left.id = paste0(gr.match(gr.start(gg$nodes$gr), uleft), 'l')) - gg$nodes$mark(loose.right.id = paste0(gr.match(gr.end(gg$nodes$gr), uright), 'r')) - } - else - { - gg$nodes$mark(loose.left.id = paste0(1:length(gg$nodes), 'l')) - gg$nodes$mark(loose.right.id = paste0(1:length(gg$nodes), 'r')) - } - - ######## - ## VARIABLES - ######## - - ## create state space, keeping track of graph ids - vars = rbind( - gg$dt[, .(cn, snode.id, lb, ub, weight, gid = index, type = 'node', vtype = 'I')], ## signed nodes - gg$sedgesdt[, .(from, to, lb, ub, sedge.id, cn, reward, gid = sedge.id, type = 'edge', vtype = 'I')], ## signed edges - - ## for loose ends lid marks all "unique" loose ends (which if loose.collapse = TRUE - ## will be defined on the basis of coordinate overlap) - gg$dt[tight == FALSE, .(cn = NA, snode.id, lambda, gid = index, - ulid = paste0(index, 'i'), - lid = ifelse(strand == '+', loose.left.id, paste0('-', loose.right.id)), - type = 'loose.in', vtype = 'I')], ## incoming loose ends - gg$dt[tight == FALSE, .(cn = NA, snode.id, lambda, gid = index, - ulid = paste0(index, 'o'), - lid = ifelse(strand == '+', loose.right.id, paste0('-', loose.left.id)), - type = 'loose.out', vtype = 'I')], ## outgoing loose ends - gg$dt[tight == FALSE, .(gid = index, cn, weight, type = 'nresidual', vtype = 'C')], ## node residual - gg$sedgesdt[, .(gid = sedge.id, cn, weight, type = 'eresidual', vtype = 'C')], ## edge residual - fill = TRUE) - - if (phased) { - if (verbose) { - message("adding indicator variables for edge CN") - } - - ## add og.edge.id information for each edge - sedge.to.og.dt = gg$edges$dt[, - .(sedge.id, og.edge.id, ## map sedge.id to og.edge.id - ref.or.alt = type, ## get REF or ALT annotations (important for constraints) - connection) ## get straight/cross annotations - ] - setkey(sedge.to.og.dt, "sedge.id") - - ## add binary indicator variables for each edge - edge.indicator.vars = vars[type == "edge",][, type := "edge.indicator"][, vtype := "B"][, gid := sedge.id] - setkey(edge.indicator.vars, "sedge.id") - - ## use sedge.id as a key to join with edge metadata - edge.indicator.vars = edge.indicator.vars[sedge.to.og.dt] - - if (verbose) { - message("adding indicator sum variables for edge CN") - } - - edge.indicator.sum.vars = vars[type == "edge",][, type := "edge.indicator.sum"][, vtype := "I"] - setkey(edge.indicator.sum.vars, "sedge.id") - - ## make sure there is one edge indicator sum variable for each og.edge.id - edge.indicator.sum.vars = edge.indicator.sum.vars[sedge.to.og.dt] - edge.indicator.sum.vars[, gid := og.edge.id] - - ## add one indicator sum variable per og edge ID to vars table - vars = rbind(vars, edge.indicator.vars, fill = TRUE) - vars = rbind(vars, - unique(edge.indicator.sum.vars, by = "gid"), - fill = TRUE) ## fill is TRUE because og.edge.id and ref.or.alt added - - if (verbose) { - message("adding major/minor allele CN and og.node.id to vars") - } - - ## idea here is to add variables that force major allele CN to be at least as large as minor CN - - ## create data table where keys are node ids, and og.node.id/allele can be easily found - snode.to.og.dt = gg$nodes$dt[, .(snode.id, og.node.id, allele)] - setkey(snode.to.og.dt, "snode.id") - - ## merge og.node.id and allele information into vars - vars = merge(vars, snode.to.og.dt, by="snode.id", all.x = TRUE) - } - - if (L0) - { - ## loose ends are labeled with lid and ulid, lid is only relevant if loose.collapse is true - ## (i.e. we need indicator.sum and indicator.sum.indicator - vars = rbind(vars, - rbind( - vars[type == 'loose.in', ][ , type := 'loose.in.indicator'][, vtype := 'B'][, gid := lid], - vars[type == 'loose.out', ][ , type := 'loose.out.indicator'][, vtype := 'B'][, gid := lid] - )) - - if (loose.collapse) - { - ## sum will sum all the loose ends assocaited with the same lid - vars = rbind(vars, - unique(rbind( - vars[type == 'loose.in', ][ , type := 'loose.in.indicator.sum'][, vtype := 'I'][, gid := lid], - vars[type == 'loose.out', ][ , type := 'loose.out.indicator.sum'][, vtype := 'I'][, gid := lid] - ), by = 'gid')) - - ## sum.indicator is an binary indicator on the sum - vars = rbind(vars, - rbind( - vars[type == 'loose.in.indicator.sum', ][ , type := 'loose.in.indicator.sum.indicator'][, vtype := 'B'][, gid := lid], - vars[type == 'loose.out.indicator.sum', ][ , type := 'loose.out.indicator.sum.indicator'][, vtype := 'B'][, gid := lid] - )) - } - } - - setkeyv(vars, c('type', 'gid')) - - ## add marginal copy number residual if specified - vars$mfix = NA - if (!is.null(marginal)) - { - if (verbose) { - message("adding marginal CN variables") - } - - if (!inherits(marginal, 'GRanges') || is.null(marginal$cn)) - { - stop('marginal must be a GRanges with field $cn') - } - - if (is.null(marginal$weight)) - marginal$weight = 1 - - if (is.null(marginal$fix)) - marginal$fix = FALSE - - ## first disjoin marginal against the nodes - ## ie wee ned to create a separate residual variable for every unique - ## disjoint overlap of marginal with the nodes - dmarginal = gg$nodes$gr %>% gr.stripstrand %*% grbind(marginal %>% gr.stripstrand) %>% disjoin %$% marginal[, c('cn', 'weight', 'fix')] %Q% (!is.na(cn)) %Q% (!is.na(weight)) %Q% (!is.infinite(weight)) - - vars = rbind(vars, - gr2dt(dmarginal)[, .(cn, weight, mfix = fix>0, rid = 1:.N, type = 'mresidual', vtype = 'C')], - fill = TRUE - ) - message("Done adding marginal vars") - } - - vars[, id := 1:.N] ## set id in the optimization - vars[is.na(lb), lb := -Inf] - vars[is.na(ub), ub := Inf] - vars[, relax := FALSE][, fix := FALSE] - vars[type == 'mresidual' & mfix == TRUE, ":="(lb = 0, ub = 0)] - vars[type %in% c('node', 'edge'), lb := pmax(lb, 0, na.rm = TRUE)] - vars[type %in% c('loose.in', 'loose.out'), ":="(lb = 0, ub = Inf)] - vars[type %in% c('edge'), reward := pmax(reward, 0, na.rm = TRUE)] - - - ## figure out junctions and nodes to fix - vars[!is.na(cn) & type == 'node' & abs(snode.id) %in% nfix, ":="(lb = cn, ub = cn, fix = TRUE)] - vars[!is.na(cn) & type == 'edge' & abs(sedge.id) %in% efix, ":="(lb = cn, ub = cn, fix = TRUE)] - - ## figure out terminal node sides for in and out loose ends - ## these will not have loose ends penalized - qtips = gr.end(si2gr(seqlengths(gg$nodes))) ## location of q arm tips - term.in = c(which(start(gg$nodes$gr) == 1), ## beginning of chromosome - -which(gg$nodes$gr %^% qtips)) ## flip side of chromosome end - term.out = -term.in - vars$terminal = FALSE - vars[type %in% c('loose.in', 'loose.in.indicator') & snode.id %in% term.in, terminal := TRUE] - vars[type %in% c('loose.out', 'loose.out.indicator') & snode.id %in% term.out, terminal := TRUE] - - ######## - ## CONSTRAINTS - ## the key principle behind this "melted" form of constraint building is the cid - ## (constraint id) which is the key that will group coefficients into constraints - ## when we finally build the matrices. So all we need to do is make sure that - ## that value / cid pairs make sense and that every cid has an entry in b - ######## - - ## we need one junction balance constraint per loose end - - ## constraints indexed by cid - constraints = rbind( - vars[type == 'loose.in', .(value = 1, id, cid = paste('in', gid))], - vars[type == 'edge', .(value = 1, id, cid = paste('in', to))], - vars[type == 'node', .(value = -1, id, cid = paste('in', gid))], - vars[type == 'loose.out', .(value = 1, id, cid = paste('out', gid))], - vars[type == 'edge', .(value = 1, id, cid = paste('out', from))], - vars[type == 'node', .(value = -1, id, cid = paste('out', gid))], - fill = TRUE) - - b = rbind( - vars[type == 'node', .(value = 0, sense = 'E', cid = paste('in', gid))], - vars[type == 'node', .(value = 0, sense = 'E', cid = paste('out', gid))], - fill = TRUE) - - ## add to the constraints the definitions of the node and edge - ## residuals - constraints = rbind( - constraints, - rbind( - vars[type == 'node', .(value = 1, id, cid = paste('nresidual', gid))], - vars[type == 'nresidual', .(value = -1, id, cid = paste('nresidual', gid))], - vars[type == 'edge', .(value = 1, id, cid = paste('eresidual', gid))], - vars[type == 'eresidual', .(value = -1, id, cid = paste('eresidual', gid))], - fill = TRUE) - ) - - b = rbind(b, - vars[type == 'node', .(value = cn, sense = 'E', cid = paste('nresidual', gid))], - vars[type == 'edge', .(value = cn, sense = 'E', cid = paste('eresidual', gid))], - fill = TRUE) - - ## add the reverse complement equality constraints on nodes and edges - constraints = rbind( - constraints, - rbind( ## +1 coefficient for positive nodes, -1 for negative nodes, matched by abs (snode.id) - vars[type == 'node', .(value = sign(snode.id), id, cid = paste('nrc', abs(snode.id)))], - vars[type == 'edge', .(value = sign(sedge.id), id, cid = paste('erc', abs(sedge.id)))], - fill = TRUE) - ) - - b = rbind(b, - vars[type == 'node' & snode.id>0, .(value = 0, sense = 'E', cid = paste('nrc', abs(snode.id)))], - vars[type == 'edge' & sedge.id>0, .(value = 0, sense = 'E', cid = paste('erc', abs(sedge.id)))], - fill = TRUE) - - if (phased) - { - #'######################### - ## add constraints forcing major CN to be larger than minor CN - #'######################### - if (!("allele" %in% colnames(vars)) | !("og.node.id" %in% colnames(vars))) { - stop("allele field needs to be added to vars") - } - - ## major allele coefficient is 1, minor allele coefficient is -1 - ## make sure that there's only one per node.id (abs of snode.id) - allele.constraints = rbind( - unique( - vars[type == "node" & allele == "major", - .(value = 1, id, node.id = abs(snode.id), - cid = paste("allele.constraint", og.node.id))], - by = "node.id"), - unique( - vars[type == "node" & allele == "minor", - .(value = -1, id, node.id = abs(snode.id), - cid = paste("allele.constraint", og.node.id))], - by = "node.id"), - fill = TRUE) - - ## add these constraints - constraints = rbind(constraints, - allele.constraints[, .(value, id, cid)], - fill = TRUE) - - ## RHS: force (major CN - minor CN) to be >= 0 - allele.rhs = unique( - vars[type == "node", - .(value = 0, sense = "G", cid = paste("allele.constraint", og.node.id))], - by = "cid") - - b = rbind(b, allele.rhs, fill = TRUE) - - #'######################### - ## add constraints that force indicators to be 1 if edge CN > 0 - ## TODO: fix this so that only one indicator per edge.id (instead of one per sedge.id) - #'######################### - - ## add constraints for upper bound (same setup as L0 penalty) - one per edge - iconstraints = vars[type == "edge", .(value = 1, id, - sedge.id, - cid = paste("edge.indicator.ub", sedge.id))] - - ## add matching indicator variables, matching by cid - iconstraints = rbind( - iconstraints, - vars[type == "edge.indicator", ][ - sedge.id %in% iconstraints$sedge.id, .(value = -M, id, cid = iconstraints$cid, sedge.id)], - fill = TRUE) - - ## upper bound is M if indicator is positive, and zero otherwise - ## we may want to change this later to JaBbA CN instead of (potentially large) M for stability? - constraints = rbind( - constraints, - iconstraints, - fill = TRUE) - - ## add the RHS of this constraint (upper bound) - b = rbind( - b, - vars[type == "edge", .(value = 0, sense = "L", cid = paste("edge.indicator.ub", sedge.id))], - fill = TRUE - ) - - ## add constraints for the lower bound - iconstraints = vars[type == "edge", .(value = 1, id, sedge.id, cid = paste("edge.indicator.lb", sedge.id))] - - ## add matching indicator variables for LB - iconstraints = rbind( - iconstraints, - vars[type == "edge.indicator", ][ - sedge.id %in% iconstraints$sedge.id, .(value = -0.1, id, cid = iconstraints$cid, sedge.id)], - fill = TRUE) - - constraints = rbind( - constraints, - iconstraints, - fill = TRUE) - - ## add the RHS of this constraint (upper bound) - b = rbind( - b, - vars[type == "edge", .(value = 0, sense = "G", cid = paste("edge.indicator.lb", sedge.id))], - fill = TRUE - ) - - ################### - ## add the edge indicator sum constraints - ################### - - ## ALT edges: only one of four edges can have nonzero CN - ## set upper bound (no need to set lower bound because these are binary variables and cannot be negative) - iconstraints = unique( - vars[type == "edge.indicator" & ref.or.alt == "ALT", - .(value = 1, id, og.edge.id, - edge.id = abs(sedge.id), - cid = paste("edge.indicator.sum.ub", og.edge.id))], - by = "edge.id" - ) - - constraints = rbind( - constraints, - iconstraints[, .(value, id, cid)], - fill = TRUE) - - b = rbind(b, - vars[type == "edge.indicator.sum" & ref.or.alt == "ALT", - .(value = 1, sense = "L", - cid = paste("edge.indicator.sum.ub", og.edge.id))], - fill = TRUE) - - - - ## force nonzero CN for ALT edges (because these have nonzero CN in original JaBbA output) - iconstraints = unique( - vars[type == "edge.indicator" & ref.or.alt == "ALT", - .(value = 1, id, og.edge.id, - edge.id = abs(sedge.id), - cid = paste("edge.indicator.sum.lb", og.edge.id))], - by = "edge.id" - ) - - constraints = rbind( - constraints, - iconstraints[, .(value, id, cid)], - fill = TRUE) - - b = rbind(b, - vars[type == "edge.indicator.sum" & ref.or.alt == "ALT", - .(value = 1, sense = "G", - cid = paste("edge.indicator.sum.lb", og.edge.id))], - fill = TRUE) - - ## REF edges: up to two of four edges can have nonzero CN (easiest to implement...) - iconstraints = unique( - vars[type == "edge.indicator" & ref.or.alt == "REF", - .(value = 1, id, - edge.id = abs(sedge.id), - cid = paste("edge.indicator.sum.ub", og.edge.id))], - by = "edge.id" - ) - - constraints = rbind( - constraints, - iconstraints[, .(value, id, cid)], - fill = TRUE) - - b = rbind(b, - vars[type == "edge.indicator.sum" & ref.or.alt == "REF", - .(value = 2, sense = "L", - cid = paste("edge.indicator.sum.ub", og.edge.id))], - fill = TRUE) - } - - - - if (L0) ## add "big M" constraints - { - ## indicator constraints ie on ulids - iconstraints = rbind( - vars[type == 'loose.out', .(value = 1, id, ulid, cid = paste('loose.out.indicator.ub', ulid))], - vars[type == 'loose.in', .(value = 1, id, ulid, cid = paste('loose.in.indicator.ub', ulid))], - fill = TRUE) - - ## add the matching indicator variables, matching to the cid from above - iconstraints = rbind( - iconstraints, - vars[type %in% c('loose.out.indicator', 'loose.in.indicator'), ][ - match(iconstraints$ulid, ulid), .(value = -M, id, cid = iconstraints$cid)], - fill = TRUE) - - ## upper bounds "infinity" ie M if indicator positive, 0 otherwise - constraints = rbind( - constraints, - iconstraints, - fill = TRUE) - - ## upper bound sense is 'L' i.e. less than because -M on left hand side - b = rbind(b, - vars[type == 'loose.in', .(value = 0, sense = 'L', cid = paste('loose.in.indicator.ub', ulid))], - vars[type == 'loose.out', .(value = 0, sense = 'L', cid = paste('loose.out.indicator.ub', ulid))], - fill = TRUE) - - ## lower bound 0.1 if indicator positive, 0 otherwise - iconstraints = rbind( - vars[type == 'loose.out', .(value = 1, id, ulid, cid = paste('loose.out.indicator.lb', ulid))], - vars[type == 'loose.in', .(value = 1, id, ulid, cid = paste('loose.in.indicator.lb', ulid))], - fill = TRUE) - - ## add the matching indicator variables, matching to the cid from above - iconstraints = rbind( - iconstraints, - vars[type %in% c('loose.out.indicator', 'loose.in.indicator'), ][ - match(iconstraints$ulid, ulid), .(value = -.1, id, cid = iconstraints$cid)], - fill = TRUE) - - ## upper bounds "infinity" ie M if indicator positive, 0 otherwise - constraints = rbind( - constraints, - iconstraints, - fill = TRUE) - - ## lower bound sense is 'G' i.e. greater than because -M on left hand side - b = rbind(b, - vars[type == 'loose.in', .(value = 0, sense = 'G', cid = paste('loose.in.indicator.lb', ulid))], - vars[type == 'loose.out', .(value = 0, sense = 'G', cid = paste('loose.out.indicator.lb', ulid))], - fill = TRUE) - - if (loose.collapse) - { - ################## - ## loose indicator sum = sum of indicators - ################## - iconstraints = rbind( - vars[type == 'loose.out.indicator', .(value = 1, id, lid, cid = paste('loose.out.indicator.sum', lid))], - vars[type == 'loose.in.indicator', .(value = 1, id, lid, cid = paste('loose.in.indicator.sum', lid))], - fill = TRUE) - - ## indicator sum is the sum of all indicators mapping to that loose end - iconstraints = rbind( - iconstraints, - unique(vars[type %in% c('loose.out.indicator.sum', 'loose.in.indicator.sum'), ][ - match(iconstraints$lid, lid), .(value = -1, id, lid, cid = iconstraints$cid)], by = 'lid'), - fill = TRUE) - - constraints = rbind( - constraints, - iconstraints, - fill = TRUE) - - b = rbind(b, - vars[type == 'loose.in.indicator.sum', .(value = 0, sense = 'E', cid = paste('loose.in.indicator.sum', lid))], - vars[type == 'loose.out.indicator.sum', .(value = 0, sense = 'E', cid = paste('loose.out.indicator.sum', lid))], - fill = TRUE) - - ################## - ## now we make new indicator variables on the sum of the individual loose end indicators - ## upper bound bound 0.1 if indicator positive, 0 otherwise - ################## - - iconstraints = rbind( - vars[type == 'loose.out.indicator.sum', .(value = 1, id, lid, cid = paste('loose.out.indicator.sum.indicator.ub', lid))], - vars[type == 'loose.in.indicator.sum', .(value = 1, id, lid, cid = paste('loose.in.indicator.sum.indicator.ub', lid))], - fill = TRUE) - - ## add the matching indicator variables, matching to the cid from above - iconstraints = rbind( - iconstraints, - vars[type %in% c('loose.out.indicator.sum.indicator', 'loose.in.indicator.sum.indicator'), ][ - match(iconstraints$lid, lid), .(value = -M, id, lid, cid = iconstraints$cid)], - fill = TRUE) - - ## upper bounds "infinity" ie M if indicator positive, 0 otherwise - constraints = rbind( - constraints, - iconstraints, - fill = TRUE) - - ## upper bound sense is 'L' i.e. less than because -M on left hand side - b = rbind(b, - vars[type == 'loose.in.indicator.sum', .(value = 0, sense = 'L', cid = paste('loose.in.indicator.sum.indicator.ub', lid))], - vars[type == 'loose.out.indicator.sum', .(value = 0, sense = 'L', cid = paste('loose.out.indicator.sum.indicator.ub', lid))], - fill = TRUE) - - ## lower bound 0.1 if indicator positive, 0 otherwise - iconstraints = rbind( - vars[type == 'loose.out.indicator.sum', .(value = 1, id, lid, cid = paste('loose.out.indicator.sum.indicator.lb', lid))], - vars[type == 'loose.in.indicator.sum', .(value = 1, id, lid, cid = paste('loose.in.indicator.sum.indicator.lb', lid))], - fill = TRUE) - - ## add the matching indicator variables, matching to the cid from above - iconstraints = rbind( - iconstraints, - vars[type %in% c('loose.out.indicator.sum', 'loose.in.indicator.sum'), ][ - match(iconstraints$lid, lid), .(value = -.1, id, lid, cid = iconstraints$cid)], - fill = TRUE) - - ## upper bounds "infinity" ie M if indicator positive, 0 otherwise - constraints = rbind( - constraints, - iconstraints, - fill = TRUE) - - ## lower bound sense is 'G' i.e. greater than because -M on left hand side - b = rbind(b, - vars[type == 'loose.in.indicator.sum', .(value = 0, sense = 'G', cid = paste('loose.in.indicator.sum.indicator.lb', lid))], - vars[type == 'loose.out.indicator.sum', .(value = 0, sense = 'G', cid = paste('loose.out.indicator.sum.indicator.lb', lid))], - fill = TRUE) - - } - } - - if (!is.null(marginal) && length(dmarginal)) - { - ## match against nodes and store query.id as rid - ## this will be the constraint id that will allow us - ## to sum the appropriate nodes to constrain to the residual - ov = dmarginal[, c('cn', 'weight')] %*% gg$nodes$gr %>% gr2dt - ov[, rid := query.id] - - constraints = rbind( - constraints, - rbind( - ## match up vars and marginal by snode.id and populate coefficients - merge(vars[type == 'node', !"rid"], ov, by = 'snode.id')[, .(value = 1, id , cid = paste('mresidual', rid))], - ## the residual is the difference between the sum and marginal cn - vars[type == 'mresidual' & rid %in% ov$rid, .(value = -1, id, cid = paste('mresidual', rid))], - fill = TRUE), - fill = TRUE - ) - - b = rbind(b, - vars[type == 'mresidual' & rid %in% ov$rid, .(value = cn, sense = 'E', cid = paste('mresidual', rid))], - fill = TRUE) - } - - ######## - ## MAKE MATRICES - ######## - - ## now Rcplex time - ## remove any rows with b = NA - - b = b[!is.na(value), ] - constraints = constraints[cid %in% b$cid, ] - - ## convert constraints to integers - ucid = unique(b$cid) - b[, cid.char := cid] - b[, cid := cid %>% factor(ucid) %>% as.integer] - constraints[, cid.char := cid] - constraints[, cid := cid %>% factor(ucid) %>% as.integer] - setkey(b, cid) - - ## create constraint matrix, Qmat, and cobj, lb, ub from vars and constraints lambda = 10 - Amat = sparseMatrix(constraints$cid, constraints$id, x = constraints$value, dims = c(length(ucid), nrow(vars))) - vars[is.na(weight), weight := 0] - - if (any(ix <- is.infinite(vars$weight))) - { - warning('nodes with infinite weight, setting to 0, please check inputs') - vars[ix, weight := 0] - } - Qmat = vars[, weight * (type %in% c('nresidual', 'eresidual', 'mresidual'))] %>% as.numeric %>% Diagonal(x = .) %>% as('CsparseMatrix') - - ## set lambda to 0 at terminal or other non NA nodes - vars[is.na(lambda), lambda := 0] - - - ## set cvec by multiplying global lambda by local lambda for non-terminal loose end - ## vars (or their indicators if L0 is TRUE) - if (L0) - { - if (loose.collapse) - { - cvec = lambda*(vars[, lambda*(type %in% c('loose.in.indicator.sum.indicator', 'loose.out.indicator.sum.indicator', 'loose.in.indicator', 'loose.out.indicator') & !terminal)] %>% as.numeric) - } - else - { - cvec = lambda*(vars[, lambda*(type %in% c('loose.in.indicator', 'loose.out.indicator') & !terminal)] %>% as.numeric) - } - } - else - cvec = lambda*(vars[, lambda*(type %in% c('loose.in', 'loose.out') & !terminal)] %>% as.numeric) - - ## implement reward if provided - if (length(ix <- which(vars$reward!=0))) - { - if (verbose) - message('Applying reward') - cvec[ix] = -vars$reward[ix] - } - - lb = vars$lb - ub = vars$ub - bvec = b[.(1:nrow(Amat)), value] - sense = b[.(1:nrow(Amat)), sense] - - control = list(trace = ifelse(verbose>=2, 1, 0), tilim = tilim, epgap = epgap, round = 1) - sol = Rcplex::Rcplex(cvec = cvec, Amat = Amat, bvec = bvec, Qmat = Qmat, lb = lb, ub = ub, sense = sense, vtype = vars$vtype, objsense = 'min', control = control) - vars$cvec = cvec - vars$x = sol$x - - ## for debugging - ppc = function(x) (x %>% merge(vars, by = 'id') %>% merge(b, by = 'cid.char'))[, paste(paste(round(value.x, 1), '*', paste(type, gid, sep= '_'), '(', signif(x, 2), ')', collapse = ' + '), ifelse(sense[1] == 'E', '=', ifelse(sense[1] == 'G', '>=', '<=')), round(value.y[1],2)), by = cid.char] - - ppv = function(x) {tmp = x %>% merge(constraints, by = 'id'); constraints[cid %in% tmp$cid, ] %>% ppc} - - .check = function(x) data.table(obs = sign(as.numeric(round(Amat %*% x - bvec))), - sense) - chk = .check(sol$x) - - if (any(is.na(sol$x))) - stop('Rcplex did not converge or failed to find a solution, please run with verbose = 2 to get more detailed output') - - if (chk[sense == 'E', any(obs != 0, na.rm = TRUE)] | - chk[sense == 'G', any(obs < 0, na.rm = TRUE)] | - chk[sense == 'L', any(obs > 0, na.rm = TRUE)]) - stop('Constraint violation likely due to M parameter being too large for problem causing CPLEX numerical instability, consider lowering M parameter') - - ##.obj = function(x) 0.5 * rbind(x) %*% Qmat %*% cbind(x) + cvec %*% x - - - ## update graph - nmark = vars[type == 'node', .(nid = abs(snode.id), cn = round(x))] - emark = vars[type == 'edge', .(eid = abs(sedge.id), cn = round(x))] - - loosei = vars[type == 'loose.in' & snode.id>0, .(cn = round(x)), keyby = snode.id] - looseo = vars[type == 'loose.out' & snode.id>0, .(cn = round(x)), keyby = snode.id] - - nodes = gg$nodes[loosei$snode.id] ## need to do this to use nodes active binding settings - nodes$loose.left = loosei$cn>0 - - nodes = gg$nodes[looseo$snode.id] ## need to do this to use nodes active binding settings - nodes$loose.right = looseo$cn>0 - - gg$nodes$mark(loose.cn.left = 0, loose.cn.right = 0) - gg$nodes[loosei$snode.id]$mark(loose.cn.left = loosei$cn) - gg$nodes[looseo$snode.id]$mark(loose.cn.right = looseo$cn) - - ## cache old cn values - gg$nodes$mark(cn.old = gg$nodes$dt$cn) - gg$edges$mark(cn.old = gg$edges$dt$cn) - gg$nodes$mark(cn = NULL) ## reset to avoid weird type casting issue - gg$edges$mark(cn = NULL) ## reset to avoid weird type casting issue - gg$nodes[nmark$nid]$mark(cn = nmark$cn) - gg$edges[emark$eid]$mark(cn = emark$cn) - gg$set(y.field = 'cn') - - gg$set(obj = sol$obj) - -## fix loose ends - nodes = gg$nodes - nodes$loose.left = nodes$dt$loose.cn.left>0 - nodes$loose.right = nodes$dt$loose.cn.right>0 - - return(gg) -} - - -#' @name nodestats -#' @description nodestats -#' -#' Computes copy number (CN) "stats" using on graph gg using (binned) copy number data in GRanges data to prep a -#' graph for balance function (see above). -#' -#' The function outputs a graph whose nodes are populated with $cn and $weight fields. The cn field is computed as -#' the centroid (measured by function FUN) and the weight is computed as number of bins / 2 * (sample variance) across -#' the bins overlapping that node. -#' -#' If loess = TRUE is specified, then a mean vs variance curve is computed across all nodes and the sample variance in each node -#' is replaced with the mapping assigned to the centroid -#' -#' The node to bin mapping can be additionally restricted by the columns in "by", which must exist in both the node metadata -#' of gg and data -#' -#' @param gg gGraph to compute node stats across -#' @param data GRanges of cn data to compute node stats against -#' @param cn.field numeric field corresponding to cn data which is used to aggregate (default is first column in the data) -#' @param loess flag whether to use LOESS to compute variance as mean to variance -#' @param FUN function to compute centroids with (mean) -#' @param by by field to match data against node data, the column specified in this field must be present in both the gGraph (NULL) -#' @return gGraph with fields $cn -#' @export -#' @author Marcin Imielinski -nodestats = function(gg, - data, - cn.field = names(values(data))[[1]], - FUN = mean, - loess = FALSE, - by = NULL) -{ - gg = gg$copy - data$cn = values(data)[cn.field] - ov = gr.findoverlaps(gg$nodes$gr, data[, "cn"], scol = 'cn', by = by) %>% gr2dt - dt = ov[, .(mean = FUN(cn, na.rm = TRUE), var = var(cn, na.rm = TRUE), nbins = .N), keyby = query.id][.(1:length(gg$nodes)), ] - dt$weight = dt$nbins/(2*dt$var) - dt[is.infinite(weight), weight := NA] - gg$nodes$mark(cn = dt$mean, weight = dt$weight) - return(gg) -} - - - -#' @name transplant -#' @title transplant donor subggraph into recipient -#' @description -#' -#' Here we transplant a subgraph (eg representing an event) -#' into a recipient graph. The transplant operation will -#' (1) fix the donor ALT junction copy numbers -#' (2) leave the subgraph node copy numbers unconstrained -#' (3) minimize the loose ends of the resulting balanced graph -#' -#' Donor and recipient graph should both have $cn field defined on -#' nodes and edges. Donor can also just be junctions, in which case -#' the junction shadow will be taken as the footprint, or a -#' footprint can be manually provided. The footprint -#' will then provide a region where the copy number constraints -#' are relaxed while loose ends strictly enforced. -#' -#' Currently transplant is only well defined for haploid / unphased graphs ie those -#' where there is at most a single interval representing a given -#' reference genomic regions. -#' -#' @param gg "haploid" gGraph with field $cn -#' @param donor donor (haploid) gGraph or Junction object with field $cn -#' @param footprint if gGraph not provided then footprint can be directly provided -#' @param lambda loose end penalty to apply to balance operation (1000) to nodes within the donor region -#' @param L0 flag whether to use L0 penalty (FALSE) -#' @return balanced gGraph with donor junctions incorporated at their donor cn -#' @author Marcin Imielinski -#' @export -transplant = function(gg, donor, footprint = NULL, lambda = 1000, L0 = FALSE) -{ - if (!('cn' %in% names(gg$nodes$dt) & 'cn' %in% names(gg$edges$dt))) - stop('cn field must be defined on the nodes and edges recipient graph') - - tmp = (gg$nodes$gr %>% gr.sum) - is.haploid = sum(tmp$score>1)==0 - - if (!is.haploid) - stop('transplant is only defined on haploid graphs, please check your graph for overlapping nodes') - - gg = gg$copy - - if (inherits(donor, 'gGraph')) - { - footprint = donor$footprint - junctions = donor$junctions[type == 'ALT'] - gg$disjoin(donor$gr) - } else - { - junctions = donor - if (is.null(footprint)) ## in this case gg is a junction - { - footprint = junctions$shadow - } - gg$disjoin(grbind(junctions$grl %>% unlist, footprint)) - } - - if (length(junctions)>0 && !('cn' %in% names(junctions$dt))) - stop('cn field must be defined on the edges of the donor graph') - - combined.junctions = gg$junctions[type == 'ALT'] - if (length(junctions)) ## mark as donor and combine with gg $junctions - { - junctions$set(donor = TRUE) - combined.junctions = c(combined.junctions, junctions[, c('cn', 'donor')]) - } - - ## mark edges of the donor subgraph as donor = TRUE - gg$edges$mark(donor = FALSE) - - ## make new gGraph combining junctions from recipient and donor - ggn = suppressWarnings(gG(breaks = gg$nodes$gr, - junctions = combined.junctions)) - - - ## fix cn on the edges of the donor subgraph - efix = ggn$edges[type == 'ALT']$dt$edge.id - - fp.nodes = ggn$nodes %&% footprint - - ## relax loose ends everywhere but in the subgraph - this.lambda = lambda - ggn$nodes$mark(lambda = 1) - fp.nodes$mark(lambda = this.lambda) - - ## balance combined gGraph - ggn = balance(ggn, efix = efix, nrelax = fp.nodes$dt$node.id, L0 = L0) - - return(ggn) -} - -#' @name bcheck -#' @title bcheck -#' @description -#' -#' Checks if genome graph with node and edge cn's annotated and loose cn's is balanced. -#' -#' @param gg gGraph with node and edge fields 'cn', node field loose.cn.left and loose.cn.right -bcheck = function(gg) -{ - lmerge = merge(gg$nodes$dt, gg$nodes$eleft$dt, by.x = 'node.id', by.y = 'n2', all.x = TRUE) - rmerge = merge(gg$nodes$dt, gg$nodes$eright$dt, by.x = 'node.id', by.y = 'n1', all.x = TRUE) - - if (!('loose.cn.left' %in% names(lmerge))) - { - lmerge[, loose.cn.left := 0] - } - - if (!('loose.cn.right' %in% names(rmerge))) - { - rmerge[, loose.cn.right := 0] - } - - out = merge(lmerge[, .(left = cn.x[1] - sum(cn.y, na.rm = TRUE) - loose.cn.left[1]), keyby = node.id], - rmerge[, .(right = cn.x[1] - sum(cn.y, na.rm = TRUE) - loose.cn.right[1]), keyby = node.id])[.(gg$nodes$dt$node.id), ] - out[, balanced := left == 0 & right == 0] - - return(out) -} - -#' @name loosefix -#' @title loosefix -#' @description -#' -#' Updates loose.left, loose.right, loose.cn.left, loose.cn.right based on value of cn field -#' -#' @param gg gGraph with node and edge fields 'cn', node field loose.cn.left and loose.cn.right -#' @author Marcin Imielinski -loosefix = function(gg) -{ - lmerge = merge(gg$nodes$dt, gg$nodes$eleft$dt, by.x = 'node.id', by.y = 'n2', all.x = TRUE) - rmerge = merge(gg$nodes$dt, gg$nodes$eright$dt, by.x = 'node.id', by.y = 'n1', all.x = TRUE) - - ## mark any cn NA as 0 - gg$nodes[is.na(cn)]$mark(cn = 0) - gg$edges[is.na(cn)]$mark(cn = 0) - - out = merge(lmerge[, .(loose.cn.left = cn.x[1] - sum(cn.y, na.rm = TRUE)), keyby = node.id], - rmerge[, .(loose.cn.right = cn.x[1] - sum(cn.y, na.rm = TRUE)), keyby = node.id])[.(gg$nodes$dt$node.id), ] - - - out[is.na(loose.cn.left), loose.cn.left := 0] - out[is.na(loose.cn.right), loose.cn.right := 0] - - if (any(ix <- (out$loose.cn.left<0 | out$loose.cn.right<0))) - stop(sprintf('some nodes (%s) have a higher incoming or outgoing edge copy number than the number of node copies', paste(ix %>% which, collapse = ','))) - - gg$nodes$mark(loose.cn.left = out$loose.cn.left, - loose.cn.right = out$loose.cn.right) - - nodes = gg$nodes; - nodes$loose.left = out$loose.cn.left > 0 - nodes$loose.right = out$loose.cn.right > 0 - - return(gg) -} - - -#' @name peel -#' @title peel -#' @description -#' -#' Greedily "peels" walks off a genome graph with node and edge field "cn" -#' by finding successive flows that maximizes some edge metadata field (specified as field) -#' which if not specified will be a logical field type == 'ALT' specifying whether that -#' junction is an ALT edge -#' -#' @param gg gGraph with field cn -#' @param field edge metadata field to use to rank walk solutions (default edge field type == 'ALT')s -#' @param verbose flag = 1 regular verbosity, 2 = dump out Rcplex traces -#' @param embed.loops logical flag (FALSE) if TRUE will embed all the loops in the output into an arbitrary linear (path) walk -#' @author Marcin Imielinski -#' @return collection of gWalks annotated with cn on the original that when added will give the marginal copy profile on inputted nodes and edges -#' @export -peel = function(gg, field = NULL, embed.loops = FALSE, verbose = FALSE) -{ - gg = refresh(loosefix(gg)) - gg.og = refresh(gg) - - ## mini function to compute the cn of a walk in a graph by finding the min cn across its associated - ## nodes, edges, and loose ends - .mincn = function(walks) - { - source = walks$eval(node = snode.id[1]) - sink = walks$eval(node = snode.id[.N]) - - source.loose = walks$eval(node = ifelse(strand[1] == '+', loose.cn.left[1], loose.cn.right[1])) - sink.loose = walks$eval(node = ifelse(strand[.N] == '+', loose.cn.right[.N], loose.cn.left[.N])) - - edge.min = Inf - if (walks$edges %>% length) - edge.min = walks$eval(edge = data.table(cn, id = abs(sedge.id))[, .(CN = cn[1]/.N), by = id][, min(floor(CN), na.rm = TRUE)]) - - node.min = walks$eval(node = data.table(cn, id = abs(snode.id))[, .(CN = cn[1]/.N), by = id][, min(floor(CN), na.rm = TRUE)]) - - pmin( - ifelse(walks$circular, Inf, ## if circular no loose end capacity constraints - ## fold back edge case where we mihgt overestimate our loose end capacity - ifelse(source == -sink, floor(source.loose/2), - pmin(source.loose, sink.loose))), # otherwise we are limited by the loose end cn on each walk side - edge.min, ## and the internal edge cn - node.min ## and the internal node cn - ) - } - - - if (is.null(field)) - { - gg$edges$mark(alt = gg$edges$dt$type == 'ALT') - field = 'alt' - } - - ## .check = function(gw, gg) - ## { - ## gg2 = gW(grl = rep(gw$grl, gw$dt$cn), circular = rep(gw$circular, gw$dt$cn))$graph; - ## gg2$nodes$mark(cn = 1); - ## gg2$edges$mark(cn = 1); - ## gg2 = c(gg$copy, gg2)$disjoin() - ## gg2$nodes$mark(CN = gg2$nodes$dt$cn) - ## return(gg2) - ## } - - out = gW(graph = gg) - - ## loop next batch of max flow walks until no more walks - while (length(walks <- gg$maxflow(walk = TRUE, path.only = FALSE, multi = TRUE, efield = field, cfield = 'cn', verbose = verbose))) - { - ## gg2 = .check(out, gg) - ## cc = gr2dt(gg.og$nodes$gr %*% gg2$nodes$gr)[CN != cn, .(node.id, cn, CN)] - ## ee = merge(gg.og$junctions, gg2$junctions)$dt[cn != cn.1, .(edge.id, edge.id, cn, cn.1)] - ## if (nrow(cc)) - ## browser() - - ## first check for any walks that contain - ## a node from another walk (edge cases involving reverse complements) - walks = walks[rev(order(walks$dt$wid))] - dups = dunlist(walks$snode.id)[, .(count = length(unique(listid)), listid = unique(listid)), - by = .(id = abs(V1))][count>1, ] - - if (walks$edges %>% length) - { - dups = rbind(dups, dunlist(walks$sedge.id)[, .(count = length(unique(listid)), listid = unique(listid)), - by = .(id = paste0('e', abs(V1)))][count>1, ]) - } - - if (nrow(dups)) - { - keep = !(1:length(walks) %in% dups[duplicated(id), listid %>% unique]) - walks = walks[keep] - } - - - ## cn of walk is the min cn of edges, correcting for edges that get hit multiple times - walks$set(cn = .mincn(walks)) - - if (verbose) - message('Peeling off ', length(walks), ' walks with max width ', - max(walks$dt$wid), ' and max copy ', max(walks$dt$cn, na.rm = TRUE)) - - ## add batch to output - out = c(walks, out) - - ## peel off graph - ## annotate walk with min cn - while (length(walks <- walks[cn>0])) - { - ## figure out how much to decrement - ## aggregate in case both strands of node is in a walk -# gg.cache = gGnome::refresh(gg) - - ndec = dunlist(walks$snode.id)[, wcn := walks$dt$cn[listid]][, .(dec = sum(wcn)), keyby = .(nid = abs(V1))] - ndec$cn = gg$nodes[ndec$nid]$dt$cn - ndec$dec - gg$nodes[ndec$nid]$mark(cn = ndec$cn) - - if (walks$edges %>% length) - { - edec = dunlist(walks$sedge.id)[, wcn := walks$dt$cn[listid]][!is.na(V1), .(dec = sum(wcn)), keyby = .(eid = abs(V1))] - edec$cn = gg$edges[edec$eid]$dt$cn - edec$dec - gg$edges[edec$eid]$mark(cn = edec$cn) - } - - ## to recompute walk cn have to take into account edges that get hit more than once - gg = loosefix(gg) - walks$set(cn = .mincn(walks)) -# bc = bcheck(gg)[balanced==FALSE, ] - - ## if (gg$nodes$dt[, any(loose.cn.left<0 | loose.cn.right<0)]) - ## browser() - - ## if (nrow(bc)) - ## browser() - } - - if (verbose) - { - ploidy = gg$nodes$dt[, sum(cn*width, na.rm = TRUE)/sum((1+0*cn)*width, na.rm = TRUE)] - message('... remaining ploidy ', round(ploidy,2), ' across ', sum(gg$nodes$dt$cn>0, na.rm = TRUE), ' nodes and ', sum(gg$edges$dt$cn>0), ' edges with nonzero CN remaining in graph with total ', length(out), ' walks in collection') - } - } - - ## recast walks around original (input) graph - walks = gW(snode.id = out$snode.id, circular = out$circular, meta = out$dt, graph = gg.og) - walks = walks[rev(order(wid))] - - if (embed.loops) - walks = embedloops(walks[circular == TRUE], walks[circular == FALSE], verbose = verbose>1) - - return(walks) -} - - -#' @name embedloops -#' @description embedloops -#' -#' Attempts to embed / tarnsplant a set of circular walks (loops) -#' into a set of recipients, both defined on the same gGraph, and -#' both optionally having the metadata $cn. Returns a gWalk with as -#' many of the loops embedded into the recipients as possible. -#' -#' A loop needs to intersect some node in order to be successfully embedded. -#' (note: that node may be in another loop, if that loop gets embedded) -#' (As a result though, we don't guarantee that every loop will be enbedded) -#' -#' Noter: loops with cn>1 will be embedded in tandem. -#' -#' @param loops circular gWalk object -#' @param recipient set of gWalks defined on the same object -#' @return gWalk with as many loops embedded into recipients as possible -#' @export -#' @author Marcin Imielinski -embedloops = function(loops, recipients, verbose = FALSE) -{ - if (length(loops)==0) - return(recipients) - - if (length(recipients)==0) - return(loops) - - if (!all(loops$circular)) - stop('Loops must be circular walks') - - ## if cn not set then set to 1 - if (is.null(loops$dt$cn)) - loops$set(cn = 1) - - ## if cn not set then set to 1 - if (is.null(recipients$dt$cn)) - recipients$set(cn = 1) - - ## subroutine to embed - .embed = function(donor, recipients) - { - entry = dunlist(recipients$snode.id)[abs(V1) %in% abs(donor$snode.id[[1]]), ][1,] - - if (is.na(entry$V1)) - return(list(embedded = FALSE, recipients = recipients)) - - if (entry$V1 %in% -donor$snode.id[[1]]) - donor = donor[-1] ## reverse complement - - ## pivot donor around candidate node - snid = donor$snode.id[[1]] - ix = match(entry$V1, snid) - snidp = snid[c(ix:length(snid))] - snidp = c(snidp, setdiff(snid, snidp)) - - this.recipient = recipients[entry$listid %>% as.integer] - snidr = this.recipient$snode.id[[1]] - - ix = match(entry$V1, snidr) - - ## insert (several copies) of pivoted donor cycle just upstream of the entry point - snid.new = c(snidr[seq_len(ix-1)], rep(snidp, donor$dt$cn), snidr[ix:length(snidr)]) - - ## make new walk - gw.new = gW(snode.id = list(snid.new), graph = this.recipient$graph, circular = this.recipient$circular, meta = this.recipient$meta) - - ## if recipient walk has more than one copy then we will need to preserve additional - ## "original" copies of this.recipient walk - if (this.recipient$dt$cn>1) - { - this.recipient$set(cn = this.recipient$dt$cn - 1) - gw.new$set(cn = 1) - gw.new = c(gw.new, this.recipient) - } - - recipients = c(recipients[setdiff(1:length(recipients), entry$listid)], gw.new) - - recipients$set(numalt = recipients$eval(edge = sum(type == 'ALT'))) - recipients = recipients[rev(order(numalt))] - - if (verbose) - { - message('Embedded loop ', paste(snidp, collapse = '->'), ' into recipient ', paste(snidr, collapse = '->'), ' giving ', paste(snid.new, collapse = '->')) - } - - return(list(embedded = TRUE, recipients = recipients)) - } - - embedded = done = rep(FALSE, length(loops)) - - while (!all(done)) - { - old.recipients = copy(recipients) - for (i in which(!done)) - { - res = .embed(loops[i], recipients) - embedded[i] = done[i] = res$embedded - recipients = res$recipients - } - - ## we give up if a round of embedding does not change recipients - if (identical(old.recipients$snode.id, recipients$snode.id)) - done = rep(TRUE, length(loops)) - } - - ## return (modified) recipients and any leftover loops - return(c(recipients, loops[which(!embedded)])) -} - -#' @name binstats -#' @title binstats -#' @description -#' -#' Given GRanges of binned eg read depth data with field $cn, crosses -#' nodes in graph and annotates graph nodes with -#' 'cn' and 'weight'. Done upstream of balance. -#' -#' If "by" field(s) specified and these fields exist in both -#' bins and graph nodes, then will use these in the overlaps query -#' -#' If field, purity, and ploidy provided then will -#' also transform read depth data in bin column "field" -#' using purity and ploidy to generate -#' @param gg gGraph -#' @param bins GRanges with field $cn or field field -#' @param by optional character vector specifying metadata field(s) shared by gg$nodes and bins to which bin / node overlaps will be limited to -#' @param field character vector field of bins to convert to (NULL) -#' @param purity purity parameter either specified together with field or embedded in gg$meta, must be specified if field is not NULL -#' @param ploidy ploidy parameter either specified together with field or embedded in gg$meta, must be specified if field is not NULL -#' @param min.bins minimum number of bins to use for intra segment variance computation (3) -#' @param loess logical flag whether to smooth / fit variance using loess (FALSE) -#' @param min.var minimal allowable per segment bin variance, which will ignore segments with very low variance due to all 0 or other reasons (0.1) -#' @return gGraph whose nodes are annotated with $cn and $weight field -#' @export -#' @author Marcin Imielinski -binstats = function(gg, bins, by = NULL, field = NULL, purity = gg$meta$purity, ploidy = gg$meta$ploidy, loess = TRUE, min.bins = 3, verbose = TRUE, min.var = 0.1) -{ - gg = gg$copy - - if (!is.null(field) & !is.null(purity) & !is.null(ploidy) && is.numeric(purity) && is.numeric(ploidy)) - { - if (verbose) - message('Converting ', field, ' to cn using purity ', purity, ' and ploidy ', ploidy) - - bins$cn = rel2abs(bins, field = field, purity = purity, ploidy = ploidy) - } - - if (is.null(bins$cn)) - stop('bins must have field cn or a field, purity, and ploidy must be specified where field is a column in bins') - - if (verbose) - message('crossing nodes and bins via gr.findoverlaps') - ov = gr.findoverlaps(gg$nodes$gr, bins, by = by, scol = names(values(bins)), return.type = 'data.table') - if (verbose) - message('aggregating bin stats per node') - dt = ov[!is.na(cn), .(mean = mean(cn, na.rm = TRUE), var = var(cn, na.rm = TRUE), nbins = .N), keyby = query.id][.(1:length(gg$nodes)), ] - dt[nbins% rbindlist() - - #' create new gGraph - phased.nodes = c(major.nodes.gr, minor.nodes.gr) - phased.edges = list(major.edges.dt, minor.edges.dt, cross.edges.dt) %>% rbindlist() - phased.gg = gG(nodes = phased.nodes, edges = phased.edges) - - #' update edge colors for plotting - phased.gg$edges[connection == "cross" & type == "REF"]$mark(col = "light blue") - phased.gg$edges[connection == "cross" & type == "ALT"]$mark(col = "pink") - phased.gg$edges[connection == "straight" & type == "REF"]$mark(col = "blue") - phased.gg$edges[connection == "straight" & type == "ALT"]$mark(col = "red") - - #' update node colors for plotting - phased.gg$nodes[allele == "major"]$mark(col = "red") - phased.gg$nodes[allele == "minor"]$mark(col = "blue") - - #' check that bins has required fields for minor.cn and major.cn - if (is.null(bins$major.cn) | is.null(bins$minor.cn)) { - stop("bins must have fields major.cn and minor.cn") - } - - #' overlap major/minor alleles with bins separately - if (verbose) { - message("crossing nodes and bins via gr.findoverlaps") - } - #' prepare bins for finding overlaps by adding allele and cn columns - major.bins = granges(bins[,"major.cn"], use.mcols = TRUE) - minor.bins = granges(bins[,"minor.cn"], use.mcols = TRUE) - names(values(major.bins)) = c("cn") ## change metadat column name to cn - names(values(minor.bins)) = c("cn") - major.bins$allele = "major" - minor.bins$allele = "minor" - ov = gr.findoverlaps(phased.nodes, ## concatenated GRanges for phased gGraph - c(major.bins, minor.bins), ## concatenate major and minor bins - by = c("allele"), ## only find overlaps if alleles field is matching - qcol = c("node.id"), - scol = c("allele", "cn"), - return.type = "data.table") - - #' compute bin stats per node - if (verbose) { - message("aggregating bin stats per node") - } - dt = ov[!is.na(cn), - .(mean = mean(cn, na.rm = TRUE), - var = var(cn, na.rm = TRUE), - nbins = .N), - keyby = node.id] %>% - merge(ov[which(!duplicated(node.id)), .(node.id, allele)], - by = "node.id", - all.y = TRUE) %>% ## right join (since na.rm was true, might be missing some nodes) - .[order(node.id)] ## sort by node id - - #' set variance to NA if number of bins is less than specificied minimum - dt[nbins < min.bins, var := NA] - - #' compute weights (nbins / variance) - ## for now adding a jitter - dt[, ":="(weight = nbins / (2 * var + 1e-2))] - - #' add cn (dt$mean) and weight to phased gGraph - phased.gg$nodes$mark(cn = dt$mean, weight = dt$weight) - - if (any(is.infinite(dt$weight), na.rm = TRUE)) { - warning('variance computation yielded infinite weight, consider setting min.bins higher or using loess fit') - } - - return(phased.gg) -} - - - -#' @name fitcn -#' @title fitcn -#' @author Julie Behr, Xiaotong Yao -#' -#' @param gw input gWalks -#' @param cn.field character column names of each graph's CN data -#' -#' @export -fitcn = function (gw, cn.field = "cn", trim = TRUE, weight = NULL, obs.mat = NULL, verbose = TRUE, - min.alt = TRUE, edgeonly = FALSE, evolve = FALSE, n.sol = 2, return.gw = TRUE, - sep = "_") -{ - ## gw = self$copy - gw = gw$copy - gg = gw$graph$copy - stopifnot(all(cn.field %in% colnames(gg$nodes$dt)) & all(cn.field %in% colnames(gg$edges$dt))) - ## if (is.null(gw$graph$nodes$dt$cn) | is.null(gw$graph$edges$dt$cn)) { - ## stop("cn field is missing from node and edge metadata") - ## } - gg$nodes$mark(cn = rowSums(as.matrix(gg$nodes$dt[, cn.field, with = FALSE]))) - rcn = sapply(cn.field, function(colnm) gg$nodes$eval(sum(cn))) - lcn = sapply(cn.field, function(colnm) gg$nodes$eval(sum(cn), right = FALSE)) - rcn = gg$nodes$eval(sum(cn)) - lcn = gg$nodes$eval(sum(cn), right = FALSE) - jbal.left = all( - (lcn == rowSums(as.matrix(gg$nodes$dt[, cn.field, with = FALSE])) & !gg$nodes$dt$loose.left) | - (lcn <= rowSums(as.matrix(gg$nodes$dt[, cn.field, with = FALSE])) & gg$nodes$dt$loose.left), - na.rm = TRUE) - jbal.right = all( - (rcn == rowSums(as.matrix(gg$nodes$dt[, cn.field, with = FALSE])) & !gg$nodes$dt$loose.right) | - (rcn <= rowSums(as.matrix(gg$nodes$dt[, cn.field, with = FALSE])) & gg$nodes$dt$loose.right), - na.rm = TRUE) - if (!jbal.left | !jbal.right) - warning("graph does not appear to be junction balanced, please check inputs") - ## helper function to get the problem right - constrain.evolution = function(K, gw, A, b, sense){ - h = K[gw$edges[type == "ALT"]$dt[!duplicated(edge.id), - edge.id], ] - A = rbind(A, cbind(sparseMatrix(1, 1, x = 0, dims = dim(h)), - h)) - b = c(b, rep(1, nrow(h))) - sense = c(sense, rep("L", nrow(h))) - return(list(A = A, b = b, sense = sense)) - } - constrain.observations = function(obs.mat, A, b, cvec, sense, - vtype) { - if (!(ncol(obs.mat) * 2) == ncol(A)) - stop("input obs.mat contains the wrong number of columns; should match length of gw") - p = nrow(obs.mat) - w = ncol(obs.mat) - Zero = sparseMatrix(1, 1, x = 0, dims = c(2 * w * p, - 2 * w * p)) - A0 = Zero[rep(1, nrow(A)), 1:(2 * p)] - Ap = cbind(Zero[rep(1, p), 1:w], sign(obs.mat), diag(rep(-1, - p)), Zero[rep(1, p), 1:p]) - Mpub = cbind(Zero[rep(1, p), 1:(2 * w)], diag(rep(1, - p)), diag(rep(-1e+07, p))) - Mplb = cbind(Zero[rep(1, p), 1:(2 * w)], diag(rep(1, - p)), diag(rep(-0.1, p))) - Amp = rbind(cbind(A, A0), Ap, Mpub, Mplb) - b = c(b, rep(0, 3 * p)) - cvec = c(cvec, rep(0, p), -1 * rowMax(obs.mat)) - sense = c(sense, rep("E", p), rep("L", p), rep("G", p)) - vtype = c(vtype, rep("I", p), rep("B", p)) - return(list(A = Amp, b = b, c = cvec, sense = sense, - vtype = vtype)) - } - generate.Ke = function(gw) { - dt = gw$edgesdt[, c("walk.id", "sedge.id")][, `:=`(edge.id, abs(sedge.id))] - dt$listid = factor(dt$walk.id, 1:length(gw)) - dt$edge.id = factor(dt$edge.id, gg$edgesdt$edge.id) - cdt = dcast(dt[!is.na(sedge.id), ], listid ~ edge.id, - fun.aggregate = length, value.var = "edge.id", drop = FALSE) - mat = cdt[, -1] - rownames(mat) = cdt$listid - return(t(mat)) - } - generate.Kn = function(gw) { - dt = gw$nodesdt[, c("walk.id", "snode.id")][, `:=`(node.id, - abs(snode.id))] - dt$listid = factor(dt$walk.id, 1:length(gw)) - dt$node.id = factor(dt$node.id, gg$nodes$dt$node.id) - cdt = dcast(dt[!is.na(snode.id), ], listid ~ node.id, - fun.aggregate = length, value.var = "node.id", drop = FALSE) - mat = cdt[, -1] - rownames(mat) = cdt$listid - return(t(mat)) - } - generate.Amat = function(K, nblock = 1) { - M = 1e+07 - K = as(K, "sparseMatrix") - w = ncol(K) - Zero = sparseMatrix(1, 1, x = 0, dims = c(2 * w * nblock, 2 * w * nblock)) - Amub = cbind( - do.call(`cbind`, lapply(seq_len(nblock), function(i) diag(rep(1, w)))), diag(rep(-M, w)) - ) - Amlb = cbind( - do.call(`cbind`, lapply(seq_len(nblock), function(i) diag(rep(1, w)))), diag(rep(-0.1, w)) - ) - A = rbind( - ## cbind(K, Zero[rep(1, nrow(K)), (w + 1:w)]), - cbind(Reduce(`diagc`, lapply(seq_len(nblock), function(i) K)), Zero[rep(1, nrow(K) * nblock), (w + 1:w)]), - Amub, - Amlb - ) - return(A) - } - generate.bvec = function(e, K) { - w = ncol(K) - bvec = c(c(e), rep(0, 2 * w)) - return(bvec) - } - generate.cvec = function(K, weight, min.alt, gw, nblock = 1) { - if (!is.null(weight) | min.alt) - weight = prep.weight(K, weight, min.alt, gw) - w = ncol(K) - if (is.null(weight)) - weight = rep(1, w) - cvec = c(rep(0, w * nblock), weight) - return(cvec) - } - prep.weight = function(K, weight, min.alt, gw) { - if (!is.null(weight)) { - if (length(weight) == 1 & is.character(weight) & - weight %in% colnames(gw$dt)) - weight = gw$dt[, weight, with = F] - if (!(is.numeric(weight))) { - stop("weight must either be numeric vector of same length as gw or the name of a single numeric annotation in gw") - } - } - if (min.alt) { - if (!is.null(weight)) { - warning("modifying input weight to satisfy min.alt=TRUE") - } - else weight = rep(1, ncol(K)) - numalt = gw$eval(edge = sum(type == "ALT")) - weight[is.na(numalt) | numalt == 0] = 0 - } - return(weight) - } - generate.vtype = function(K, nblock = 1) { - w = ncol(K) - vtype = c(rep("I", w * nblock), rep("B", w)) - return(vtype) - } - generate.sense = function(K, nblock = 1) { - w = ncol(K) - r = nrow(K) - sense = c(rep("E", r * nblock), rep("L", w), rep("G", w)) - return(sense) - } - diagc = function(mat1, mat2){ - out = matrix(0, nrow = nrow(mat1) + nrow(mat2), ncol = ncol(mat1) + ncol(mat2)) - el1 = which(as.matrix(mat1)!=0, arr.ind = TRUE) - el2 = el2.ori = which(as.matrix(mat2)!=0, arr.ind = TRUE) - el2[, "row"] = el2.ori[, "row"] + nrow(mat1) - el2[, "col"] = el2.ori[, "col"] + ncol(mat1) - out[rbind(el1, el2)] = c(mat1[el1], mat2[el2.ori]) - return(out) - } - ## start building the problem - if (any(!cn.field %in% colnames(gw$graph$edges$dt))) { - stop("cn field must be populated in the input graph node and edges metadata") - ## already stopped, what is this for?? - ## if (trim) { - ## e = rep(1, length(unique(abs(unlist(gw$sedge.id))))) - ## e2 = rep(1, length(unique(abs(unlist(gw$snode.id))))) - ## } - ## else { - ## e = rep(1, length(gw$graph$edges)) - ## e2 = rep(1, length(gw$graph$nodes)) - ## } - } - else { - if (trim) { - e = as.matrix( - gw$graph$edges[sedge.id %in% abs(unlist(gw$sedge.id))]$dt[, cn.field, with = FALSE]) - e2 = as.matrix( - gw$graph$nodes[snode.id %in% abs(unlist(gw$snode.id))]$dt[, cn.field, with = FALSE]) - } - else { - e = as.matrix(gw$graph$edges$dt[, cn.field, with = FALSE]) - e2 = as.matrix(gw$graph$nodes$dt[, cn.field, with = FALSE]) - } - } - if (edgeonly) { - K = generate.Ke(gw) - } - else { - K = rbind(generate.Ke(gw), generate.Kn(gw)) - e = rbind(e, e2) - } - if (nrow(K) != nrow(e)) { - stop("Mismatch between size of A matrix and length of b vector. Some edges in gw$graph are not covered by gw. If this was intended, try trim=TRUE") - } - K.unit = K - K = Reduce(`diagc`, lapply(seq_len(ncol(e)), function(i) K)) - ## a bit of cheating, doing the old procedure column by column as vectors - ## tmp = lapply(seq_len(ncol(e)), function(i){ - ## this.e = e[, i, drop = TRUE] - ## A = generate.Amat(unname(K)) - ## b = generate.bvec(this.e, K) - ## sense = generate.sense(K) - ## return(list(A = A, b = b, sense = sense, vtype = vtype)) - ## }) - A = generate.Amat(unname(K.unit), ncol(e)) - b = generate.bvec(e, K.unit) - sense = generate.sense(K.unit, ncol(e)) - vtype = generate.vtype(K.unit, ncol(e)) - ## restructure the objective function - c = generate.cvec(K.unit, weight, min.alt, gw, ncol(e)) - ## row bind them into final constraints - ## browser() - if (evolve) { - ll = constrain.evolution(K, gw, A, b, sense) - A = ll$A - b = ll$b - sense = ll$sense - } - if (!is.null(obs.mat)) { - ll = constrain.observations(obs.mat, A, b, c, sense, - vtype) - A = ll$A - b = ll$b - c = ll$c - sense = ll$sense - vtype = ll$vtype - } - ## browser() - ## load customized lb and ub if present ## TODO make ub lb specific to samples - if (any(grepl("lb", colnames(gw$dt)))){ - if (identical(grep("lb", colnames(gw$dt), value = TRUE), "lb")){ - lb = c(rep(gw$dt$lb, ncol(e)), rep(0, ncol(K.unit))) - } else { - lb.cols = gsub("cn", "lb", cn.field) - if (!all(lb.cols %in% colnames(gw$dt))){ - lb = 0 - } - lb = do.call("c", c(lapply(lb.cols, function(x) gw$dt[[x]]), rep(0, len(gw)))) - } - } else { - lb = 0 - } - if (any(grepl("ub", colnames(gw$dt)))){ - if (identical(grep("ub", colnames(gw$dt), value = TRUE), "ub")){ - ub = c(rep(gw$dt$ub, ncol(e)), rep(Inf, ncol(K.unit))) - } else { - ub.cols = gsub("cn", "ub", cn.field) - if (!all(ub.cols %in% colnames(gw$dt))){ - ub = Inf - } - ub = do.call("c", c(lapply(ub.cols, function(x) gw$dt[[x]]), rep(Inf, len(gw)))) - } - } else { - ub = Inf - } - ## if (len(lb) != len(ub)){ - ## lb = rep(lb, length.out = pmax(len(lb), len(ub))) - ## ub = rep(ub, length.out = pmax(len(lb), len(ub))) - ## } - ## TODO: implement lb and ub of walk CNs - sol = Rcplex::Rcplex( - cvec = c, - Amat = A, - bvec = b, - sense = sense, - Qmat = NULL, - lb = lb, - ub = ub, - n = n.sol, - objsense = "min", - vtype = vtype, - control = list( - trace = ifelse(verbose >= 1, 1, 0), - tilim = 100, - epgap = 1)) - - if (!is.null(sol$xopt)) { - sol = list(sol) - } - if (length(sol) == 0) { - stop("No solutions found satisfying given constraints") - } - ## what is this rerun for??? - ## to exhaust solutions - rerun = T - while (rerun) { - z = sign(vtype == "B") - P = do.call(rbind, lapply(sol, function(x) x$xopt * z)) - p = rowSums(P) - 1 - Ahat = rbind(A, P) - bhat = c(b, p) - sensehat = c(sense, rep("L", length(p))) - sol.new = Rcplex::Rcplex(cvec = c, Amat = Ahat, bvec = bhat, - sense = sensehat, Qmat = NULL, lb = lb, ub = ub, - n = n.sol, objsense = "min", vtype = vtype, control = list(trace = ifelse(verbose >= - 1, 1, 0), tilim = 100, epgap = 1)) - if (length(sol.new) == 0) { - rerun = F - } - else { - sol = c(sol, sol.new) - if (length(sol) >= n.sol) { - sol = sol[1:n.sol] - rerun = F - } - } - } - ## return(sol) - ## for (cnm in cn.field){ - ## gw$set(eval(paste0("cn.", i)) = xmat[, cnm]) - ## } - ## if (length(sol) > 1) { - - if (return.gw){ - for (i in seq_along(sol)){ - this.sol = sol[[i]] - this.x = this.sol$xopt - this.cnf = rep(c(cn.field, "indicator"), each = len(gw)) - this.ls = split(this.x, this.cnf) - names(this.ls) = paste(names(this.ls), i, sep = sep) - do.call(gw$set, this.ls) - } - return(invisible(gw)) - } else { - return(sol) - } -} diff --git a/R/gGnome.R b/R/gGnome.R index bebddfa1..af540797 100644 --- a/R/gGnome.R +++ b/R/gGnome.R @@ -6712,22 +6712,21 @@ gG = function(genome = NULL, } - #' @name lengths #' @title lengths #' @description #' A vector of walk lengths associated with this walk #' -#' @param gWalk a \code{gWalk} object +#' @param x \code{gWalk} object +#' @param use.names #' #' @return the number of nodes in the gWalk #' @export -`lengths.gWalk` = function(gWalk, use.names = FALSE){ - return(gWalk$lengths) +`lengths.gWalk` = function(gWalk, use.names = TRUE){ + return(gWalk$lengths) } - #' @name length #' @title length #' @description @@ -7964,20 +7963,20 @@ gWalk = R6::R6Class("gWalk", ## GWALKS vtype = ll$vtype } - sol = Rcplex::Rcplex(cvec = c, - Amat = A, - bvec = b, - sense = sense, - Qmat = NULL, - lb = 0, - ub = Inf, - n = n.sol, - objsense = "min", - vtype = vtype, - control = list( - trace = ifelse(verbose>=1, 1, 0), - tilim = 100, - epgap = 1)) + sol = Rcplex2(cvec = c, + Amat = A, + bvec = b, + sense = sense, + Qmat = NULL, + lb = 0, + ub = Inf, + n = n.sol, + objsense = "min", + vtype = vtype, + control = list( + trace = ifelse(verbose>=1, 1, 0), + tilim = 100, + epgap = 1)) if (!is.null(sol$xopt)){ sol = list(sol) From 7ef4cbdacf276d200eccebd2ccc032474dab27b0 Mon Sep 17 00:00:00 2001 From: Zi-Ning Choo Date: Mon, 3 May 2021 19:20:55 -0400 Subject: [PATCH 10/35] remove rcplex dependency --- R/gGnome.R | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/R/gGnome.R b/R/gGnome.R index af540797..8dc636ca 100644 --- a/R/gGnome.R +++ b/R/gGnome.R @@ -4807,15 +4807,15 @@ gGraph = R6::R6Class("gGraph", ## cap individual signed edge.ids to their capacity ub[1:nrow(ed)] = pmin(ub[1:nrow(ed)], ed[[cfield]]) } - sol = Rcplex::Rcplex(Amat = Amat, - lb = rep(0, ncol(Amat)), - ub = ub, - bvec = b$bvec, - sense = b$sense, - vtype = 'I', - control = list(trace = verbose > 1), - objsense = ifelse(max, 'max', 'min'), - cvec = cvec) + sol = Rcplex2(cvec = cvec, + Amat = Amat, + bvec = b$bvec, + lb = rep(0, ncol(Amat)), + ub = ub, + sense = b$sense, + vtype = 'I', + control = list(trace = verbose > 1), + objsense = ifelse(max, 'max', 'min')) ## find edges used in the solution opted = ed[round(sol$xopt[1:.N])!=0, ] From f1beeef7208e9392cbd89a8a137de809c1e11648 Mon Sep 17 00:00:00 2001 From: Zi-Ning Choo Date: Mon, 10 May 2021 10:24:24 -0400 Subject: [PATCH 11/35] add telomeric annotation to loose ends --- R/apps.R | 81 ++++++++++++++++---------------------------------------- 1 file changed, 23 insertions(+), 58 deletions(-) diff --git a/R/apps.R b/R/apps.R index 526415d2..6289c1fe 100644 --- a/R/apps.R +++ b/R/apps.R @@ -399,6 +399,16 @@ balance = function(gg, } if (ism) { + + ## add telomeric annotation + qtips = gr.end(si2gr(seqlengths(gg$nodes))) ## location of q arm tips + term.in = c(which(start(gg$nodes$gr) == 1), ## beginning of chromosome + -which(gg$nodes$gr %^% qtips)) ## flip side of chromosome end + term.out = -term.in ## out is reciprocal of in + + ## annotate loose indicators with this + vars[!is.na(snode.id), telomeric := ifelse(snode.id %in% term.in | snode.id %in% term.out, TRUE, FALSE)] + ## if not phased, must add edge indicators (for just the ALT edges) if (!phased) { edge.match = match(vars[, sedge.id], gg$sedgesdt$sedge.id) @@ -593,52 +603,6 @@ balance = function(gg, message("adding delta constraints for LP") } - ## ## constrain deltas to be at least zero - ## delta.lbs = rbind( - ## vars[type == "ndelta.minus", .(value = 1, id, cid = paste("ndelta.minus.lb", gid))], - ## vars[type == "ndelta.plus", .(value = 1, id, cid = paste("ndelta.plus.lb", gid))], - ## vars[type == "edelta.minus", .(value = 1, id, cid = paste("edelta.minus.lb", gid))], - ## vars[type == "edelta.plus", .(value = 1, id, cid = paste("edelta.plus.lb", gid))], - ## vars[type == "mdelta.minus", .(value = 1, id, cid = paste("mdelta.minus.lb", gid))], - ## vars[type == "mdelta.plus", .(value = 1, id, cid = paste("mdelta.plus.lb", gid))] - ## ) - - ## delta.lbs.rhs = rbind( - ## vars[type == "ndelta.minus", .(value = 0, sense = "G", cid = paste("ndelta.minus.lb", gid))], - ## vars[type == "ndelta.plus", .(value = 0, sense = "G", cid = paste("ndelta.plus.lb", gid))], - ## vars[type == "edelta.minus", .(value = 0, sense = "G", cid = paste("edelta.minus.lb", gid))], - ## vars[type == "edelta.plus", .(value = 0, sense = "G", cid = paste("edelta.plus.lb", gid))], - ## vars[type == "mdelta.minus", .(value = 0, sense = "G", cid = paste("mdelta.minus.lb", gid))], - ## vars[type == "mdelta.plus", .(value = 0, sense = "G", cid = paste("mdelta.plus.lb", gid))] - ## ) - - - ## constraints = rbind(constraints, delta.lbs, fill = TRUE) - ## b = rbind(b, delta.lbs.rhs, fill = TRUE) - - ## ## add upper bound to prevent problem from becoming unbounded - ## ## constrain deltas to be at least zero - ## delta.ubs = rbind( - ## vars[type == "ndelta.minus", .(value = 1, id, cid = paste("ndelta.minus.ub", gid))], - ## vars[type == "ndelta.plus", .(value = 1, id, cid = paste("ndelta.plus.ub", gid))], - ## vars[type == "edelta.minus", .(value = 1, id, cid = paste("edelta.minus.ub", gid))], - ## vars[type == "edelta.plus", .(value = 1, id, cid = paste("edelta.plus.ub", gid))], - ## vars[type == "mdelta.minus", .(value = 1, id, cid = paste("mdelta.minus.ub", gid))], - ## vars[type == "mdelta.plus", .(value = 1, id, cid = paste("mdelta.plus.ub", gid))] - ## ) - - ## delta.ubs.rhs = rbind( - ## vars[type == "ndelta.minus", .(value = M, sense = "L", cid = paste("ndelta.minus.ub", gid))], - ## vars[type == "ndelta.plus", .(value = M, sense = "L", cid = paste("ndelta.plus.ub", gid))], - ## vars[type == "edelta.minus", .(value = M, sense = "L", cid = paste("edelta.minus.ub", gid))], - ## vars[type == "edelta.plus", .(value = M, sense = "L", cid = paste("edelta.plus.ub", gid))], - ## vars[type == "mdelta.minus", .(value = M, sense = "L", cid = paste("mdelta.minus.ub", gid))], - ## vars[type == "mdelta.plus", .(value = M, sense = "L", cid = paste("mdelta.plus.ub", gid))] - ## ) - - ## constraints = rbind(constraints, delta.ubs, fill = TRUE) - ## b = rbind(b, delta.ubs.rhs, fill = TRUE) - vars[type %like% "delta.plus" | type %like% "delta.minus", ":="(ub = M, lb = 0)] ## add the residual constraints @@ -802,9 +766,9 @@ balance = function(gg, if (!phased) { ## extremity exclusivity (relevant for ALL graphs) loose.constraints = rbind( - vars[type == "loose.in.indicator" & sign(snode.id) == 1, + vars[type == "loose.in.indicator" & sign(snode.id) == 1 & telomeric == FALSE, .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))], - vars[type == "loose.out.indicator" & sign(snode.id) == 1, + vars[type == "loose.out.indicator" & sign(snode.id) == 1 & telomeric == FALSE, .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))] ) @@ -825,10 +789,11 @@ balance = function(gg, edge.ee.ids = unique(c(vars[type == "edge.indicator", ee.id.n1], vars[type == "edge.indicator", ee.id.n2])) edge.ee.ids = edge.ee.ids[!is.na(edge.ee.ids)] + ## TODO: add these as ub and lb instead of equality constraints loose.zeros = rbind( - vars[type == "loose.in.indicator" & sign(snode.id) == 1 & ee.id %in% edge.ee.ids, + vars[type == "loose.in.indicator" & sign(snode.id) == 1 & ee.id %in% edge.ee.ids & telomeric == FALSE, .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))], - vars[type == "loose.out.indicator" & sign(snode.id) == 1 & ee.id %in% edge.ee.ids, + vars[type == "loose.out.indicator" & sign(snode.id) == 1 & ee.id %in% edge.ee.ids & telomeric == FALSE, .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))] ) @@ -841,9 +806,9 @@ balance = function(gg, if (phased) { ## homologous extremity exclusivity loose.constraints = rbind( - vars[type == "loose.in.indicator" & sign(snode.id)==1, + vars[type == "loose.in.indicator" & sign(snode.id)==1 & telomeric == FALSE, .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id))], - vars[type == "loose.out.indicator" & sign(snode.id)==1, + vars[type == "loose.out.indicator" & sign(snode.id)==1 & telomeric == FALSE, .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id))] ) @@ -857,9 +822,9 @@ balance = function(gg, constraints = rbind(constraints, loose.constraints, edge.constraints, fill = TRUE) rhs = unique(rbind( - vars[type == "loose.in.indicator" & sign(snode.id)==1, + vars[type == "loose.in.indicator" & sign(snode.id)==1 & telomeric == FALSE, .(value = 1, sense = "L", cid = paste("homol.extremity.exclusivity", hee.id))], - vars[type == "loose.out.indicator" & sign(snode.id)==1, + vars[type == "loose.out.indicator" & sign(snode.id)==1 & telomeric == FALSE, .(value = 1, sense = "L", cid = paste("homol.extremity.exclusivity", hee.id))] ), by = "cid") @@ -926,13 +891,13 @@ balance = function(gg, .(value = 1, id, cid = paste("rhee", c4))], ## loose indicators - vars[type == "loose.in.indicator" & snode.id > 0 & !is.na(s), + vars[type == "loose.in.indicator" & snode.id > 0 & !is.na(s) & telomeric == FALSE, .(value = 1, id, cid = paste("rhee", s))], - vars[type == "loose.in.indicator" & snode.id > 0 & !is.na(c), + vars[type == "loose.in.indicator" & snode.id > 0 & !is.na(c) & telomeric == FALSE, .(value = 1, id, cid = paste("rhee", c))], - vars[type == "loose.out.indicator" & snode.id > 0 & !is.na(s), + vars[type == "loose.out.indicator" & snode.id > 0 & !is.na(s) & telomeric == FALSE, .(value = 1, id, cid = paste("rhee", s))], - vars[type == "loose.out.indicator" & snode.id > 0 & !is.na(c), + vars[type == "loose.out.indicator" & snode.id > 0 & !is.na(c) & telomeric == FALSE, .(value = 1, id, cid = paste("rhee", c))] ) From c82a4e0a8df4a8b114e82286b9ab25146607af12 Mon Sep 17 00:00:00 2001 From: Zi-Ning Choo Date: Mon, 17 May 2021 10:33:36 -0400 Subject: [PATCH 12/35] add flag to allow cnloh in phased balance --- R/apps.R | 101 +++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 64 insertions(+), 37 deletions(-) diff --git a/R/apps.R b/R/apps.R index 6289c1fe..b7e78bf8 100644 --- a/R/apps.R +++ b/R/apps.R @@ -55,6 +55,7 @@ #' @param loose.collapse (parameter only relevant if L0 = TRUE) will count all unique (by coordinate) instances of loose ends in the graph as the loose end penalty, rather than each instance alone ... useful for fitting a metagenome graph (FALSE) #' @param phased (logical) indicates whether to run phased/unphased. default = FALSE #' @param ism (logical) additional ISM constraints (FALSE) +#' @param cnloh (logical) allow CN LOH? only relevant if phasing = TRUE. default FALSE. #' @param lp (logical) solve as linear program using abs value (default TRUE) #' @param M (numeric) big M constraint for L0 norm loose end penalty (default 1e3) #' @param verbose (integer)scalar specifying whether to do verbose output, value 2 will spit out MIP (1) @@ -78,6 +79,7 @@ balance = function(gg, M = 1e3, phased = FALSE, ism = FALSE, + cnloh = FALSE, lp = TRUE, verbose = 1, tilim = 10, @@ -958,47 +960,72 @@ balance = function(gg, ## REF edge configuration constraint (added by default basically) ## only add this if there are no unphased nodes - iconstraints.from = unique( - vars[type == "edge.indicator" & ref.or.alt == "REF", - .(value = 1, id, - edge.id = abs(sedge.id), - snode.id = from, ## this is actually a misleading name because from is the row in gg$dt - cid = paste("ref.configuration.constraint.from", from))], - by = "edge.id" - ) + if (cnloh) { + + ## if allow CNLOH, the sum of edge indicators corresponding with og edge id is LEQ 2 + iconstraints = unique( + vars[type == "edge.indicator" & ref.or.alt == "REF", + .(value = 1, id, edge.id = abs(sedge.id), + cid = paste("ref.configuration.constraint.cnloh", og.edge.id))], + by = "edge.id" + ) - iconstraints.to = unique( - vars[type == "edge.indicator" & ref.or.alt == "REF", - .(value = 1, id, - edge.id = abs(sedge.id), - snode.id = to, - cid = paste("ref.configuration.constraint.to", to))], - by = "edge.id" - ) + rhs = unique( + vars[type == "edge.indicator" & ref.or.alt == "REF", + .(value = 2, sense = "L", + cid = paste("ref.configuration.constraint.cnloh", og.edge.id))], + by = "cid" + ) - iconstraints = rbind(iconstraints.from, iconstraints.to) + constraints = rbind(constraints, + iconstraints[, .(value, id, cid)], + fill = TRUE) + b = rbind(b, rhs, fill = TRUE) - ## sum to at most 1 if phased, unconstrained if unphased - iconstraints[, ":="(allele = gg$dt$allele[iconstraints$snode.id])] - - edge.indicator.b = unique(iconstraints[allele %in% c("major", "minor"), - .(value = 1, sense = "L", cid)], - by = "cid") -## rbind( -## unique(iconstraints[allele %in% c("major", "minor"), -## .(value = 1, sense = "L", cid)], by = "cid"), -## unique(iconstraints[!(allele %in% c("major", "minor")), -## .(value = 2, sense = "L", cid)], by = "cid") -## ) + } else { + + iconstraints.from = unique( + vars[type == "edge.indicator" & ref.or.alt == "REF", + .(value = 1, id, + edge.id = abs(sedge.id), + snode.id = from, ## this is actually a misleading name because from is the row in gg$dt + cid = paste("ref.configuration.constraint.from", from))], + by = "edge.id" + ) - constraints = rbind( - constraints, - iconstraints[allele %in% c("major", "minor"), - .(value, id, cid)], - fill = TRUE) - - ## add to b - b = rbind(b, edge.indicator.b, fill = TRUE) + iconstraints.to = unique( + vars[type == "edge.indicator" & ref.or.alt == "REF", + .(value = 1, id, + edge.id = abs(sedge.id), + snode.id = to, + cid = paste("ref.configuration.constraint.to", to))], + by = "edge.id" + ) + + iconstraints = rbind(iconstraints.from, iconstraints.to) + + ## sum to at most 1 if phased, unconstrained if unphased + iconstraints[, ":="(allele = gg$dt$allele[iconstraints$snode.id])] + + edge.indicator.b = unique(iconstraints[allele %in% c("major", "minor"), + .(value = 1, sense = "L", cid)], + by = "cid") + ## rbind( + ## unique(iconstraints[allele %in% c("major", "minor"), + ## .(value = 1, sense = "L", cid)], by = "cid"), + ## unique(iconstraints[!(allele %in% c("major", "minor")), + ## .(value = 2, sense = "L", cid)], by = "cid") + ## ) + + constraints = rbind( + constraints, + iconstraints[allele %in% c("major", "minor"), + .(value, id, cid)], + fill = TRUE) + + ## add to b + b = rbind(b, edge.indicator.b, fill = TRUE) + } } if (L0) ## add "big M" constraints From ea54ad0f3e1f2f45c443621c67070f236118b812 Mon Sep 17 00:00:00 2001 From: Zi-Ning Choo Date: Mon, 24 May 2021 15:22:31 -0400 Subject: [PATCH 13/35] initial commit --- R/apps.R | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/R/apps.R b/R/apps.R index 7c078269..05118202 100644 --- a/R/apps.R +++ b/R/apps.R @@ -2917,3 +2917,16 @@ fitcn = function (gw, cn.field = "cn", trim = TRUE, weight = NULL, obs.mat = NUL return(sol) } } + +#' @name parental.gg +#' @title parental.gg +#' +#' @description +#' +#' Converts an input unphased gGraph to a potential parental haplotype graph by randomly assigning ALT edges to a parental haplotype +#' +#' @param gg (gGraph) input gGraph +#' @param hfield (character) field of gGraph edge metadata corresponding with haplotype +#' @param haplotype.frac (numeric) between 0-1, fraction assigned to one haplotype, default 0.5 +#' @param verbose (logical) default FALSE +parental.gg = From 0dcc42a07fc11ea2b7d092b4484673c623eea642 Mon Sep 17 00:00:00 2001 From: Zi-Ning Choo Date: Mon, 24 May 2021 17:27:19 -0400 Subject: [PATCH 14/35] add working code for parental graph sim, add force flag to ALT edges in balance --- R/apps.R | 135 ++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 115 insertions(+), 20 deletions(-) diff --git a/R/apps.R b/R/apps.R index 05118202..de369114 100644 --- a/R/apps.R +++ b/R/apps.R @@ -55,6 +55,7 @@ #' @param loose.collapse (parameter only relevant if L0 = TRUE) will count all unique (by coordinate) instances of loose ends in the graph as the loose end penalty, rather than each instance alone ... useful for fitting a metagenome graph (FALSE) #' @param phased (logical) indicates whether to run phased/unphased. default = FALSE #' @param ism (logical) additional ISM constraints (FALSE) +#' @param force.alt (logical) default true only applicable for phasing #' @param cnloh (logical) allow CN LOH? only relevant if phasing = TRUE. default FALSE. #' @param lp (logical) solve as linear program using abs value (default TRUE) #' @param M (numeric) big M constraint for L0 norm loose end penalty (default 1e3) @@ -79,6 +80,7 @@ balance = function(gg, M = 1e3, phased = FALSE, ism = FALSE, + force.alt = TRUE, cnloh = FALSE, lp = TRUE, verbose = 1, @@ -937,26 +939,28 @@ balance = function(gg, ## force nonzero CN for ALT edges (because these have nonzero CN in original JaBbA output) ## can become infeasible ... - iconstraints = unique( - vars[type == "edge.indicator" & ref.or.alt == "ALT", - .(value = 1, id, og.edge.id, - edge.id = abs(sedge.id), - cid = paste("edge.indicator.sum.lb", og.edge.id))], - by = "edge.id" - ) + if (force.alt) { + iconstraints = unique( + vars[type == "edge.indicator" & ref.or.alt == "ALT", + .(value = 1, id, og.edge.id, + edge.id = abs(sedge.id), + cid = paste("edge.indicator.sum.lb", og.edge.id))], + by = "edge.id" + ) - constraints = rbind( - constraints, - iconstraints[, .(value, id, cid)], - fill = TRUE) + constraints = rbind( + constraints, + iconstraints[, .(value, id, cid)], + fill = TRUE) - edge.indicator.b = unique( - vars[type == "edge.indicator" & ref.or.alt == "ALT", - .(value = 1, sense = "G", cid = paste("edge.indicator.sum.lb", og.edge.id))], - by = "cid" - ) + edge.indicator.b = unique( + vars[type == "edge.indicator" & ref.or.alt == "ALT", + .(value = 1, sense = "G", cid = paste("edge.indicator.sum.lb", og.edge.id))], + by = "cid" + ) - b = rbind(b, edge.indicator.b, fill = TRUE) + b = rbind(b, edge.indicator.b, fill = TRUE) + } ## REF edge configuration constraint (added by default basically) ## only add this if there are no unphased nodes @@ -2925,8 +2929,99 @@ fitcn = function (gw, cn.field = "cn", trim = TRUE, weight = NULL, obs.mat = NUL #' #' Converts an input unphased gGraph to a potential parental haplotype graph by randomly assigning ALT edges to a parental haplotype #' -#' @param gg (gGraph) input gGraph -#' @param hfield (character) field of gGraph edge metadata corresponding with haplotype +#' @param gg (gGraph) input gGraph. if desired can present haplotype field on edge metadata #' @param haplotype.frac (numeric) between 0-1, fraction assigned to one haplotype, default 0.5 +#' @param fix (logical) fix marginal in balance? default TRUE #' @param verbose (logical) default FALSE -parental.gg = +#' @param epgap (numeric) default 1e-4 +#' @param tilim (numeric) default 60 +#' @param ... additional inputs to balance (e.g. epgap and whatnot) +#' +#' @return phased, balanced gGraph with og.node.id and allele annotation on nodes and og.edge.id annotation on edges +parental.gg = function(gg, + haplotype.frac = 0.5, + verbose = FALSE, + fix = 1, + epgap = 1e-4, + tilim = 60, + ...) { + + gg = gg$copy + + if (!("haplotype" %in% colnames(gg$edges$dt))) { + + if (verbose) { + message("Assigning haplotypes with fraction ", haplotype.frac) + } + + ## get number of ref and alt edges + n.alt = gg$edges$dt[type == "ALT", .N] + n.h1 = round(haplotype.frac * n.alt) + n.h2 = n.alt - n.h1 + ht = sample(c(rep("h1", n.h1), rep("h2", n.h2)), size = n.alt, replace = FALSE) + + ## mark haplotypes + gg$edges[type == "ALT"]$mark(haplotype = ht) + } else { + if (verbose) { + message("using pre-assigned haplotypes") + } + } + + n.og.nodes = nrow(gg$nodes$dt) + new.nodes.dt = rbind( + gg$nodes$dt[, .(og.node.id = node.id, haplotype = "h1", seqnames, start, end)], + gg$nodes$dt[, .(og.node.id = node.id, haplotype = "h2", seqnames, start, end, cn = 1, weight = 100)], + fill = TRUE + ) + + new.nodes.dt[, node.id := 1:.N] + new.nodes.dt[, allele := "unphased"] + + new.edges.dt = rbind( + gg$edges$dt[type == "REF", .(og.edge.id = edge.id, n1, n1.side, n2, n2.side, type)], + gg$edges$dt[type == "REF", .(og.edge.id = edge.id, + n1 = n1 + n.og.nodes, n1.side, + n2 = n2 + n.og.nodes, n2.side, type)], + gg$edges$dt[type == "ALT" & haplotype == "h1", + .(og.edge.id = edge.id, + n1, n1.side, + cn, weight = 1000, + n2, n2.side, type)], + gg$edges$dt[type == "ALT" & haplotype == "h2", + .(og.edge.id = edge.id, + n1 = n1 + n.og.nodes, + n1.side, + n2 = n2 + n.og.nodes, + n2.side, + cn, weight = 1000, + type)], + fill = TRUE + ) + + new.edges.dt[, connection := "straight"] + + haplotype.gg = gG(nodes = dt2gr(new.nodes.dt), edges = new.edges.dt) + + ## grab marginals... + marginal.gr = gg$nodes$gr[, "cn"] + marginal.gr$fix = fix + + bal.gg = balance(haplotype.gg, marginal = marginal.gr, phased = TRUE, lp = TRUE, tilim = tilim, epgap = epgap, verbose = verbose, lambda = 10, ism = TRUE, force.alt = FALSE) + + ## fix allele annotations + bal.nodes.dt = bal.gg$nodes$dt + bal.nodes.dt[, which.major := .SD$haplotype[which.max(.SD$cn)], by = og.node.id] + bal.nodes.dt[, allele := ifelse(haplotype == which.major, "major", "minor")] + bal.nodes.dt[, col := ifelse(allele == "major", alpha("red", 0.5), alpha("blue", 0.5))] + + bal.gg$nodes$mark(allele = bal.nodes.dt$allele, col = bal.nodes.dt$col) + return(bal.gg) +} + + + + + + + From 640642800a800f194e87a0e49c829742599ad278 Mon Sep 17 00:00:00 2001 From: Zi-Ning Choo Date: Tue, 25 May 2021 10:18:08 -0400 Subject: [PATCH 15/35] fix balance code for edge marginals --- R/apps.R | 123 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 99 insertions(+), 24 deletions(-) diff --git a/R/apps.R b/R/apps.R index de369114..b568b893 100644 --- a/R/apps.R +++ b/R/apps.R @@ -339,13 +339,16 @@ balance = function(gg, cartesian = TRUE, all.x = TRUE)$dt ## match this back with edge id and add this to vars - vars[type == "edge", emarginal.id := junction.map[abs(sedge.id), seen.by.emarginal]] + ## vars[type == "edge", emarginal.id := junction.map[abs(sedge.id), seen.by.emarginal]] + vars[type == "edge", emarginal.id := junction.map[abs(sedge.id), subject.id]] ## add weight and target total CN - emtch = match(emarginal.id, junction.map$seen.by.emarginal) - emarginal = unique( - vars[type == "edge",][, type := "emresidual"][, cn := junction.map$cn[emtch]][, weight := junction.map$weight[emtch]][, fix := junction.map$fix[emtch]], ## lol change to merge - by = "emarginal.id") - vars = rbind(vars, emarginal, emresidual, fill = TRUE) + emarginal = merge(unique( + vars[type == "edge" & !is.na(emarginal.id),][, type := "emresidual"][, .(emarginal.id, sedge.id, lb = -M, ub = M, gid, type, vtype = "C", from, to)], + by = "emarginal.id"), + junction.map[, .(subject.id, weight, cn, fix)], + by.x = "emarginal.id", + by.y = "subject.id") + vars = rbind(vars, emarginal, fill = TRUE) } if (lp) { @@ -380,8 +383,8 @@ balance = function(gg, ## add deltas for emresiduals if emarginals are supplied if (!is.null(emarginal)) { emdeltas = rbind( - vars[type == "emresidual", .(emarginal.id, weight, type = "emdelta.plus")][, gid := emarginal.id], - vars[type == "emresidual", .(emarginal.id, weight, type = "emdelta.minus")][, gid := emarginal.id] + vars[type == "emresidual", .(emarginal.id, weight, vtype, type = "emdelta.plus")][, gid := emarginal.id], + vars[type == "emresidual", .(emarginal.id, weight, vtype, type = "emdelta.minus")][, gid := emarginal.id] ) vars = rbind(vars, emdeltas, fill = TRUE) } @@ -2922,8 +2925,8 @@ fitcn = function (gw, cn.field = "cn", trim = TRUE, weight = NULL, obs.mat = NUL } } -#' @name parental.gg -#' @title parental.gg +#' @name parental +#' @title parental #' #' @description #' @@ -2933,18 +2936,22 @@ fitcn = function (gw, cn.field = "cn", trim = TRUE, weight = NULL, obs.mat = NUL #' @param haplotype.frac (numeric) between 0-1, fraction assigned to one haplotype, default 0.5 #' @param fix (logical) fix marginal in balance? default TRUE #' @param verbose (logical) default FALSE +#' @param lambda (numeric) default 10 +#' @param eweight (numeric) edge weight default 1e3 #' @param epgap (numeric) default 1e-4 #' @param tilim (numeric) default 60 #' @param ... additional inputs to balance (e.g. epgap and whatnot) #' #' @return phased, balanced gGraph with og.node.id and allele annotation on nodes and og.edge.id annotation on edges -parental.gg = function(gg, - haplotype.frac = 0.5, - verbose = FALSE, - fix = 1, - epgap = 1e-4, - tilim = 60, - ...) { +parental = function(gg, + haplotype.frac = 0.5, + fix = 1, + verbose = FALSE, + lambda = 10, + eweight = 1e3, + epgap = 1e-4, + tilim = 60, + ...) { gg = gg$copy @@ -2971,7 +2978,7 @@ parental.gg = function(gg, n.og.nodes = nrow(gg$nodes$dt) new.nodes.dt = rbind( gg$nodes$dt[, .(og.node.id = node.id, haplotype = "h1", seqnames, start, end)], - gg$nodes$dt[, .(og.node.id = node.id, haplotype = "h2", seqnames, start, end, cn = 1, weight = 100)], + gg$nodes$dt[, .(og.node.id = node.id, haplotype = "h2", seqnames, start, end, cn = 1, weight = 1)], fill = TRUE ) @@ -2983,18 +2990,18 @@ parental.gg = function(gg, gg$edges$dt[type == "REF", .(og.edge.id = edge.id, n1 = n1 + n.og.nodes, n1.side, n2 = n2 + n.og.nodes, n2.side, type)], - gg$edges$dt[type == "ALT" & haplotype == "h1", + gg$edges$dt[type == "ALT", #& haplotype == "h1", .(og.edge.id = edge.id, n1, n1.side, - cn, weight = 1000, + ## cn, weight = eweight, n2, n2.side, type)], - gg$edges$dt[type == "ALT" & haplotype == "h2", + gg$edges$dt[type == "ALT", #& haplotype == "h2", .(og.edge.id = edge.id, n1 = n1 + n.og.nodes, n1.side, n2 = n2 + n.og.nodes, n2.side, - cn, weight = 1000, + ## cn, weight = eweight, type)], fill = TRUE ) @@ -3007,9 +3014,25 @@ parental.gg = function(gg, marginal.gr = gg$nodes$gr[, "cn"] marginal.gr$fix = fix - bal.gg = balance(haplotype.gg, marginal = marginal.gr, phased = TRUE, lp = TRUE, tilim = tilim, epgap = epgap, verbose = verbose, lambda = 10, ism = TRUE, force.alt = FALSE) + if (verbose) { + message("Starting balance") + } + bal.gg = balance(haplotype.gg, + marginal = marginal.gr, + emarginal = this.complex$junctions[type == "ALT"], + phased = TRUE, + lp = TRUE, + tilim = tilim, + epgap = epgap, + verbose = verbose, + lambda = lambda, + ism = TRUE, + force.alt = FALSE) ## fix allele annotations + if (verbose) { + message("Formatting output graph and relabeling alleles") + } bal.nodes.dt = bal.gg$nodes$dt bal.nodes.dt[, which.major := .SD$haplotype[which.max(.SD$cn)], by = og.node.id] bal.nodes.dt[, allele := ifelse(haplotype == which.major, "major", "minor")] @@ -3020,7 +3043,59 @@ parental.gg = function(gg, } - +#' @name simulate.hets +#' @title simulate.hets +#' +#' @description +#' +#' takes purity/ploidy +#' +#' @param gg (gGraph) phased balanced gGraph, such as from output of parental +#' @param bins (GRanges) locations of heterozygous sites with metadata columns allele and count +#' @param purity (numeric) default 1 +#' @param ploidy (numeric) default 2 +#' @param depth (numeric) (mean number of reads per site) default 50 +#' @param theta (numeric) NB parameter, positive, infinite gives Poisson, default 1. recommend on the same order of magnitude as depth. +#' +#' @return GRanges with metadata fields count and allele representing simulated read counts given supplied graph and parameters +simulate.hets = function(gg, + bins, + purity = 1, + ploidy = 2, + depth = 50, + theta = 50) { + + if (!all(c("cn", "allele", "og.node.id") %in% colnames(gg$nodes$dt))) { + stop("gg nodes missing metadata 'cn' and 'allele'") + } + if (!all(c("count", "allele") %in% names(values(bins)))) { + stop("bins missing fields 'count' and 'allele'") + } + require(MASS) + + unique.bins = unique(bins[, c()]) + unique.bins$id = 1:length(unique.bins) + + ## create data.table with absolute dosage at all snp sites + new.bins = c(unique.bins %$% gg$nodes[allele == "major"]$gr[, c("cn", "og.node.id", "allele")], + unique.bins %$% gg$nodes[allele == "minor"]$gr[, c("cn", "og.node.id", "allele")]) %>% + as.data.table + + ## calculate slope and intercept from purity, ploidy, depth + denom = 2 * (1 - purity) + purity * ploidy + beta = depth * purity / denom + gamma = depth * (1 - purity) / denom + + ## inverse rel2abs transformation + new.bins[, mu := beta * cn + gamma] + new.bins[, count := rnegbin(mu, theta = theta)] + + ## readjust so that major is always bigger than minor + new.bins[, which.major := .SD$allele[which.max(.SD$count)], by = id] + new.bins[, allele := ifelse(allele == which.major, "major", "minor")] + + return(dt2gr(new.bins[, .(seqnames, start, end, count, allele)])) +} From b72b967f1f3d69c1154bba7e8d1a3cca30824da3 Mon Sep 17 00:00:00 2001 From: Zi-Ning Choo Date: Tue, 25 May 2021 11:16:01 -0400 Subject: [PATCH 16/35] cleanup parental, expose fix.emarginal in simulate.hets --- R/apps.R | 58 +++++++++++++++++++++++++++++++------------------------- 1 file changed, 32 insertions(+), 26 deletions(-) diff --git a/R/apps.R b/R/apps.R index b568b893..7dc4d817 100644 --- a/R/apps.R +++ b/R/apps.R @@ -2933,8 +2933,9 @@ fitcn = function (gw, cn.field = "cn", trim = TRUE, weight = NULL, obs.mat = NUL #' Converts an input unphased gGraph to a potential parental haplotype graph by randomly assigning ALT edges to a parental haplotype #' #' @param gg (gGraph) input gGraph. if desired can present haplotype field on edge metadata -#' @param haplotype.frac (numeric) between 0-1, fraction assigned to one haplotype, default 0.5 #' @param fix (logical) fix marginal in balance? default TRUE +#' @param fix.emarginal (logical) fix edge marginal? default FALSE +#' @param force.alt (logical) force incorporation of all junctions? default TRUE #' @param verbose (logical) default FALSE #' @param lambda (numeric) default 10 #' @param eweight (numeric) edge weight default 1e3 @@ -2946,6 +2947,8 @@ fitcn = function (gw, cn.field = "cn", trim = TRUE, weight = NULL, obs.mat = NUL parental = function(gg, haplotype.frac = 0.5, fix = 1, + fix.emarginal = 0, + force.alt = TRUE, verbose = FALSE, lambda = 10, eweight = 1e3, @@ -2955,25 +2958,25 @@ parental = function(gg, gg = gg$copy - if (!("haplotype" %in% colnames(gg$edges$dt))) { - - if (verbose) { - message("Assigning haplotypes with fraction ", haplotype.frac) - } - - ## get number of ref and alt edges - n.alt = gg$edges$dt[type == "ALT", .N] - n.h1 = round(haplotype.frac * n.alt) - n.h2 = n.alt - n.h1 - ht = sample(c(rep("h1", n.h1), rep("h2", n.h2)), size = n.alt, replace = FALSE) - - ## mark haplotypes - gg$edges[type == "ALT"]$mark(haplotype = ht) - } else { - if (verbose) { - message("using pre-assigned haplotypes") - } - } + ## if (!("haplotype" %in% colnames(gg$edges$dt))) { + + ## if (verbose) { + ## message("Assigning haplotypes with fraction ", haplotype.frac) + ## } + + ## ## get number of ref and alt edges + ## n.alt = gg$edges$dt[type == "ALT", .N] + ## n.h1 = round(haplotype.frac * n.alt) + ## n.h2 = n.alt - n.h1 + ## ht = sample(c(rep("h1", n.h1), rep("h2", n.h2)), size = n.alt, replace = FALSE) + + ## ## mark haplotypes + ## gg$edges[type == "ALT"]$mark(haplotype = ht) + ## } else { + ## if (verbose) { + ## message("using pre-assigned haplotypes") + ## } + ## } n.og.nodes = nrow(gg$nodes$dt) new.nodes.dt = rbind( @@ -2990,18 +2993,16 @@ parental = function(gg, gg$edges$dt[type == "REF", .(og.edge.id = edge.id, n1 = n1 + n.og.nodes, n1.side, n2 = n2 + n.og.nodes, n2.side, type)], - gg$edges$dt[type == "ALT", #& haplotype == "h1", + gg$edges$dt[type == "ALT", .(og.edge.id = edge.id, n1, n1.side, - ## cn, weight = eweight, n2, n2.side, type)], - gg$edges$dt[type == "ALT", #& haplotype == "h2", + gg$edges$dt[type == "ALT", .(og.edge.id = edge.id, n1 = n1 + n.og.nodes, n1.side, n2 = n2 + n.og.nodes, n2.side, - ## cn, weight = eweight, type)], fill = TRUE ) @@ -3014,12 +3015,17 @@ parental = function(gg, marginal.gr = gg$nodes$gr[, "cn"] marginal.gr$fix = fix + ## grab edge marginals + emarginals = this.complex$junctions[type == "ALT"] + emarginals$set(fix = fix.emarginal) + emarginals$set(weight = eweight) + if (verbose) { message("Starting balance") } bal.gg = balance(haplotype.gg, marginal = marginal.gr, - emarginal = this.complex$junctions[type == "ALT"], + emarginal = emarginals, phased = TRUE, lp = TRUE, tilim = tilim, @@ -3027,7 +3033,7 @@ parental = function(gg, verbose = verbose, lambda = lambda, ism = TRUE, - force.alt = FALSE) + force.alt = force.alt) ## fix allele annotations if (verbose) { From d0ec5a3eb3a5903c1c98aca42093242cee87a557 Mon Sep 17 00:00:00 2001 From: Zi-Ning Choo Date: Tue, 15 Jun 2021 11:36:48 -0400 Subject: [PATCH 17/35] update comments --- R/apps.R | 82 +++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 58 insertions(+), 24 deletions(-) diff --git a/R/apps.R b/R/apps.R index dd21b315..ed4f700c 100644 --- a/R/apps.R +++ b/R/apps.R @@ -55,7 +55,7 @@ #' @param loose.collapse (parameter only relevant if L0 = TRUE) will count all unique (by coordinate) instances of loose ends in the graph as the loose end penalty, rather than each instance alone ... useful for fitting a metagenome graph (FALSE) #' @param phased (logical) indicates whether to run phased/unphased. default = FALSE #' @param ism (logical) additional ISM constraints (FALSE) -#' @param force.alt (logical) default true only applicable for phasing +#' @param force.alt (logical) force incorporation of ALT edges, only applicable for phasing (default TRUE) #' @param cnloh (logical) allow CN LOH? only relevant if phasing = TRUE. default FALSE. #' @param lp (logical) solve as linear program using abs value (default TRUE) #' @param M (numeric) big M constraint for L0 norm loose end penalty (default 1e3) @@ -823,30 +823,56 @@ balance = function(gg, if (phased) { ## homologous extremity exclusivity - loose.constraints = rbind( - vars[type == "loose.in.indicator" & sign(snode.id)==1 & telomeric == FALSE, - .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id))], - vars[type == "loose.out.indicator" & sign(snode.id)==1 & telomeric == FALSE, - .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id))] - ) - - edge.constraints = rbind( - vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id)==1, - .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id.n1))], - vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id)==1, - .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id.n2))] - ) - - constraints = rbind(constraints, loose.constraints, edge.constraints, fill = TRUE) - - rhs = unique(rbind( - vars[type == "loose.in.indicator" & sign(snode.id)==1 & telomeric == FALSE, - .(value = 1, sense = "L", cid = paste("homol.extremity.exclusivity", hee.id))], - vars[type == "loose.out.indicator" & sign(snode.id)==1 & telomeric == FALSE, - .(value = 1, sense = "L", cid = paste("homol.extremity.exclusivity", hee.id))] - ), by = "cid") + ## loose.constraints = rbind( + ## vars[type == "loose.in.indicator" & sign(snode.id)==1 & telomeric == FALSE, + ## .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id))], + ## vars[type == "loose.out.indicator" & sign(snode.id)==1 & telomeric == FALSE, + ## .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id))] + ## ) + + ## edge.constraints = rbind( + ## vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id)==1, + ## .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id.n1))], + ## vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id)==1, + ## .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id.n2))] + ## ) + + ## constraints = rbind(constraints, loose.constraints, edge.constraints, fill = TRUE) + + ## rhs = unique(rbind( + ## vars[type == "loose.in.indicator" & sign(snode.id)==1 & telomeric == FALSE, + ## .(value = 1, sense = "L", cid = paste("homol.extremity.exclusivity", hee.id))], + ## vars[type == "loose.out.indicator" & sign(snode.id)==1 & telomeric == FALSE, + ## .(value = 1, sense = "L", cid = paste("homol.extremity.exclusivity", hee.id))] + ## ), by = "cid") - b = rbind(b, rhs, fill = TRUE) + ## b = rbind(b, rhs, fill = TRUE) + + ## if (verbose) { + ## message("Number of homologous extremity exclusivity constraints: ", + ## nrow(rhs)) + ## } + + ## ## grab node ids associated with ALT edges on the left + ## left.og.node.ids = c(gg$edges$dt[n1.side == "left" & type == "ALT", n1], + ## gg$edges$dt[n2.side == "left" & type == "ALT", n2]) + ## right.og.node.ids = c(gg$edges$dt[n1.side == "right" & type == "ALT", n1], + ## gg$edges$dt[n2.side == "right" & type == "ALT", n2]) + + ## ## fix loose ends for these nodes to zero + ## vars[type == "loose.in.indicator" & (snode.id %in% left.og.node.ids), + ## ":="(lb = 0, ub = 0)] + ## vars[type == "loose.out.indicator" & (snode.id %in% right.og.node.ids), + ## ":="(lb = 0, ub = 0)] + ## vars[type == "loose.in" & (snode.id %in% left.og.node.ids), + ## ":="(lb = 0, ub = 0)] + ## vars[type == "loose.out" & (snode.id %in% right.og.node.ids), + ## ":="(lb = 0, ub = 0)] + + ## if (verbose) { + ## message("Number of homologous loose ends: ", + ## length(left.og.node.ids) + length(right.og.node.ids)) + ## } ## reciprocal homologous extremity exclusivity ## implement configuration indicators (OR constraint) @@ -919,8 +945,16 @@ balance = function(gg, .(value = 1, id, cid = paste("rhee", c))] ) + ## filter constraints to only include things with >= 4 entries (e.g. must have an ALT edge) + rhomol.constraints[, n.entries := .N, by = cid] + rhomol.constraints = rhomol.constraints[n.entries > 3, .(value, id, cid)] + rhs = unique(rhomol.constraints[, .(value = 2, sense = "L", cid)], by = "cid") + if (verbose) { + message("Number of reciprocal homologous constraints: ", nrow(rhs)) + } + constraints = rbind(constraints, rhomol.constraints, fill = TRUE) b = rbind(b, rhs, fill = TRUE) From 8851640725b97d49f14f68aceb1bd43881f8977e Mon Sep 17 00:00:00 2001 From: Zi-Ning Choo Date: Thu, 24 Jun 2021 13:31:57 -0400 Subject: [PATCH 18/35] add basic code for cnloh to phased.binstats --- R/apps.R | 506 ++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 329 insertions(+), 177 deletions(-) diff --git a/R/apps.R b/R/apps.R index ed4f700c..e143c116 100644 --- a/R/apps.R +++ b/R/apps.R @@ -55,6 +55,7 @@ #' @param loose.collapse (parameter only relevant if L0 = TRUE) will count all unique (by coordinate) instances of loose ends in the graph as the loose end penalty, rather than each instance alone ... useful for fitting a metagenome graph (FALSE) #' @param phased (logical) indicates whether to run phased/unphased. default = FALSE #' @param ism (logical) additional ISM constraints (FALSE) +#' @param force.major (logical) force major allele CN to be >= minor allele CN (default FALSE) #' @param force.alt (logical) force incorporation of ALT edges, only applicable for phasing (default TRUE) #' @param cnloh (logical) allow CN LOH? only relevant if phasing = TRUE. default FALSE. #' @param lp (logical) solve as linear program using abs value (default TRUE) @@ -82,6 +83,7 @@ balance = function(gg, M = 1e3, phased = FALSE, ism = FALSE, + force.major = FALSE, force.alt = TRUE, cnloh = FALSE, lp = TRUE, @@ -353,7 +355,7 @@ balance = function(gg, ## vars[type == "edge", emarginal.id := junction.map[abs(sedge.id), seen.by.emarginal]] vars[type == "edge", emarginal.id := junction.map[abs(sedge.id), subject.id]] ## add weight and target total CN - emarginal = merge(unique( + emarginal = merge.data.table(unique( vars[type == "edge" & !is.na(emarginal.id),][, type := "emresidual"][, .(emarginal.id, sedge.id, lb = -M, ub = M, gid, type, vtype = "C", from, to)], by = "emarginal.id"), junction.map[, .(subject.id, weight, cn, fix)], @@ -516,11 +518,11 @@ balance = function(gg, vars[type == "loose.out.indicator" & snode.id > 0, ":="(c = cross.sedges$sedge.id[match(n1.full, cross.sedges$n1.full)])] ## merge this info into vars - vars = merge(vars, - alt.sedges[, .(sedge.id, s1, s2, s3, s4, c1, c2, c3, c4)], - by = "sedge.id", - all.x = TRUE, - all.y = FALSE) + vars = merge.data.table(vars, + alt.sedges[, .(sedge.id, s1, s2, s3, s4, c1, c2, c3, c4)], + by = "sedge.id", + all.x = TRUE, + all.y = FALSE) } } @@ -781,98 +783,100 @@ balance = function(gg, } ## fix loose ends at zero if there's a junction there (only valid if not phasing) - if (!phased) { + #' zchoo Tuesday, Jun 15, 2021 11:53:15 AM + #' this constraint appears to be valid even if running phasing. + ## if (!phased) { ## extremity exclusivity (relevant for ALL graphs) - loose.constraints = rbind( - vars[type == "loose.in.indicator" & sign(snode.id) == 1 & telomeric == FALSE, - .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))], - vars[type == "loose.out.indicator" & sign(snode.id) == 1 & telomeric == FALSE, - .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))] - ) + loose.constraints = rbind( + vars[type == "loose.in.indicator" & sign(snode.id) == 1 & telomeric == FALSE, + .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))], + vars[type == "loose.out.indicator" & sign(snode.id) == 1 & telomeric == FALSE, + .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))] + ) - edge.constraints = rbind( - vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id) == 1, - .(value = 1, id, cid = paste("extremity.exclusivity", ee.id.n1))], - vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id) == 1, - .(value = 1, id, cid = paste("extremity.exclusivity", ee.id.n2))] - ) + edge.constraints = rbind( + vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id) == 1, + .(value = 1, id, cid = paste("extremity.exclusivity", ee.id.n1))], + vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id) == 1, + .(value = 1, id, cid = paste("extremity.exclusivity", ee.id.n2))] + ) - constraints = rbind(constraints, loose.constraints, edge.constraints, fill = TRUE) + constraints = rbind(constraints, loose.constraints, edge.constraints, fill = TRUE) - loose.b = unique(loose.constraints[, .(cid, value = 1, sense = "L")], by = "cid") - edge.b = unique(edge.constraints[, .(cid, value = 1, sense = "L")], by = "cid") + loose.b = unique(loose.constraints[, .(cid, value = 1, sense = "L")], by = "cid") + edge.b = unique(edge.constraints[, .(cid, value = 1, sense = "L")], by = "cid") - b = rbind(b, edge.b, loose.b, fill = TRUE) + b = rbind(b, edge.b, loose.b, fill = TRUE) - edge.ee.ids = unique(c(vars[type == "edge.indicator", ee.id.n1], vars[type == "edge.indicator", ee.id.n2])) - edge.ee.ids = edge.ee.ids[!is.na(edge.ee.ids)] + edge.ee.ids = unique(c(vars[type == "edge.indicator", ee.id.n1], vars[type == "edge.indicator", ee.id.n2])) + edge.ee.ids = edge.ee.ids[!is.na(edge.ee.ids)] - ## TODO: add these as ub and lb instead of equality constraints - loose.zeros = rbind( - vars[type == "loose.in.indicator" & sign(snode.id) == 1 & ee.id %in% edge.ee.ids & telomeric == FALSE, - .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))], - vars[type == "loose.out.indicator" & sign(snode.id) == 1 & ee.id %in% edge.ee.ids & telomeric == FALSE, - .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))] - ) + loose.zeros = rbind( + vars[type == "loose.in.indicator" & sign(snode.id) == 1 & ee.id %in% edge.ee.ids & telomeric == FALSE, + .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))], + vars[type == "loose.out.indicator" & sign(snode.id) == 1 & ee.id %in% edge.ee.ids & telomeric == FALSE, + .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))] + ) - loose.zeros.rhs = unique(loose.zeros[, .(cid, value = 0, sense = "E")], by = "cid") + loose.zeros.rhs = unique(loose.zeros[, .(cid, value = 0, sense = "E")], by = "cid") - constraints = rbind(constraints, loose.zeros, fill = TRUE) - b = rbind(b, loose.zeros.rhs, fill = TRUE) - } + constraints = rbind(constraints, loose.zeros, fill = TRUE) + b = rbind(b, loose.zeros.rhs, fill = TRUE) + ## } if (phased) { ## homologous extremity exclusivity - ## loose.constraints = rbind( - ## vars[type == "loose.in.indicator" & sign(snode.id)==1 & telomeric == FALSE, - ## .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id))], - ## vars[type == "loose.out.indicator" & sign(snode.id)==1 & telomeric == FALSE, - ## .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id))] - ## ) - - ## edge.constraints = rbind( - ## vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id)==1, - ## .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id.n1))], - ## vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id)==1, - ## .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id.n2))] - ## ) - - ## constraints = rbind(constraints, loose.constraints, edge.constraints, fill = TRUE) - - ## rhs = unique(rbind( - ## vars[type == "loose.in.indicator" & sign(snode.id)==1 & telomeric == FALSE, - ## .(value = 1, sense = "L", cid = paste("homol.extremity.exclusivity", hee.id))], - ## vars[type == "loose.out.indicator" & sign(snode.id)==1 & telomeric == FALSE, - ## .(value = 1, sense = "L", cid = paste("homol.extremity.exclusivity", hee.id))] - ## ), by = "cid") + ## this is actually redundant with previous constraints + loose.constraints = rbind( + vars[type == "loose.in.indicator" & sign(snode.id)==1 & telomeric == FALSE, + .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id))], + vars[type == "loose.out.indicator" & sign(snode.id)==1 & telomeric == FALSE, + .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id))] + ) + + edge.constraints = rbind( + vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id)==1, + .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id.n1))], + vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id)==1, + .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id.n2))] + ) + + constraints = rbind(constraints, loose.constraints, edge.constraints, fill = TRUE) + + rhs = unique(rbind( + vars[type == "loose.in.indicator" & sign(snode.id)==1 & telomeric == FALSE, + .(value = 1, sense = "L", cid = paste("homol.extremity.exclusivity", hee.id))], + vars[type == "loose.out.indicator" & sign(snode.id)==1 & telomeric == FALSE, + .(value = 1, sense = "L", cid = paste("homol.extremity.exclusivity", hee.id))] + ), by = "cid") - ## b = rbind(b, rhs, fill = TRUE) - - ## if (verbose) { - ## message("Number of homologous extremity exclusivity constraints: ", - ## nrow(rhs)) - ## } - - ## ## grab node ids associated with ALT edges on the left - ## left.og.node.ids = c(gg$edges$dt[n1.side == "left" & type == "ALT", n1], - ## gg$edges$dt[n2.side == "left" & type == "ALT", n2]) - ## right.og.node.ids = c(gg$edges$dt[n1.side == "right" & type == "ALT", n1], - ## gg$edges$dt[n2.side == "right" & type == "ALT", n2]) - - ## ## fix loose ends for these nodes to zero - ## vars[type == "loose.in.indicator" & (snode.id %in% left.og.node.ids), - ## ":="(lb = 0, ub = 0)] - ## vars[type == "loose.out.indicator" & (snode.id %in% right.og.node.ids), - ## ":="(lb = 0, ub = 0)] - ## vars[type == "loose.in" & (snode.id %in% left.og.node.ids), - ## ":="(lb = 0, ub = 0)] - ## vars[type == "loose.out" & (snode.id %in% right.og.node.ids), - ## ":="(lb = 0, ub = 0)] - - ## if (verbose) { - ## message("Number of homologous loose ends: ", - ## length(left.og.node.ids) + length(right.og.node.ids)) - ## } + b = rbind(b, rhs, fill = TRUE) + + if (verbose) { + message("Number of homologous extremity exclusivity constraints: ", + nrow(rhs)) + } + + ## grab node ids associated with ALT edges on the left + left.og.node.ids = c(gg$edges$dt[n1.side == "left" & type == "ALT", n1], + gg$edges$dt[n2.side == "left" & type == "ALT", n2]) + right.og.node.ids = c(gg$edges$dt[n1.side == "right" & type == "ALT", n1], + gg$edges$dt[n2.side == "right" & type == "ALT", n2]) + + ## fix loose ends for these nodes to zero + vars[type == "loose.in.indicator" & (snode.id %in% left.og.node.ids), + ":="(lb = 0, ub = 0)] + vars[type == "loose.out.indicator" & (snode.id %in% right.og.node.ids), + ":="(lb = 0, ub = 0)] + vars[type == "loose.in" & (snode.id %in% left.og.node.ids), + ":="(lb = 0, ub = 0)] + vars[type == "loose.out" & (snode.id %in% right.og.node.ids), + ":="(lb = 0, ub = 0)] + + if (verbose) { + message("Number of homologous loose ends: ", + length(left.og.node.ids) + length(right.og.node.ids)) + } ## reciprocal homologous extremity exclusivity ## implement configuration indicators (OR constraint) @@ -985,8 +989,28 @@ balance = function(gg, b = rbind(b, edge.indicator.b, fill = TRUE) + ## force major allele to have higher CN than minor allele + ## may not work for phased blocks + if (force.major) { + + iconstraints = rbind( + vars[type == "node" & allele == "major" & snode.id > 0, + .(value = 1, id, cid = paste("force.major", og.node.id))], + vars[type == "node" & allele == "minor" & snode.id > 0, + .(value = -1, id, cid = paste("force.major", og.node.id))]) + + rhs = unique(vars[type == "node" & snode.id > 0 & allele == "major", + .(value = 0, sense = "G", cid = paste("force.major", og.node.id))], + by = "cid") + + constraints = rbind(constraints, iconstraints, fill = TRUE) + b = rbind(b, rhs, fill = TRUE) + + } + + ## force nonzero CN for ALT edges (because these have nonzero CN in original JaBbA output) - ## can become infeasible ... + ## can become infeasible if original graph is not compatible with ISM if (force.alt) { iconstraints = unique( vars[type == "edge.indicator" & ref.or.alt == "ALT", @@ -1015,69 +1039,82 @@ balance = function(gg, if (cnloh) { ## if allow CNLOH, the sum of edge indicators corresponding with og edge id is LEQ 2 - iconstraints = unique( - vars[type == "edge.indicator" & ref.or.alt == "REF", - .(value = 1, id, edge.id = abs(sedge.id), - cid = paste("ref.configuration.constraint.cnloh", og.edge.id))], - by = "edge.id" - ) + ## this is only allowed in constant CN regions and if breakpoint is not shared with any ALT edges + + if (!is.null(gg$edges$dt$cnloh)) { + cnloh.og.edges = gg$edges$dt[cnloh == TRUE, og.edge.id] %>% unique + if (verbose) { + message("Number of allowed CNLOH sites: ", length(cnloh.og.edges)) + } + } else { + warning("CNLOH not specified on edges. Allowing everywhere!") + cnloh.og.edges = gg$edges$dt$og.edge.id %>% unique + } + } else { - rhs = unique( - vars[type == "edge.indicator" & ref.or.alt == "REF", - .(value = 2, sense = "L", - cid = paste("ref.configuration.constraint.cnloh", og.edge.id))], - by = "cid" - ) + cnloh.og.edges = c() + + } - constraints = rbind(constraints, - iconstraints[, .(value, id, cid)], - fill = TRUE) - b = rbind(b, rhs, fill = TRUE) + ## add CNLOH constraints for applicable edges + + iconstraints = unique( + vars[type == "edge.indicator" & ref.or.alt == "REF" & og.edge.id %in% cnloh.og.edges, + .(value = 1, id, edge.id = abs(sedge.id), + cid = paste("ref.configuration.constraint.cnloh", og.edge.id))], + by = "edge.id" + ) - } else { - - iconstraints.from = unique( - vars[type == "edge.indicator" & ref.or.alt == "REF", - .(value = 1, id, - edge.id = abs(sedge.id), - snode.id = from, ## this is actually a misleading name because from is the row in gg$dt - cid = paste("ref.configuration.constraint.from", from))], - by = "edge.id" - ) + rhs = unique( + vars[type == "edge.indicator" & ref.or.alt == "REF" & og.edge.id %in% cnloh.og.edges, + .(value = 2, sense = "L", + cid = paste("ref.configuration.constraint.cnloh", og.edge.id))], + by = "cid" + ) - iconstraints.to = unique( - vars[type == "edge.indicator" & ref.or.alt == "REF", - .(value = 1, id, - edge.id = abs(sedge.id), - snode.id = to, - cid = paste("ref.configuration.constraint.to", to))], - by = "edge.id" - ) + constraints = rbind(constraints, + iconstraints[, .(value, id, cid)], + fill = TRUE) + b = rbind(b, rhs, fill = TRUE) - iconstraints = rbind(iconstraints.from, iconstraints.to) + ## add ISM constraints for other edges + + iconstraints.from = unique( + vars[type == "edge.indicator" & ref.or.alt == "REF" & !(og.edge.id %in% cnloh.og.edges), + .(value = 1, id, + edge.id = abs(sedge.id), + snode.id = from, ## this is actually a misleading name because from is the row in gg$dt + cid = paste("ref.configuration.constraint.from", from))], + by = "edge.id" + ) - ## sum to at most 1 if phased, unconstrained if unphased - iconstraints[, ":="(allele = gg$dt$allele[iconstraints$snode.id])] - - edge.indicator.b = unique(iconstraints[allele %in% c("major", "minor"), - .(value = 1, sense = "L", cid)], - by = "cid") - ## rbind( - ## unique(iconstraints[allele %in% c("major", "minor"), - ## .(value = 1, sense = "L", cid)], by = "cid"), - ## unique(iconstraints[!(allele %in% c("major", "minor")), - ## .(value = 2, sense = "L", cid)], by = "cid") - ## ) + iconstraints.to = unique( + vars[type == "edge.indicator" & ref.or.alt == "REF" & !(og.edge.id %in% cnloh.og.edges), + .(value = 1, id, + edge.id = abs(sedge.id), + snode.id = to, + cid = paste("ref.configuration.constraint.to", to))], + by = "edge.id" + ) + + iconstraints = rbind(iconstraints.from, iconstraints.to) + + ## sum to at most 1 if phased, unconstrained if unphased + iconstraints[, ":="(allele = gg$dt$allele[iconstraints$snode.id])] + + edge.indicator.b = unique(iconstraints[allele %in% c("major", "minor"), + .(value = 1, sense = "L", cid)], + by = "cid") + + constraints = rbind( + constraints, + iconstraints[allele %in% c("major", "minor"), + .(value, id, cid)], + fill = TRUE) + + ## add to b + b = rbind(b, edge.indicator.b, fill = TRUE) - constraints = rbind( - constraints, - iconstraints[allele %in% c("major", "minor"), - .(value, id, cid)], - fill = TRUE) - - ## add to b - b = rbind(b, edge.indicator.b, fill = TRUE) - } } if (L0) ## add "big M" constraints @@ -1229,7 +1266,7 @@ balance = function(gg, constraints, rbind( ## match up vars and marginal by snode.id and populate coefficients - merge(vars[type == 'node', !"rid"], ov, by = 'snode.id')[, .(value = 1, id , cid = paste('mresidual', rid))], + merge.data.table(vars[type == 'node', !"rid"], ov, by = 'snode.id')[, .(value = 1, id , cid = paste('mresidual', rid))], ## the residual is the difference between the sum and marginal cn vars[type == 'mresidual' & rid %in% ov$rid, .(value = -1, id, cid = paste('mresidual', rid))], fill = TRUE), @@ -1363,9 +1400,9 @@ balance = function(gg, vars$x = sol$x ## for debugging - ppc = function(x) (x %>% merge(vars, by = 'id') %>% merge(b, by = 'cid.char'))[, paste(paste(round(value.x, 1), '*', paste(type, gid, sep= '_'), '(', signif(x, 2), ')', collapse = ' + '), ifelse(sense[1] == 'E', '=', ifelse(sense[1] == 'G', '>=', '<=')), round(value.y[1],2)), by = cid.char] + ppc = function(x) (x %>% merge.data.table(vars, by = 'id') %>% merge.data.table(b, by = 'cid.char'))[, paste(paste(round(value.x, 1), '*', paste(type, gid, sep= '_'), '(', signif(x, 2), ')', collapse = ' + '), ifelse(sense[1] == 'E', '=', ifelse(sense[1] == 'G', '>=', '<=')), round(value.y[1],2)), by = cid.char] - ppv = function(x) {tmp = x %>% merge(constraints, by = 'id'); constraints[cid %in% tmp$cid, ] %>% ppc} + ppv = function(x) {tmp = x %>% merge.data.table(constraints, by = 'id'); constraints[cid %in% tmp$cid, ] %>% ppc} .check = function(x) data.table(obs = sign(as.numeric(round(Amat %*% x - bvec))), sense) @@ -2327,7 +2364,8 @@ phased.postprocess = function(gg, phase.blocks = NULL, mc.cores = 8, verbose = 1 og.node.id, marginal.cn, allele, var, nbins, weight, index, col, cn.old, cn, fix, ywid, - old.node.id = node.id)]) %>% gr.sort + old.node.id = node.id)], + seqinfo = seqinfo(gg$nodes$gr)) %>% gr.sort ## reset edge endpoints @@ -2359,7 +2397,7 @@ phased.postprocess = function(gg, phase.blocks = NULL, mc.cores = 8, verbose = 1 ## M = 1e3, ism = FALSE, verbose = verbose, epgap = 1e-4, ## marginal = NULL) - new.nodes.gr = inferLoose(new.nodes.gr, new.edges.dt) + new.nodes.gr = gGnome:::inferLoose(new.nodes.gr, new.edges.dt) postprocessed.gg = gG(nodes = new.nodes.gr, edges = new.edges.dt) postprocessed.gg$set(y.field = "cn") @@ -2379,8 +2417,10 @@ phased.postprocess = function(gg, phase.blocks = NULL, mc.cores = 8, verbose = 1 #' @param purity (numeric) #' @param ploidy (numeric) #' @param count.field (str) field containing allele read counts (default count) -#' @param allele.field (str) field for containing allele label for read counts (default allele) +#' @param allele.field (str) field for containing major/minor allele label (default allele) +#' @param haplotype.field (str) field containing h1/h2 label (default haplotype) #' @param phase.blocks (GRanges) GRanges containing phase blocks (e.g. from HAPCUT). default NULL. +#' @param breaks (GRanges) extra breakpoints to introduce. default NULL. #' @param edge.phase.dt (data.table) with columns n1.major, n2.major, n1.minor, n2.minor and edge.id providing major/minor allele counts #' @param vbase.count.thres (int) number of variant base counts required to phase edges (default 5) #' @param vbase.prop.thres (float) proportion of allele excess required to phase edges (default 0.9) @@ -2391,8 +2431,11 @@ phased.postprocess = function(gg, phase.blocks = NULL, mc.cores = 8, verbose = 1 #' @return gGraph whose nodes are annotated with $cn.major, $cn.minor, $haplotype, and $weight fields #' @export phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL, - count.field = "count", allele.field = "allele", + count.field = "count", + allele.field = "allele", + haplotype.field = "haplotype", phase.blocks = NULL, + breaks = NULL, edge.phase.dt = NULL, vbase.count.thres = 5, vbase.prop.thres = 0.9, min.bins = 3, min.var = 1e-3, @@ -2427,12 +2470,17 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL, stop("phase.blocks must be GRanges") } } + if (!is.null(breaks)) { + if (!inherits(breaks, 'GRanges')) { + stop("breaks must be GRanges") + } + } if (!is.null(edge.phase.dt)) { if (!is.data.table(edge.phase.dt)) { warning("edge.phase.dt must be data.table. ignoring this input.") edge.phase.dt = NULL } - if (!all(c("edge.id", "n1.major", "n2.major", "n1.minor", "n2.minor") %in% colnames(edge.phase.dt))) { + if (!all(c("edge.id", "n1.h1", "n2.h1", "n1.h2", "n2.h2") %in% colnames(edge.phase.dt))) { warning("edge.phase.dt does not have the required columns") edge.phase.dt = NULL } @@ -2456,6 +2504,45 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL, return(cn) } + if (verbose) { + message("Checking for extra breakpoints") + + if (!is.null(breaks) && length(breaks) > 0) { + ## get just the starting point and remove strand + br = gr.stripstrand(gr.start(breaks)) + + if (verbose) { + message("Number of new breaks: ", length(br)) + message("Creating new gGraph with incorporated breaks...") + } + + ## save old node IDs + old.nodes = gg$nodes$gr[, "node.id"] + old.nodes$unsplit.id = old.nodes$node.id + + ## create new gGraph with additional breakpoints + new.nodes = gr.breaks(bps = br, query = gg$nodes$gr) + gg = gG(breaks = new.nodes, junctions = gg$junctions[type == "ALT"]) + + ## create node map to find which nodes were split up + node.map = as.data.table(gg$nodes$gr[, "node.id"] %$% old.nodes[, "unsplit.id"]) + + ## identify which REF edges are internal to an OG node + edge.map = gg$edges$dt + edge.map[, n1.unsplit := node.map$unsplit.id[match(n1, unsplit.nodes$node.id)]] + edge.map[, n2.unsplit := node.map$unsplit.id[match(n2, unsplit.nodes$node.id)]] + + internal.edges = edge.map[type == "REF" & n1.unsplit == n2.unsplit, edge.id] + + ## mark CNLOH in gg + gg$edges[internal.edges]$mark(cnloh = TRUE) + + if (verbose) { + message("Number of internal edges marked in parent graph: ", length(internal.edges)) + } + } + } + if (verbose) { message("Preparing phased gGraph nodes") } @@ -2475,13 +2562,57 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL, message("Adding phase block information") } pblocks = phase.blocks[, c()] - pblocks$pblock = 1:length(pblocks) %>% as.character - og.nodes = og.nodes %$% pblocks[, "pblock"] + + ## find index of the right and left phase blocks + left.pblock = gr.match(gr.start(og.nodes), pblocks) + right.pblock = gr.match(gr.end(og.nodes), pblocks) + + ## annotate GRanges with L and R phase block indices + og.nodes$left.pblock = left.pblock + og.nodes$right.pblock = right.pblock } - phased.gg.nodes = c(og.nodes, og.nodes) - phased.gg.nodes$allele = c(rep("major", length(og.nodes)), - rep("minor", length(og.nodes))) + if (haplotype.field %in% names(values(bins))) { + if (verbose) { + message("Adding haplotype information") + } + + ## check that the entries in this field are valid + if (!all(values(bins)[[haplotype.field]] %in% c("h1", "h2"))) { + stop("values in haplotype.field must be either h1 or h2") + } + + ## identify phase block of the left side of the node + pblock.map = gr.match(pblocks, bins[values(bins)[[allele.field]] == "major"]) + pblock.major.haplotype = values(bins)[[haplotype.field]][pblock.map] + node.left.major.haplotype = pblock.major.haplotype[og.nodes$left.pblock] + node.right.major.haplotype = pblock.major.haplotype[og.nodes$right.pblock] + + major.nodes = copy(og.nodes) + major.nodes$left.haplotype = node.left.major.haplotype + major.nodes$right.haplotype = node.right.major.haplotype + minor.nodes = copy(og.nodes) + minor.nodes$left.haplotype = ifelse(node.left.major.haplotype == "h1", + "h2", + "h1") + minor.nodes$right.haplotype = ifelse(node.right.major.haplotype == "h1", + "h2", + "h1") + + ## major.map = gr.match(og.nodes, bins[values(bins)[[allele.field]] == "major"]) + ## major.nodes = copy(og.nodes) + ## major.nodes$haplotype = values(bins)[[haplotype.field]][major.map] + ## minor.nodes = copy(og.nodes) + ## minor.nodes$haplotype = ifelse(major.nodes$haplotype == "h1", "h2", "h1") + + major.nodes$allele = "major" + minor.nodes$allele = "minor" + phased.gg.nodes = c(major.nodes, minor.nodes) + } else { + phased.gg.nodes = c(og.nodes, og.nodes) + phased.gg.nodes$allele = c(rep("major", length(og.nodes)), + rep("minor", length(og.nodes))) + } if (verbose) { message("Computing allele CN") @@ -2549,50 +2680,71 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL, ## add phase block information to edges (for linked reads) if (!is.null(phase.blocks)) { - phased.gg.edges[, ":="(n1.pblock = phased.gg.nodes$pblock[n1], - n2.pblock = phased.gg.nodes$pblock[n2])] + ## add haplotype information of edge endpoints + phased.gg.edges[, ":="(n1.haplotype = ifelse(n1.side == "left", + phased.gg.nodes$left.haplotype[n1], + phased.gg.nodes$right.haplotype[n1]), + n2.haplotype = ifelse(n2.side == "left", + phased.gg.nodes$left.haplotype[n2], + phased.gg.nodes$right.haplotype[n2]))] + + ## add phase block infromation of edge end points + phased.gg.edges[, ":="(n1.pblock = ifelse(n1.side == "left", + phased.gg.nodes$left.pblock[n1], + phased.gg.nodes$right.pblock[n1]), + n2.pblock = ifelse(n2.side == "left", + phased.gg.nodes$left.pblock[n2], + phased.gg.nodes$right.pblock[n2]))] + ## fix cross REF edges to zero within phase blocks - phased.gg.edges[(n1.pblock == n2.pblock) & type == "REF" & connection == "cross", - ":="(cn = 0, fix = 1)] + phased.gg.edges[(n1.pblock == n2.pblock) & type == "REF" & (n1.haplotype != n2.haplotype), + ":="(ub = 0, lb = 0)] + if (verbose) { message("Number of REF cross edges within phased blocks: ", - nrow(phased.gg.edges[(n1.pblock == n2.pblock) & - type == "REF" & - connection == "cross"])) + phased.gg.edges[(n1.pblock == n2.pblock) & + type == "REF" & + (n1.haplotype != n2.haplotype), .N]) } } + ## mark CNLOH edges + if (!is.null(gg$edges$dt$cnloh)) { + og.cnloh.edges = gg$edges$dt[cnloh == TRUE, edge.id] + phased.gg.edges[og.edge.id %in% og.cnloh.edges, cnloh := TRUE] + } + ## identify phased edges (for linked reads) if (!is.null(edge.phase.dt)) { ## compute totals - ephase = edge.phase.dt[, .(edge.id, n1.major, n2.major, n1.minor, n2.minor, - n1.total = n1.major + n1.minor, - n2.total = n2.major + n2.minor)][ + ephase = edge.phase.dt[, .(edge.id, n1.h1, n2.h1, n1.h2, n2.h2, + n1.total = n1.h1 + n1.h2, + n2.total = n2.h1 + n2.h2)][ (n1.total > vbase.count.thres) | (n2.total > vbase.count.thres)] ## count fraction of reads corresponding with each allele - ephase[, n1.major.frac := n1.major / n1.total] - ephase[, n2.major.frac := n2.major / n2.total] - ephase[, n1.minor.frac := n1.minor / n1.total] - ephase[, n2.minor.frac := n2.minor / n1.total] + ephase[, n1.h1.frac := n1.h1 / n1.total] + ephase[, n2.h1.frac := n2.h1 / n2.total] + ephase[, n1.h2.frac := n1.h2 / n1.total] + ephase[, n2.h2.frac := n2.h2 / n1.total] ## set phase if passing proportion threshold (vbase.prop.thres) - ephase[n1.major.frac > vbase.prop.thres, n1.phase := "major"] - ephase[n1.minor.frac > vbase.prop.thres, n1.phase := "minor"] - ephase[n2.major.frac > vbase.prop.thres, n2.phase := "major"] - ephase[n2.minor.frac > vbase.prop.thres, n2.phase := "minor"] + ephase[n1.h1.frac > vbase.prop.thres, n1.phase := "h1"] + ephase[n1.h2.frac > vbase.prop.thres, n1.phase := "h2"] + ephase[n2.h1.frac > vbase.prop.thres, n2.phase := "h1"] + ephase[n2.h2.frac > vbase.prop.thres, n2.phase := "h2"] ## add phase information to edges data frame phased.gg.edges[, n1.phase := ephase$n1.phase[match(og.edge.id, ephase$edge.id)]] phased.gg.edges[, n2.phase := ephase$n2.phase[match(og.edge.id, ephase$edge.id)]] ## fix things to zero - phased.gg.edges[n1.phase == "major" & n1.allele == "minor", ":="(fix = 1, cn = 0)] - phased.gg.edges[n2.phase == "major" & n2.allele == "minor", ":="(fix = 1, cn = 0)] - phased.gg.edges[n1.phase == "minor" & n1.allele == "major", ":="(fix = 1, cn = 0)] - phased.gg.edges[n2.phase == "minor" & n2.allele == "major", ":="(fix = 1, cn = 0)] + phased.gg.edges[n1.phase == "h1" & n1.haplotype == "h2", ":="(ub = 0, lb = 0)] + phased.gg.edges[n2.phase == "h1" & n2.haplotype == "h2", ":="(ub = 0, lb = 0)] + phased.gg.edges[n1.phase == "h2" & n1.haplotype == "h1", ":="(ub = 0, lb = 0)] + phased.gg.edges[n2.phase == "h2" & n2.haplotype == "h1", ":="(ub = 0, lb = 0)] if (verbose) { message("Number of ALT edges with n1 side fixed: ", sum(!is.na(phased.gg.edges$n1.phase))) From 8fa33e516662e20bf1c511aa1b9d4bc2c2fc206f Mon Sep 17 00:00:00 2001 From: Zi-Ning Choo Date: Sat, 26 Jun 2021 10:26:15 -0400 Subject: [PATCH 19/35] fix bug in phased.binstats --- R/apps.R | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/R/apps.R b/R/apps.R index e143c116..5584acd4 100644 --- a/R/apps.R +++ b/R/apps.R @@ -1047,8 +1047,8 @@ balance = function(gg, message("Number of allowed CNLOH sites: ", length(cnloh.og.edges)) } } else { - warning("CNLOH not specified on edges. Allowing everywhere!") - cnloh.og.edges = gg$edges$dt$og.edge.id %>% unique + warning("CNLOH not specified on edges. Disallowing!") + cnloh.og.edges = c() } } else { @@ -2427,6 +2427,7 @@ phased.postprocess = function(gg, phase.blocks = NULL, mc.cores = 8, verbose = 1 #' @param min.bins (numeric) minimum number of bins for intra segment variance (default 3) #' @param min.var (numeric) min allowable variance (default 0.1) #' @param verbose (bool) default TRUE for debugging +#' @param min.width (numeric) min allowable width for cnloh-adjacent node. default 1 Mbp #' @param mc.cores (int) number of cores #' @return gGraph whose nodes are annotated with $cn.major, $cn.minor, $haplotype, and $weight fields #' @export @@ -2439,7 +2440,7 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL, edge.phase.dt = NULL, vbase.count.thres = 5, vbase.prop.thres = 0.9, min.bins = 3, min.var = 1e-3, - verbose = TRUE, mc.cores = 8) + verbose = TRUE, min.width = 1e6, mc.cores = 8) { if (verbose) { message("Checking inputs") @@ -2527,12 +2528,18 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL, ## create node map to find which nodes were split up node.map = as.data.table(gg$nodes$gr[, "node.id"] %$% old.nodes[, "unsplit.id"]) + ## create a list of short nodes (CNLOH not allowed here!) + short.nodes = as.data.table(gg$nodes$gr)[width < min.width, node.id] + ## identify which REF edges are internal to an OG node edge.map = gg$edges$dt - edge.map[, n1.unsplit := node.map$unsplit.id[match(n1, unsplit.nodes$node.id)]] - edge.map[, n2.unsplit := node.map$unsplit.id[match(n2, unsplit.nodes$node.id)]] + edge.map[, n1.unsplit := node.map$unsplit.id[match(n1, node.map$node.id)]] + edge.map[, n2.unsplit := node.map$unsplit.id[match(n2, node.map$node.id)]] - internal.edges = edge.map[type == "REF" & n1.unsplit == n2.unsplit, edge.id] + internal.edges = edge.map[type == "REF" & + n1.unsplit == n2.unsplit & + !(n1 %in% short.nodes) & + !(n2 %in% short.nodes), edge.id] ## mark CNLOH in gg gg$edges[internal.edges]$mark(cnloh = TRUE) From be135594c811604cae9232733feb1b899c0bb3f2 Mon Sep 17 00:00:00 2001 From: Zi-Ning Choo Date: Thu, 1 Jul 2021 11:32:39 -0400 Subject: [PATCH 20/35] update phased.binstats and balance to penalize CNLOH and reduce false positives --- R/apps.R | 96 +++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 71 insertions(+), 25 deletions(-) diff --git a/R/apps.R b/R/apps.R index 5584acd4..d68aa64e 100644 --- a/R/apps.R +++ b/R/apps.R @@ -1012,8 +1012,13 @@ balance = function(gg, ## force nonzero CN for ALT edges (because these have nonzero CN in original JaBbA output) ## can become infeasible if original graph is not compatible with ISM if (force.alt) { + + if (ism) { + warning("Forcing ALT edges while running ISM can make some problems infeasible!") + } + iconstraints = unique( - vars[type == "edge.indicator" & ref.or.alt == "ALT", + vars[type == "edge.indicator" & ref.or.alt == "ALT" & cnloh != TRUE, .(value = 1, id, og.edge.id, edge.id = abs(sedge.id), cid = paste("edge.indicator.sum.lb", og.edge.id))], @@ -1026,7 +1031,7 @@ balance = function(gg, fill = TRUE) edge.indicator.b = unique( - vars[type == "edge.indicator" & ref.or.alt == "ALT", + vars[type == "edge.indicator" & ref.or.alt == "ALT" & cnloh != TRUE, .(value = 1, sense = "G", cid = paste("edge.indicator.sum.lb", og.edge.id))], by = "cid" ) @@ -1041,46 +1046,58 @@ balance = function(gg, ## if allow CNLOH, the sum of edge indicators corresponding with og edge id is LEQ 2 ## this is only allowed in constant CN regions and if breakpoint is not shared with any ALT edges + ## penalize CNLOH edges + if (!is.null(gg$edges$dt$cnloh)) { cnloh.og.edges = gg$edges$dt[cnloh == TRUE, og.edge.id] %>% unique if (verbose) { message("Number of allowed CNLOH sites: ", length(cnloh.og.edges)) } + + ## add CNLOH annotation to variables + ## browser() + vars[, cnloh := FALSE] + vars[(type == "edge.indicator" | type == "edge" | type == "eresidual") & + ref.or.alt == "ALT" & og.edge.id %in% cnloh.og.edges, + ":="(cnloh = TRUE)] + } else { warning("CNLOH not specified on edges. Disallowing!") cnloh.og.edges = c() + vars[, cnloh := FALSE] } } else { cnloh.og.edges = c() + vars[, cnloh := FALSE] } ## add CNLOH constraints for applicable edges - iconstraints = unique( - vars[type == "edge.indicator" & ref.or.alt == "REF" & og.edge.id %in% cnloh.og.edges, - .(value = 1, id, edge.id = abs(sedge.id), - cid = paste("ref.configuration.constraint.cnloh", og.edge.id))], - by = "edge.id" - ) - - rhs = unique( - vars[type == "edge.indicator" & ref.or.alt == "REF" & og.edge.id %in% cnloh.og.edges, - .(value = 2, sense = "L", - cid = paste("ref.configuration.constraint.cnloh", og.edge.id))], - by = "cid" - ) - - constraints = rbind(constraints, - iconstraints[, .(value, id, cid)], - fill = TRUE) - b = rbind(b, rhs, fill = TRUE) - - ## add ISM constraints for other edges + ## iconstraints = unique( + ## vars[type == "edge.indicator" & ref.or.alt == "REF" & og.edge.id %in% cnloh.og.edges, + ## .(value = 1, id, edge.id = abs(sedge.id), + ## cid = paste("ref.configuration.constraint.cnloh", og.edge.id))], + ## by = "edge.id" + ## ) + + ## rhs = unique( + ## vars[type == "edge.indicator" & ref.or.alt == "REF" & og.edge.id %in% cnloh.og.edges, + ## .(value = 2, sense = "L", + ## cid = paste("ref.configuration.constraint.cnloh", og.edge.id))], + ## by = "cid" + ## ) + + ## constraints = rbind(constraints, + ## iconstraints[, .(value, id, cid)], + ## fill = TRUE) + ## b = rbind(b, rhs, fill = TRUE) + + ## add ISM constraints for ALL REF edges (as CNLOH is now marked as ALT) iconstraints.from = unique( - vars[type == "edge.indicator" & ref.or.alt == "REF" & !(og.edge.id %in% cnloh.og.edges), + vars[type == "edge.indicator" & ref.or.alt == "REF", ##& !(og.edge.id %in% cnloh.og.edges), .(value = 1, id, edge.id = abs(sedge.id), snode.id = from, ## this is actually a misleading name because from is the row in gg$dt @@ -1089,7 +1106,7 @@ balance = function(gg, ) iconstraints.to = unique( - vars[type == "edge.indicator" & ref.or.alt == "REF" & !(og.edge.id %in% cnloh.og.edges), + vars[type == "edge.indicator" & ref.or.alt == "REF", ##& !(og.edge.id %in% cnloh.og.edges), .(value = 1, id, edge.id = abs(sedge.id), snode.id = to, @@ -1377,6 +1394,16 @@ balance = function(gg, Qmat = NULL ## no Q if solving LP } + ## browser() + if (cnloh) { + + if ("cnloh" %in% colnames(vars)) { + indices = which(vars$type == "edge.indicator" & !is.na(vars$cnloh) & vars$cnloh == TRUE) + cvec[indices] = lambda + } + } + + lb = vars$lb ub = vars$ub @@ -2542,11 +2569,14 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL, !(n2 %in% short.nodes), edge.id] ## mark CNLOH in gg + gg$edges$mark(cnloh = FALSE) gg$edges[internal.edges]$mark(cnloh = TRUE) if (verbose) { message("Number of internal edges marked in parent graph: ", length(internal.edges)) } + } else { + gg$edges$mark(cnloh = FALSE) } } @@ -2719,9 +2749,25 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL, ## mark CNLOH edges if (!is.null(gg$edges$dt$cnloh)) { og.cnloh.edges = gg$edges$dt[cnloh == TRUE, edge.id] - phased.gg.edges[og.edge.id %in% og.cnloh.edges, cnloh := TRUE] + + ## identify CNLOH cross edges and add extra edges that are ALT + phased.cnloh.edges = phased.gg.edges[og.edge.id %in% og.cnloh.edges & (n1.allele != n2.allele),] + phased.cnloh.edges[, ":="(cnloh = TRUE, type = "ALT", class = "CNLOH")] + + ## phased.gg.edges[og.edge.id %in% og.cnloh.edges, cnloh := TRUE] + phased.gg.edges = rbind(phased.gg.edges, phased.cnloh.edges, fill = TRUE) + + if (verbose) { + message("Number of CNLOH ALT edges added: ", phased.cnloh.edges[, .N]) + } } + ## OLD: CNLOH was REF + ## if (!is.null(gg$edges$dt$cnloh)) { + ## og.cnloh.edges = gg$edges$dt[cnloh == TRUE, edge.id] + ## phased.gg.edges[og.edge.id %in% og.cnloh.edges, cnloh := TRUE] + ## } + ## identify phased edges (for linked reads) if (!is.null(edge.phase.dt)) { From 7c356adfab6d12fe1131e45f05ce6d548c7e6dd1 Mon Sep 17 00:00:00 2001 From: Zi-Ning Choo Date: Mon, 5 Jul 2021 15:16:23 -0400 Subject: [PATCH 21/35] update phased.postprocess to disjoin gGraph before compressing allelic balance nodes --- R/apps.R | 129 ++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 118 insertions(+), 11 deletions(-) diff --git a/R/apps.R b/R/apps.R index d68aa64e..715f0ad6 100644 --- a/R/apps.R +++ b/R/apps.R @@ -2323,11 +2323,116 @@ phased.postprocess = function(gg, phase.blocks = NULL, mc.cores = 8, verbose = 1 } gg = gg$copy + ## identify unphased nodes to disjoin against + ## browser() + + if (verbose) { + message("Disjoining input graph against unphased node GRanges") + } + + seed.dt = merge.data.table(gg$nodes$dt[allele == "major", .(seqnames, start, end, width, + major.cn = cn, og.node.id, + major.loose.left = loose.left, + major.loose.right = loose.right, + major.node.id = node.id)], + gg$nodes$dt[allele == "minor", .(og.node.id, minor.cn = cn, + minor.loose.left = loose.left, + minor.loose.right = loose.right, + minor.node.id = node.id)], + by = "og.node.id") + + ## check whether there is an allelic CN change on either the left or right + ## sort nodes by start and end. nodes should be sorted, but just in case + seed.dt = seed.dt %>% split(seed.dt$seqnames) %>% lapply(function(dt) {dt[order(start),]}) %>% rbindlist + + ## check for major or minor allele CN change on the left/right + seed.dt[, major.prev := data.table::shift(major.cn, n = 1, type = "lag")] + seed.dt[, major.next := data.table::shift(major.cn, n = 1, type = "lead")] + seed.dt[, minor.prev := data.table::shift(minor.cn, n = 1, type = "lag")] + seed.dt[, minor.next := data.table::shift(minor.cn, n = 1, type = "lead")] + + seed.dt[, major.left.cn := major.cn != major.prev] + seed.dt[, major.right.cn := major.cn != major.next] + seed.dt[, minor.left.cn := minor.cn != minor.prev] + seed.dt[, minor.right.cn := minor.cn != minor.next] + + ## drop nodes with CN imbalance + seed.dt = seed.dt[major.cn == minor.cn,] + + ## identify whether the major or minor node is joined to an ALT edge with non-zero CN + nonzero.alt.left = c(gg$edges$dt[type == "ALT" & cn > 0 & n1.side == "left", n1], + gg$edges$dt[type == "ALT" & cn > 0 & n2.side == "left", n2]) + nonzero.alt.right = c(gg$edges$dt[type == "ALT" & cn > 0 & n1.side == "right", n1], + gg$edges$dt[type == "ALT" & cn > 0 & n2.side == "right", n2]) + + seed.dt[, ":="(major.alt.left = major.node.id %in% nonzero.alt.left, + major.alt.right = major.node.id %in% nonzero.alt.right, + minor.alt.left = minor.node.id %in% nonzero.alt.left, + minor.alt.right = minor.node.id %in% nonzero.alt.right)] + + ## label telomeric + sl = seqlengths(gg$nodes$gr) + seed.dt[, left.telomeric := start == 1] + seed.dt[, right.telomeric := end == sl[as.character(seqnames)]] + + ## check whether the left and right sides are EITHER loose + seed.dt[, left.alt := (major.alt.left == TRUE | minor.alt.left == TRUE | + major.loose.left == TRUE | minor.loose.left == TRUE) & + (major.left.cn == TRUE | minor.left.cn == TRUE) & (left.telomeric == FALSE)] + + seed.dt[, right.alt := (major.alt.right == TRUE | minor.alt.right == TRUE | + major.loose.right == TRUE | minor.loose.right == TRUE) & + (major.right.cn == TRUE | minor.right.cn == TRUE) & (right.telomeric == FALSE)] + + ## shift the end points + seed.dt[left.alt == TRUE & width > 1, start := start + 1] + seed.dt[right.alt == TRUE & width > 1, end := end - 1] + + ## create GRanges + seed.gr = dt2gr(seed.dt[, .(seqnames, start, end)]) + + ## disjoin gGraph against this GRanges + gg = gg$disjoin(seed.gr, collapse = FALSE) + + ## any new edges introduced have to be straight + gg$edges[is.na(connection)]$mark(connection = "straight") + + ## fill in other metadata + n1 = gg$edges$dt[, n1] + n2 = gg$edges$dt[, n2] + + ## borrow CN from surrounding nodes + n1.na = gg$edges$dt[is.na(cn), n1] + gg$edges[is.na(cn)]$mark(cn = gg$nodes$dt$cn[match(n1.na, gg$nodes$dt$node.id)]) + + ## fix loose end CNs + gg = gGnome:::loosefix(gg) + + ## label n1/n2 allele and chromosome + gg$edges$mark(n1.allele = gg$nodes$dt$allele[match(n1, gg$nodes$dt$node.id)]) + gg$edges$mark(n2.allele = gg$nodes$dt$allele[match(n2, gg$nodes$dt$node.id)]) + gg$edges$mark(n1.chr = gg$nodes$dt$seqnames[match(n1, gg$nodes$dt$node.id)]) + gg$edges$mark(n2.chr = gg$nodes$dt$seqnames[match(n2, gg$nodes$dt$node.id)]) + + ## reset og.node.ids and og.edge.ids + node.id.key = gg$nodes$dt[, .(seqnames, start, end, rg = paste0(seqnames, ":", start, "-", end), node.id)] + node.id.key[, rg := as.integer(factor(rg))] + gg$nodes$mark(og.node.id = node.id.key[, rg]) + + ## reset og edge ids + edge.id.key = gg$edges$dt[, .(n1, n1.side, n2, n2.side, type)] + edge.id.key[, ":="(n1.og = node.id.key$rg[match(n1, node.id.key$node.id)], + n2.og = node.id.key$rg[match(n2, node.id.key$node.id)])] + edge.id.key[, rg := paste(n1.og, n2.og, n1.side, n2.side, type)] + edge.id.key[, rg := as.integer(factor(rg))] + gg$edges$mark(og.edge.id = edge.id.key[, rg]) + + ## identify nodes without CN imbalance if (verbose) { message("Identifying nodes without CN imbalance") } - og.node.balance = gg$nodes$dt[, .(og.node.id, allele, cn)] %>% + og.node.balance = gg$nodes$dt[width > 1, .(og.node.id, allele, cn)] %>% ## filter by width to keep 1bp stubs dcast.data.table(og.node.id ~ allele, value.var = "cn") og.node.balance[, cn.imbalance := (major != minor)] @@ -2387,12 +2492,14 @@ phased.postprocess = function(gg, phase.blocks = NULL, mc.cores = 8, verbose = 1 new.nodes.dt[allele == "unphased", col := alpha("gray", 0.5)] ## get nodes as GRanges + new.nodes.dt = new.nodes.dt %>% split(new.nodes.dt$seqnames) %>% + lapply(function(dt) {dt[order(start),]}) %>% rbindlist new.nodes.gr = dt2gr(new.nodes.dt[, .(seqnames, start, end, og.node.id, marginal.cn, allele, var, nbins, weight, index, col, cn.old, cn, fix, ywid, old.node.id = node.id)], - seqinfo = seqinfo(gg$nodes$gr)) %>% gr.sort + seqinfo = seqinfo(gg$nodes$gr)) ## reset edge endpoints @@ -2405,24 +2512,24 @@ phased.postprocess = function(gg, phase.blocks = NULL, mc.cores = 8, verbose = 1 new.edges.dt[, n1 := match(n1, new.nodes.gr$old.node.id)] new.edges.dt[, n2 := match(n2, new.nodes.gr$old.node.id)] - new.edges.dt = new.edges.dt[cn > 0,] + ## label REF edges as straight or cross based on og.node.id + ## browser() + ## new.edges.dt[type == "REF" & cn > 0, length(unique(connection)), by = og.edge.id] %>% summary + new.edges.dt[type == "REF", orientation := .SD$connection[which(.SD$cn > 0)][1], by = og.edge.id] + + ## only keep REF edges in the correct orientation (regardless of CN) + ## only keep ALT edges with CN > 0 + new.edges.dt = new.edges.dt[(type == "REF" & connection == orientation) | + (type == "ALT" & cn > 0),] ## remove edge CN and fix if ("fix" %in% colnames(new.edges.dt)) { new.edges.dt$fix = NULL } - ## if ("cn" %in% colnames(new.edges.dt)) { - ## new.edges.dt$cn = NULL - ## } - - if (verbose) { message("Creating new gGraph") } - ## postprocessed.gg = balance(gG(nodes = new.nodes.gr, edges = new.edges.dt), - ## M = 1e3, ism = FALSE, verbose = verbose, epgap = 1e-4, - ## marginal = NULL) new.nodes.gr = gGnome:::inferLoose(new.nodes.gr, new.edges.dt) From be1d2f328badaafaad93d9366bad2a499617edc6 Mon Sep 17 00:00:00 2001 From: Zi-Ning Choo Date: Wed, 14 Jul 2021 14:11:29 -0400 Subject: [PATCH 22/35] cnloh detection update --- R/apps.R | 112 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 67 insertions(+), 45 deletions(-) diff --git a/R/apps.R b/R/apps.R index 715f0ad6..e51f38ba 100644 --- a/R/apps.R +++ b/R/apps.R @@ -1049,16 +1049,16 @@ balance = function(gg, ## penalize CNLOH edges if (!is.null(gg$edges$dt$cnloh)) { - cnloh.og.edges = gg$edges$dt[cnloh == TRUE, og.edge.id] %>% unique + cnloh.edges = gg$edges$dt[cnloh == TRUE & type == "ALT", edge.id] %>% unique if (verbose) { - message("Number of allowed CNLOH sites: ", length(cnloh.og.edges)) + message("Number of marked CNLOH edges: ", length(cnloh.edges)) } ## add CNLOH annotation to variables ## browser() vars[, cnloh := FALSE] vars[(type == "edge.indicator" | type == "edge" | type == "eresidual") & - ref.or.alt == "ALT" & og.edge.id %in% cnloh.og.edges, + ref.or.alt == "ALT" & (abs(sedge.id) %in% cnloh.edges), ":="(cnloh = TRUE)] } else { @@ -1400,6 +1400,8 @@ balance = function(gg, if ("cnloh" %in% colnames(vars)) { indices = which(vars$type == "edge.indicator" & !is.na(vars$cnloh) & vars$cnloh == TRUE) cvec[indices] = lambda + + message("Number of penalized CNLOH edges: ", length(indices)) } } @@ -2378,11 +2380,13 @@ phased.postprocess = function(gg, phase.blocks = NULL, mc.cores = 8, verbose = 1 ## check whether the left and right sides are EITHER loose seed.dt[, left.alt := (major.alt.left == TRUE | minor.alt.left == TRUE | major.loose.left == TRUE | minor.loose.left == TRUE) & - (major.left.cn == TRUE | minor.left.cn == TRUE) & (left.telomeric == FALSE)] + (left.telomeric == FALSE)] + ## (major.left.cn == TRUE | minor.left.cn == TRUE) & (left.telomeric == FALSE)] seed.dt[, right.alt := (major.alt.right == TRUE | minor.alt.right == TRUE | major.loose.right == TRUE | minor.loose.right == TRUE) & - (major.right.cn == TRUE | minor.right.cn == TRUE) & (right.telomeric == FALSE)] + (right.telomeric == FALSE)] + ## (major.right.cn == TRUE | minor.right.cn == TRUE) & (right.telomeric == FALSE)] ## shift the end points seed.dt[left.alt == TRUE & width > 1, start := start + 1] @@ -2641,50 +2645,64 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL, if (verbose) { message("Checking for extra breakpoints") + } - if (!is.null(breaks) && length(breaks) > 0) { - ## get just the starting point and remove strand - br = gr.stripstrand(gr.start(breaks)) + if (!is.null(breaks) && length(breaks) > 0) { + ## get just the starting point and remove strand + br = gr.stripstrand(gr.start(breaks)) - if (verbose) { - message("Number of new breaks: ", length(br)) - message("Creating new gGraph with incorporated breaks...") - } + if (verbose) { + message("Number of new breaks: ", length(br)) + message("Creating new gGraph with incorporated breaks...") + } - ## save old node IDs - old.nodes = gg$nodes$gr[, "node.id"] - old.nodes$unsplit.id = old.nodes$node.id + ## save old node IDs + old.nodes = gg$nodes$gr[, "node.id"] + old.nodes$unsplit.id = old.nodes$node.id - ## create new gGraph with additional breakpoints - new.nodes = gr.breaks(bps = br, query = gg$nodes$gr) - gg = gG(breaks = new.nodes, junctions = gg$junctions[type == "ALT"]) + ## create new gGraph with additional breakpoints + new.nodes = gr.breaks(bps = br, query = gg$nodes$gr) + gg = gG(breaks = new.nodes, junctions = gg$junctions[type == "ALT"]) - ## create node map to find which nodes were split up - node.map = as.data.table(gg$nodes$gr[, "node.id"] %$% old.nodes[, "unsplit.id"]) + ## create node map to find which nodes were split up + node.map = as.data.table(gg$nodes$gr[, "node.id"] %$% old.nodes[, "unsplit.id"]) - ## create a list of short nodes (CNLOH not allowed here!) - short.nodes = as.data.table(gg$nodes$gr)[width < min.width, node.id] + ## create a list of short nodes (CNLOH not allowed here!) + short.nodes = as.data.table(gg$nodes$gr)[width < min.width, node.id] - ## identify which REF edges are internal to an OG node - edge.map = gg$edges$dt - edge.map[, n1.unsplit := node.map$unsplit.id[match(n1, node.map$node.id)]] - edge.map[, n2.unsplit := node.map$unsplit.id[match(n2, node.map$node.id)]] + ## identify which REF edges are internal to an OG node + edge.map = gg$edges$dt + edge.map[, n1.unsplit := node.map$unsplit.id[match(n1, node.map$node.id)]] + edge.map[, n2.unsplit := node.map$unsplit.id[match(n2, node.map$node.id)]] - internal.edges = edge.map[type == "REF" & - n1.unsplit == n2.unsplit & - !(n1 %in% short.nodes) & - !(n2 %in% short.nodes), edge.id] + internal.edges = edge.map[type == "REF" & + n1.unsplit == n2.unsplit & + !(n1 %in% short.nodes) & + !(n2 %in% short.nodes), edge.id] - ## mark CNLOH in gg - gg$edges$mark(cnloh = FALSE) - gg$edges[internal.edges]$mark(cnloh = TRUE) + ## mark CNLOH in gg + gg$edges$mark(cnloh = FALSE) + gg$edges[internal.edges]$mark(cnloh = TRUE) - if (verbose) { - message("Number of internal edges marked in parent graph: ", length(internal.edges)) - } - } else { - gg$edges$mark(cnloh = FALSE) + if (verbose) { + message("Number of internal edges marked in parent graph: ", length(internal.edges)) } + } else { + gg$edges$mark(cnloh = FALSE) + } + + if (verbose) { + message("Marking pseudo-CNLOH") + } + ## mark pseudo-CNLOH? + pseudo.cnloh = gg$edges$dt[, .(edge.id, type)] + pseudo.cnloh[, span := gg$junctions$span] + + pseudo.cnloh.edges = pseudo.cnloh[span < 1e6 & type == "ALT", edge.id] + gg$edges[pseudo.cnloh.edges]$mark(cnloh = TRUE) + + if (verbose) { + message("Number of pseudo-CNLOH edges marked:", length(pseudo.cnloh.edges)) } if (verbose) { @@ -2855,7 +2873,7 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL, ## mark CNLOH edges if (!is.null(gg$edges$dt$cnloh)) { - og.cnloh.edges = gg$edges$dt[cnloh == TRUE, edge.id] + og.cnloh.edges = gg$edges$dt[cnloh == TRUE & type == "REF", edge.id] ## identify CNLOH cross edges and add extra edges that are ALT phased.cnloh.edges = phased.gg.edges[og.edge.id %in% og.cnloh.edges & (n1.allele != n2.allele),] @@ -2867,13 +2885,17 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL, if (verbose) { message("Number of CNLOH ALT edges added: ", phased.cnloh.edges[, .N]) } - } - ## OLD: CNLOH was REF - ## if (!is.null(gg$edges$dt$cnloh)) { - ## og.cnloh.edges = gg$edges$dt[cnloh == TRUE, edge.id] - ## phased.gg.edges[og.edge.id %in% og.cnloh.edges, cnloh := TRUE] - ## } + ## mark pseudo-cnloh edges in child graph + og.pseudo.cnloh.edges = gg$edges$dt[cnloh == TRUE & type == "ALT", edge.id] + phased.gg.edges[og.edge.id %in% og.pseudo.cnloh.edges & (n1.allele != n2.allele), cnloh := TRUE] + + if (verbose) { + message("Number of pseudo-CNLOH ALT edges marked: ", + phased.gg.edges[og.edge.id %in% og.pseudo.cnloh.edges & (n1.allele != n2.allele), .N]) + } + + } ## identify phased edges (for linked reads) if (!is.null(edge.phase.dt)) { From d47a83d99421ed64742e514f52b8fe91835a45e5 Mon Sep 17 00:00:00 2001 From: Zi-Ning Choo Date: Tue, 20 Jul 2021 10:14:08 -0400 Subject: [PATCH 23/35] fix no ALT edges CNLOH bug --- R/apps.R | 51 +++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 8 deletions(-) diff --git a/R/apps.R b/R/apps.R index f98dde42..b1db52f9 100644 --- a/R/apps.R +++ b/R/apps.R @@ -2694,6 +2694,7 @@ phased.postprocess = function(gg, phase.blocks = NULL, mc.cores = 8, verbose = 1 #' @param vbase.prop.thres (float) proportion of allele excess required to phase edges (default 0.9) #' @param min.bins (numeric) minimum number of bins for intra segment variance (default 3) #' @param min.var (numeric) min allowable variance (default 0.1) +#' @param max.span (numeric) max span before penalizing CNLOH #' @param verbose (bool) default TRUE for debugging #' @param min.width (numeric) min allowable width for cnloh-adjacent node. default 1 Mbp #' @param mc.cores (int) number of cores @@ -2708,6 +2709,7 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL, edge.phase.dt = NULL, vbase.count.thres = 5, vbase.prop.thres = 0.9, min.bins = 3, min.var = 1e-3, + max.span = 1e6, verbose = TRUE, min.width = 1e6, mc.cores = 8) { if (verbose) { @@ -2824,17 +2826,50 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL, if (verbose) { message("Marking pseudo-CNLOH") } - ## mark pseudo-CNLOH? - pseudo.cnloh = gg$edges$dt[, .(edge.id, type)] - pseudo.cnloh[, span := gg$junctions$span] - pseudo.cnloh.edges = pseudo.cnloh[span < 1e6 & type == "ALT", edge.id] - gg$edges[pseudo.cnloh.edges]$mark(cnloh = TRUE) + ## pull intra-chromosomal ALT edges, because these are cnloh candidates + pseudo.cnloh.junctions = gg$junctions[class %in% c("DEL-like", "INV-like", "DUP-like")] + pseudo.cnloh.junctions.dt = pseudo.cnloh.junctions$dt + + if (nrow(pseudo.cnloh.junctions.dt)) { + + ## grab span - limit to below max.span + pseudo.cnloh.junctions.dt[, span := pseudo.cnloh.junctions$span] + + ## compute node overlap with shadow + node.overlap = pseudo.cnloh.junctions$shadow %N% gg$nodes$gr + pseudo.cnloh.junctions.dt[, node.overlap.count := node.overlap] + + ## mark candidates + pseudo.cnloh.edges = c(pseudo.cnloh.junctions.dt[span < max.span & + class == "DEL-like" & + node.overlap.count <= 3, edge.id], + pseudo.cnloh.junctions.dt[span < max.span & + class == "INV-like" & + node.overlap.count <= 2, edge.id], + pseudo.cnloh.junctions.dt[span < max.span & + class == "DUP-like" & + node.overlap.count <= 1, edge.id]) + + ## pseudo.cnloh.edges = pseudo.cnloh[span < max.span & type == "ALT", edge.id] + gg$edges[pseudo.cnloh.edges]$mark(cnloh = TRUE) + + if (verbose) { + message("Number of pseudo-CNLOH edges marked:", length(pseudo.cnloh.edges)) + } + + } else { + + gg$edges$mark(cnloh = FALSE) + + if (verbose) { + message("No pseudo-CNLOH edges detected.") + } - if (verbose) { - message("Number of pseudo-CNLOH edges marked:", length(pseudo.cnloh.edges)) } + + if (verbose) { message("Preparing phased gGraph nodes") } @@ -3016,7 +3051,7 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL, message("Number of CNLOH ALT edges added: ", phased.cnloh.edges[, .N]) } - ## mark pseudo-cnloh edges in child graph + ## mark pseudo-cnloh edges in child graph and make sure these are all zero og.pseudo.cnloh.edges = gg$edges$dt[cnloh == TRUE & type == "ALT", edge.id] phased.gg.edges[og.edge.id %in% og.pseudo.cnloh.edges & (n1.allele != n2.allele), cnloh := TRUE] From 0122c0f6abb50ededde4da78cb8831e02b89dea7 Mon Sep 17 00:00:00 2001 From: Zi-Ning Choo Date: Mon, 26 Jul 2021 11:03:45 -0400 Subject: [PATCH 24/35] start new postprocessing fx --- R/apps.R | 46 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/R/apps.R b/R/apps.R index bccfd51f..cf13c61b 100644 --- a/R/apps.R +++ b/R/apps.R @@ -964,8 +964,9 @@ balance = function(gg, ) ## filter constraints to only include things with >= 4 entries (e.g. must have an ALT edge) - rhomol.constraints[, n.entries := .N, by = cid] - rhomol.constraints = rhomol.constraints[n.entries > 3, .(value, id, cid)] + ## rhomol.constraints[, n.entries := .N, by = cid] + ## remove this filter! due to some loose end violations! + ## rhomol.constraints = rhomol.constraints[n.entries > 3, .(value, id, cid)] rhs = unique(rhomol.constraints[, .(value = 2, sense = "L", cid)], by = "cid") @@ -2423,6 +2424,40 @@ binstats = function(gg, bins, by = NULL, field = NULL, purity = gg$meta$purity, return(gg) } +#' @name find_na_ranges +#' @title find_na_ranges +#' +#' @description +#' +#' Identify NA ranges in a phased gGraph that actually cannot be phased +#' +#' @param gg (gGraph) junction-balanced allelic graph +#' @param min.bins (numeric) default 1 +#' @param verbose (logical) default FALSE +#' +#' @return GRanges representing NA ranges that phased graph can be disjoined against in phased.postprocess +find_na_ranges = function(gg, min.bins = 1, verbose = FALSE) { + + if (!inherits(gg, "gGraph")) { + stop("gg is not gGraph") + } + + if (is.null(gg$nodes$dt$nbins)) { + stop("gg nodes must have metadata $nbins") + } + + gg.nodes.gr = gg$nodes$gr[, c("nbins", "node.id")] + gg.nodes.gr$na.node = is.na(gg.nodes.gr$nbins) | gg.nodes.gr$nbins < min.bins + + ## grab left and right endpoints + browser() + na.nodes.gr = gg.nodes.gr %Q% (na.node == TRUE) + na.nodes.gr = gr.reduce(gg.nodes.gr, by = "na.node") + + +} + + #' @name phased.postprocess #' @title phased.postprocess #' @description @@ -2430,13 +2465,16 @@ binstats = function(gg, bins, by = NULL, field = NULL, purity = gg$meta$purity, #' Postprocess junction-balanced phased graph and creates unphased regions #' This identifies regions without allelic CN imbalance #' -#' @param gg junction-balanced phased gGraph. each node must have associated og node.id +#' @param gg junction-balanced phased gGraph. each node must have metadata og.nodes.id, allele, nbins, cn +#' @param min.bins (numeric) minimum number of bins to be marked as an NA node. default 1 #' @param phase.blocks (GRanges) granges of phase blocks from linked reads. default = NULL #' @param mc.cores (int) number of cores. default = 8. #' @param verbose (bool) verbose > 0 prints stuff. default 1. #' +#' @return balanced gGraph with unphased nodes marked and compressed +#' #' @export -phased.postprocess = function(gg, phase.blocks = NULL, mc.cores = 8, verbose = 1) +phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores = 8, verbose = 1) { ## check that gg nodes and edges have og node if (!("og.node.id" %in% colnames(gg$nodes$dt)) | !("og.edge.id" %in% colnames(gg$edges$dt))) { From ca025417bdcb38b6ee9f82d6f7e7d84c955ce839 Mon Sep 17 00:00:00 2001 From: Zi-Ning Choo Date: Mon, 26 Jul 2021 17:03:57 -0400 Subject: [PATCH 25/35] bug fixes in postprocess --- R/apps.R | 144 +++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 118 insertions(+), 26 deletions(-) diff --git a/R/apps.R b/R/apps.R index cf13c61b..1c249684 100644 --- a/R/apps.R +++ b/R/apps.R @@ -2446,18 +2446,97 @@ find_na_ranges = function(gg, min.bins = 1, verbose = FALSE) { stop("gg nodes must have metadata $nbins") } - gg.nodes.gr = gg$nodes$gr[, c("nbins", "node.id")] + gg.nodes.gr = gg$nodes$gr[, c("nbins")] gg.nodes.gr$na.node = is.na(gg.nodes.gr$nbins) | gg.nodes.gr$nbins < min.bins - ## grab left and right endpoints - browser() + ## reduce NA nodes na.nodes.gr = gg.nodes.gr %Q% (na.node == TRUE) - na.nodes.gr = gr.reduce(gg.nodes.gr, by = "na.node") + ## if there are magically not any of these, return + if (length(na.nodes.gr) == 0) { + return(GRanges()) + } + + na.nodes.gr = gr.reduce(na.nodes.gr, by = "na.node") + + ## grab left and right endpoints (use OG node id) + na.nodes.gr$left.node.id = gr.findoverlaps(gr.start(na.nodes.gr), gg$nodes$gr[, "og.node.id"], first = TRUE, scol = "og.node.id")$og.node.id + na.nodes.gr$right.node.id = gr.findoverlaps(gr.end(na.nodes.gr), gg$nodes$gr[, "og.node.id"], first = TRUE, scol = "og.node.id")$og.node.id + + ## check if these ranges are flanked by a rearrangement + og.loose.dt = gg$nodes$dt[, .(loose.left = any(loose.left == TRUE, na.rm = TRUE), loose.right = any(loose.right == TRUE, na.rm = TRUE)), by = og.node.id] + og.alt.dt = rbind(gg$edges$dt[type == "ALT", .(og.node.id = gg$nodes$dt$og.node.id[n1], side = n1.side)], gg$edges$dt[type == "ALT", .(og.node.id = gg$nodes$dt$og.node.id[n2], side = n2.side)])[, .(alt.left = any(side == "left", na.rm = TRUE), alt.right = any(side == "right", na.rm = TRUE)), by = og.node.id] + + ## annotate loose and ALTs + na.nodes.gr$loose.left = na.nodes.gr$left.node.id %in% og.loose.dt[loose.left == TRUE, og.node.id] + na.nodes.gr$loose.right = na.nodes.gr$right.node.id %in% og.loose.dt[loose.right == TRUE, og.node.id] + na.nodes.gr$alt.left = na.nodes.gr$left.node.id %in% og.alt.dt[alt.left == TRUE, og.node.id] + na.nodes.gr$alt.right = na.nodes.gr$right.node.id %in% og.alt.dt[alt.right == TRUE, og.node.id] + + ## next check if ranges are neighboring a rearrangement + ## create a data table with left and right neighbor of every og node + neighbors.dt = unique(gg$edges$dt[type == "REF", .(left.neighbor = gg$nodes$dt$og.node.id[n1], right.neighbor = gg$nodes$dt$og.node.id[n2])], by = "left.neighbor") + + ## annotate with og.node.id of left and right neighbor + na.nodes.gr$left.neighbor = neighbors.dt$left.neighbor[match(na.nodes.gr$left.node.id, neighbors.dt$right.neighbor)] + na.nodes.gr$right.neighbor = neighbors.dt$right.neighbor[match(na.nodes.gr$right.node.id, neighbors.dt$left.neighbor)] + + ## check if the neighbors of each node are flanked by a loose end + na.nodes.gr$loose.left.neighbor = na.nodes.gr$left.neighbor %in% og.loose.dt[loose.right == TRUE, og.node.id] + na.nodes.gr$loose.right.neighbor = na.nodes.gr$right.neighbor %in% og.loose.dt[loose.left == TRUE, og.node.id] + + ## check if the neighbors of each node are flanked by an ALT edge + na.nodes.gr$alt.left.neighbor = na.nodes.gr$left.neighbor %in% og.alt.dt[alt.right == TRUE, og.node.id] + na.nodes.gr$alt.right.neighbor = na.nodes.gr$right.neighbor %in% og.alt.dt[alt.left == TRUE, og.node.id] + + ## get marginal copy number of left and right neighbor + marginal.cn.dt = gg$nodes$dt[allele == "major", .(og.node.id, marginal.cn, major.cn = cn)] + na.nodes.gr$left.neighbor.marginal.cn = marginal.cn.dt$marginal.cn[match(na.nodes.gr$left.neighbor, marginal.cn.dt$og.node.id)] + na.nodes.gr$right.neighbor.marginal.cn = marginal.cn.dt$marginal.cn[match(na.nodes.gr$right.neighbor, marginal.cn.dt$og.node.id)] + na.nodes.gr$left.neighbor.major.cn = marginal.cn.dt$major.cn[match(na.nodes.gr$left.neighbor, marginal.cn.dt$og.node.id)] + na.nodes.gr$right.neighbor.major.cn = marginal.cn.dt$major.cn[match(na.nodes.gr$right.neighbor, marginal.cn.dt$og.node.id)] + + ## get major and marginal copy number of left and right endpoints + na.nodes.gr$left.marginal.cn = marginal.cn.dt$marginal.cn[match(na.nodes.gr$left.node.id, marginal.cn.dt$og.node.id)] + na.nodes.gr$right.marginal.cn = marginal.cn.dt$marginal.cn[match(na.nodes.gr$right.node.id, marginal.cn.dt$og.node.id)] + na.nodes.gr$left.major.cn = marginal.cn.dt$major.cn[match(na.nodes.gr$left.node.id, marginal.cn.dt$og.node.id)] + na.nodes.gr$right.major.cn = marginal.cn.dt$major.cn[match(na.nodes.gr$right.node.id, marginal.cn.dt$og.node.id)] + + ## easier manipulation + na.nodes.dt = as.data.table(na.nodes.gr) + + ## filter by marginal CN (e.g. if left and right neighbors have to have the same marginal) + na.nodes.dt = na.nodes.dt[(left.neighbor.marginal.cn == right.neighbor.marginal.cn) | is.na(left.marginal.cn) | is.na(right.marginal.cn),] + + if (!nrow(na.nodes.dt)) { + return(GRanges()) + } + + ## remove LOH ranges + na.nodes.dt = na.nodes.dt[(left.neighbor.marginal.cn != left.neighbor.major.cn) | + (left.marginal.cn != left.major.cn) | + (right.neighbor.marginal.cn != right.neighbor.major.cn) | + (right.marginal.cn != right.major.cn),] + if (!nrow(na.nodes.dt)) { + return(GRanges()) + } + + ## check if left/right are telomeric + na.nodes.dt[, left.telomeric := (start == 1)] + na.nodes.dt[, sl := seqlengths(gg$nodes$gr)[match(seqnames, names(seqlengths(gg$nodes$gr)))]] + na.nodes.dt[, right.telomeric := (end == sl)] + + ## resize + na.nodes.dt[right.telomeric == FALSE & (alt.right.neighbor == TRUE | loose.right.neighbor == TRUE), end := end + 1] + na.nodes.dt[left.telomeric == FALSE & (alt.left.neighbor == TRUE | loose.left.neighbor == TRUE), start := start - 1] + + ## filter by major CN (e.g. if left and right neighbors have LOH) + return(dt2gr(na.nodes.dt[, .(seqnames, start, end)], seqlengths = seqlengths(gg$nodes$gr))) } + #' @name phased.postprocess #' @title phased.postprocess #' @description @@ -2559,10 +2638,17 @@ phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores = seed.dt[right.alt == TRUE & width > 1, end := end - 1] ## create GRanges - seed.gr = dt2gr(seed.dt[, .(seqnames, start, end)]) + seed.gr = dt2gr(seed.dt[, .(seqnames, start, end)], seqlengths = seqlengths(gg$nodes$gr), seqinfo = seqinfo(gg$nodes$gr)) + + ## merge with NA ranges + if (verbose) { + message("Identifying NA ranges") + } + na.gr = find_na_ranges(gg, min.bins = min.bins) + all.seed.gr = gr.reduce(c(seed.gr, na.gr)) ## disjoin gGraph against this GRanges - gg = gg$disjoin(seed.gr, collapse = FALSE) + gg = gg$disjoin(all.seed.gr, collapse = FALSE) ## any new edges introduced have to be straight gg$edges[is.na(connection)]$mark(connection = "straight") @@ -2602,13 +2688,24 @@ phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores = if (verbose) { message("Identifying nodes without CN imbalance") } - og.node.balance = gg$nodes$dt[width > 1, .(og.node.id, allele, cn)] %>% ## filter by width to keep 1bp stubs - dcast.data.table(og.node.id ~ allele, value.var = "cn") + ## browser() + og.node.balance = dcast.data.table(gg$nodes$dt[, .(og.node.id, allele, cn)], og.node.id ~ allele, value.var = "cn") %>% merge.data.table(gg$nodes$dt[, .(og.node.id, width)], by = "og.node.id", all.x = TRUE) og.node.balance[, cn.imbalance := (major != minor)] og.node.balance[, cn.total := (major + minor)] - og.node.balance[, phased := ifelse(cn.imbalance == TRUE | cn.total == 0, TRUE, FALSE)] + og.node.balance[, phased := ifelse(cn.imbalance == TRUE & width > 1, TRUE, FALSE)] + + if (verbose) { + message("Identifying nodes in NA stretches") + } + ## browser() + na.node.ids = as.data.table(gg$nodes$gr[, c("og.node.id", "node.id")] %&% na.gr)[, og.node.id] + + ## mark na nodes as unphased + og.node.balance[og.node.id %in% na.node.ids, phased := FALSE] + + ## annotate if (verbose) { message("Number of potentially unphased nodes: ", sum(og.node.balance$phased == FALSE)) @@ -2649,15 +2746,11 @@ phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores = new.nodes.dt[node.id %in% unphased.major.nodes, allele := "unphased"] ## reset CN to total CN - new.nodes.dt[node.id %in% unphased.major.nodes, - cn := og.node.balance$cn.total[match(og.node.id, og.node.balance$og.node.id)]] + ## browser() + new.nodes.dt[node.id %in% unphased.major.nodes, cn := og.node.balance$cn.total[match(og.node.id, og.node.balance$og.node.id)]] ## fix the CN of all of these nodes - new.nodes.dt[, fix := 1] - - ## create new data.table for edges - new.edges.dt = gg$edges$dt[!(n1 %in% unphased.minor.nodes) | !(n2 %in% unphased.minor.nodes),] - + ## new.nodes.dt[, fix := 1] ## reformat nodes new.nodes.dt[allele == "unphased", col := alpha("gray", 0.5)] @@ -2670,17 +2763,19 @@ phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores = var, nbins, weight, index, col, cn.old, cn, fix, ywid, old.node.id = node.id)], - seqinfo = seqinfo(gg$nodes$gr)) - + seqinfo = seqinfo(gg$nodes$gr), + seqlengths = seqlengths(gg$nodes$gr)) + ## create new data.table for edges + new.edges.dt = gg$edges$dt + ## reset edge endpoints - dt = gg$nodes$dt[og.node.id %in% unphased.og.nodes, .(og.node.id, allele, node.id)] %>% - dcast.data.table(og.node.id ~ allele, value.var = "node.id") + ## browser() + dt = gg$nodes$dt[, .(og.node.id, allele, node.id)] %>% dcast.data.table(og.node.id ~ allele, value.var = "node.id") new.edges.dt[(n1 %in% unphased.minor.nodes), n1 := dt$major[match(n1, dt$minor)]] new.edges.dt[(n2 %in% unphased.minor.nodes), n2 := dt$major[match(n2, dt$minor)]] ## reset all edge endpoints to new node.ids - new.edges.dt[, n1 := match(n1, new.nodes.gr$old.node.id)] new.edges.dt[, n2 := match(n2, new.nodes.gr$old.node.id)] @@ -2694,17 +2789,14 @@ phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores = new.edges.dt = new.edges.dt[(type == "REF" & connection == orientation) | (type == "ALT" & cn > 0),] - ## remove edge CN and fix - if ("fix" %in% colnames(new.edges.dt)) { - new.edges.dt$fix = NULL - } + ## deduplicate edges + new.edges.dt = new.edges.dt[, .(connection = connection[1], type = type[1], cn = sum(cn, na.rm = TRUE), col = col[1]), by = .(n1, n1.side, n2, n2.side)] if (verbose) { message("Creating new gGraph") } new.nodes.gr = gGnome:::inferLoose(new.nodes.gr, new.edges.dt) - postprocessed.gg = gG(nodes = new.nodes.gr, edges = new.edges.dt) postprocessed.gg$set(y.field = "cn") return(postprocessed.gg) From 356b4532939ea8cc98f168b6c34091f288cfe140 Mon Sep 17 00:00:00 2001 From: Zi-Ning Choo Date: Thu, 29 Jul 2021 11:15:38 -0400 Subject: [PATCH 26/35] stash changes to apps.R --- R/apps.R | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/R/apps.R b/R/apps.R index 1c249684..e367fda9 100644 --- a/R/apps.R +++ b/R/apps.R @@ -2696,6 +2696,9 @@ phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores = og.node.balance[, phased := ifelse(cn.imbalance == TRUE & width > 1, TRUE, FALSE)] + ## mark these specifically as being allele-balanced + og.node.balance[, ab := phased == FALSE] + if (verbose) { message("Identifying nodes in NA stretches") } @@ -2739,14 +2742,20 @@ phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores = unphased.minor.nodes = gg$nodes$dt[og.node.id %in% unphased.og.nodes & allele == "minor", node.id] unphased.major.nodes = gg$nodes$dt[og.node.id %in% unphased.og.nodes & allele == "major", node.id] + ## identify specifically allele-balanced nodes (vs. NA nodes) + ab.og.nodes = og.node.balance[ab == TRUE, og.node.id] + ab.major.nodes = gg$nodes$dt[og.node.id %in% ab.og.nodes & allele == "major", node.id] + ## create new data.table for nodes new.nodes.dt = gg$nodes$dt[!(node.id %in% unphased.minor.nodes),] ## mark major nodes as unphased new.nodes.dt[node.id %in% unphased.major.nodes, allele := "unphased"] + ## mark allele-balanced nodes specifically + new.nodes.dt[node.id %in% ab.major.nodes, ab := TRUE] + ## reset CN to total CN - ## browser() new.nodes.dt[node.id %in% unphased.major.nodes, cn := og.node.balance$cn.total[match(og.node.id, og.node.balance$og.node.id)]] ## fix the CN of all of these nodes @@ -2761,7 +2770,7 @@ phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores = new.nodes.gr = dt2gr(new.nodes.dt[, .(seqnames, start, end, og.node.id, marginal.cn, allele, var, nbins, weight, index, col, - cn.old, cn, fix, ywid, + cn.old, cn, fix, ywid, ab, old.node.id = node.id)], seqinfo = seqinfo(gg$nodes$gr), seqlengths = seqlengths(gg$nodes$gr)) From ff49d3e92adf3097970fc17838c393cd43753b57 Mon Sep 17 00:00:00 2001 From: Zi-Ning Choo Date: Wed, 4 Aug 2021 14:32:20 -0400 Subject: [PATCH 27/35] bug fix --- R/apps.R | 29 +++++++++++++++++------------ R/eventCallers.R | 6 +++++- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/R/apps.R b/R/apps.R index 59754720..82fbdd32 100644 --- a/R/apps.R +++ b/R/apps.R @@ -935,21 +935,21 @@ balance = function(gg, cid = paste("rhee", sedge.id))], ## actual ALT edges - vars[type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(s1), + vars[(!cnloh == TRUE) & type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(s1), .(value = 1, id, cid = paste("rhee", s1))], - vars[type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(s2), + vars[(!cnloh == TRUE) & type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(s2), .(value = 1, id, cid = paste("rhee", s2))], - vars[type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(s3), + vars[(!cnloh == TRUE) & type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(s3), .(value = 1, id, cid = paste("rhee", s3))], - vars[type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(s4), + vars[(!cnloh == TRUE) & type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(s4), .(value = 1, id, cid = paste("rhee", s4))], - vars[type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(c1), + vars[(!cnloh == TRUE) & type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(c1), .(value = 1, id, cid = paste("rhee", c1))], - vars[type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(c2), + vars[(!cnloh == TRUE) & type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(c2), .(value = 1, id, cid = paste("rhee", c2))], - vars[type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(c3), + vars[(!cnloh == TRUE) & type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(c3), .(value = 1, id, cid = paste("rhee", c3))], - vars[type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(c4), + vars[(!cnloh == TRUE) & type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(c4), .(value = 1, id, cid = paste("rhee", c4))], ## loose indicators @@ -1422,6 +1422,11 @@ balance = function(gg, message("Number of penalized CNLOH edges: ", length(indices)) } } + + ## check constraints of CNLOH + ## browser() + ## vars[type == "edge.indicator" & cnloh == TRUE] + ## vars[type == "edge.indicator" & cnloh == TRUE, .N, by = og.edge.id] lb = vars$lb @@ -1505,16 +1510,16 @@ balance = function(gg, message("formatting phased graph...") } ## edge formatting - ref.edge.col = alpha("blue", 0.5) - alt.edge.col = alpha("red", 0.5) - ref.edge.lwd = 1.0 + ref.edge.col = alpha("blue", 0.2) + alt.edge.col = alpha("red", 0.4) + ref.edge.lwd = 0.5 alt.edge.lwd = 1.0 edge.col = ifelse(gg$edges$dt$type == "REF", ref.edge.col, alt.edge.col) edge.lwd = ifelse(gg$edges$dt$type == "REF", ref.edge.lwd, alt.edge.lwd) gg$edges$mark(col = edge.col, lwd = edge.lwd) ## mark zero cn edges - zero.cn.col = alpha("gray", 0.1) + zero.cn.col = alpha("gray", 0) zero.cn.lwd = 0.5 zero.cn.edges = which(gg$edges$dt$cn == 0) gg$edges[zero.cn.edges]$mark(col = zero.cn.col, lwd = zero.cn.lwd) diff --git a/R/eventCallers.R b/R/eventCallers.R index 116be297..06c9fda4 100755 --- a/R/eventCallers.R +++ b/R/eventCallers.R @@ -2513,7 +2513,7 @@ amp = function(gg, jcn.thresh = 8, cn.thresh = 2, fbi.cn.thresh = 0.5, n.jun.hi ploidy = gg$nodes$dt[!is.na(cn), sum(cn*as.numeric(width))/sum(as.numeric(width))] keep = (gg$nodes$dt$cn/ploidy) > cn.thresh gg$clusters(keep) - if (!any(!is.na(gg$nodes$dt$cluster))) + return(gg) tiny = gg$edges$mark(tiny = gg$edges$dt$class %in% c('DEL-like', 'DUP-like') & gg$edges$span <1e4) @@ -2535,6 +2535,10 @@ amp = function(gg, jcn.thresh = 8, cn.thresh = 2, fbi.cn.thresh = 0.5, n.jun.hi n.jun = length(cl.edges), n.jun.high = sum(cl.edges$dt[, sum(cn > 3)]), max.jcn = max(c(0, cl.edges$dt$cn)), + max.loose.cn = max(c(0, + cl.nodes$dt[, loose.cn.left], + cl.nodes$dt[, loose.cn.right]), + na.rm = TRUE), max.cn = max(cl.nodes$dt$cn), footprint = paste(gr.string(cl.nodes$footprint), collapse = ",")) From e216f0c06140737c070231b0ce0415435fdb846b Mon Sep 17 00:00:00 2001 From: Zi-Ning Choo Date: Thu, 12 Aug 2021 23:40:10 -0400 Subject: [PATCH 28/35] latest with pre-balance collapsed nodes --- R/apps.R | 316 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 257 insertions(+), 59 deletions(-) diff --git a/R/apps.R b/R/apps.R index e209c98f..10b0359e 100644 --- a/R/apps.R +++ b/R/apps.R @@ -2438,10 +2438,11 @@ binstats = function(gg, bins, by = NULL, field = NULL, purity = gg$meta$purity, #' #' @param gg (gGraph) junction-balanced allelic graph #' @param min.bins (numeric) default 1 +#' @param min.width (exclude) very short ranges (default 5e3) #' @param verbose (logical) default FALSE #' #' @return GRanges representing NA ranges that phased graph can be disjoined against in phased.postprocess -find_na_ranges = function(gg, min.bins = 1, verbose = FALSE) { +find_na_ranges = function(gg, min.bins = 1, min.width = 5e3, verbose = FALSE) { if (!inherits(gg, "gGraph")) { stop("gg is not gGraph") @@ -2452,10 +2453,11 @@ find_na_ranges = function(gg, min.bins = 1, verbose = FALSE) { } gg.nodes.gr = gg$nodes$gr[, c("nbins")] - gg.nodes.gr$na.node = is.na(gg.nodes.gr$nbins) | gg.nodes.gr$nbins < min.bins + gg.nodes.gr$na.node = (is.na(gg.nodes.gr$nbins) | gg.nodes.gr$nbins < min.bins) ## reduce NA nodes na.nodes.gr = gg.nodes.gr %Q% (na.node == TRUE) + ## browser() ## if there are magically not any of these, return if (length(na.nodes.gr) == 0) { @@ -2464,6 +2466,15 @@ find_na_ranges = function(gg, min.bins = 1, verbose = FALSE) { na.nodes.gr = gr.reduce(na.nodes.gr, by = "na.node") + ## na.nodes.gr = gr.val(na.nodes.gr, target = gg$nodes$gr[, "nbins"], + ## val = "nbins", + ## weighted = FALSE, + ## FUN = sum, + ## na.rm = TRUE) + + ## only keep relatively large NA regions... for isolated small ROH we may not want to do this. + na.nodes.gr = na.nodes.gr %Q% (width(na.nodes.gr) > min.width) + ## grab left and right endpoints (use OG node id) na.nodes.gr$left.node.id = gr.findoverlaps(gr.start(na.nodes.gr), gg$nodes$gr[, "og.node.id"], first = TRUE, scol = "og.node.id")$og.node.id na.nodes.gr$right.node.id = gr.findoverlaps(gr.end(na.nodes.gr), gg$nodes$gr[, "og.node.id"], first = TRUE, scol = "og.node.id")$og.node.id @@ -2494,36 +2505,36 @@ find_na_ranges = function(gg, min.bins = 1, verbose = FALSE) { na.nodes.gr$alt.left.neighbor = na.nodes.gr$left.neighbor %in% og.alt.dt[alt.right == TRUE, og.node.id] na.nodes.gr$alt.right.neighbor = na.nodes.gr$right.neighbor %in% og.alt.dt[alt.left == TRUE, og.node.id] - ## get marginal copy number of left and right neighbor - marginal.cn.dt = gg$nodes$dt[allele == "major", .(og.node.id, marginal.cn, major.cn = cn)] - na.nodes.gr$left.neighbor.marginal.cn = marginal.cn.dt$marginal.cn[match(na.nodes.gr$left.neighbor, marginal.cn.dt$og.node.id)] - na.nodes.gr$right.neighbor.marginal.cn = marginal.cn.dt$marginal.cn[match(na.nodes.gr$right.neighbor, marginal.cn.dt$og.node.id)] - na.nodes.gr$left.neighbor.major.cn = marginal.cn.dt$major.cn[match(na.nodes.gr$left.neighbor, marginal.cn.dt$og.node.id)] - na.nodes.gr$right.neighbor.major.cn = marginal.cn.dt$major.cn[match(na.nodes.gr$right.neighbor, marginal.cn.dt$og.node.id)] + ## ## get marginal copy number of left and right neighbor + ## marginal.cn.dt = gg$nodes$dt[allele == "major", .(og.node.id, marginal.cn, major.cn = cn)] + ## na.nodes.gr$left.neighbor.marginal.cn = marginal.cn.dt$marginal.cn[match(na.nodes.gr$left.neighbor, marginal.cn.dt$og.node.id)] + ## na.nodes.gr$right.neighbor.marginal.cn = marginal.cn.dt$marginal.cn[match(na.nodes.gr$right.neighbor, marginal.cn.dt$og.node.id)] + ## na.nodes.gr$left.neighbor.major.cn = marginal.cn.dt$major.cn[match(na.nodes.gr$left.neighbor, marginal.cn.dt$og.node.id)] + ## na.nodes.gr$right.neighbor.major.cn = marginal.cn.dt$major.cn[match(na.nodes.gr$right.neighbor, marginal.cn.dt$og.node.id)] - ## get major and marginal copy number of left and right endpoints - na.nodes.gr$left.marginal.cn = marginal.cn.dt$marginal.cn[match(na.nodes.gr$left.node.id, marginal.cn.dt$og.node.id)] - na.nodes.gr$right.marginal.cn = marginal.cn.dt$marginal.cn[match(na.nodes.gr$right.node.id, marginal.cn.dt$og.node.id)] - na.nodes.gr$left.major.cn = marginal.cn.dt$major.cn[match(na.nodes.gr$left.node.id, marginal.cn.dt$og.node.id)] - na.nodes.gr$right.major.cn = marginal.cn.dt$major.cn[match(na.nodes.gr$right.node.id, marginal.cn.dt$og.node.id)] + ## ## get major and marginal copy number of left and right endpoints + ## na.nodes.gr$left.marginal.cn = marginal.cn.dt$marginal.cn[match(na.nodes.gr$left.node.id, marginal.cn.dt$og.node.id)] + ## na.nodes.gr$right.marginal.cn = marginal.cn.dt$marginal.cn[match(na.nodes.gr$right.node.id, marginal.cn.dt$og.node.id)] + ## na.nodes.gr$left.major.cn = marginal.cn.dt$major.cn[match(na.nodes.gr$left.node.id, marginal.cn.dt$og.node.id)] + ## na.nodes.gr$right.major.cn = marginal.cn.dt$major.cn[match(na.nodes.gr$right.node.id, marginal.cn.dt$og.node.id)] - ## easier manipulation + ## ## easier manipulation na.nodes.dt = as.data.table(na.nodes.gr) - ## filter by marginal CN (e.g. if left and right neighbors have to have the same marginal) - #' zchoo Friday, Jul 30, 2021 11:28:55 AM - ## removed this filter - ## na.nodes.dt = na.nodes.dt[(left.neighbor.marginal.cn == right.neighbor.marginal.cn) | is.na(left.marginal.cn) | is.na(right.marginal.cn),] + ## ## filter by marginal CN (e.g. if left and right neighbors have to have the same marginal) + ## #' zchoo Friday, Jul 30, 2021 11:28:55 AM + ## ## removed this filter + ## ## na.nodes.dt = na.nodes.dt[(left.neighbor.marginal.cn == right.neighbor.marginal.cn) | is.na(left.marginal.cn) | is.na(right.marginal.cn),] - if (!nrow(na.nodes.dt)) { - return(GRanges()) - } + ## if (!nrow(na.nodes.dt)) { + ## return(GRanges()) + ## } - ## remove LOH ranges - na.nodes.dt = na.nodes.dt[(left.neighbor.marginal.cn != left.neighbor.major.cn) | - (left.marginal.cn != left.major.cn) | - (right.neighbor.marginal.cn != right.neighbor.major.cn) | - (right.marginal.cn != right.major.cn),] + ## ## remove LOH ranges + ## na.nodes.dt = na.nodes.dt[(left.neighbor.marginal.cn != left.neighbor.major.cn) | + ## (left.marginal.cn != left.major.cn) | + ## (right.neighbor.marginal.cn != right.neighbor.major.cn) | + ## (right.marginal.cn != right.major.cn),] if (!nrow(na.nodes.dt)) { return(GRanges()) @@ -2535,14 +2546,144 @@ find_na_ranges = function(gg, min.bins = 1, verbose = FALSE) { na.nodes.dt[, right.telomeric := (end == sl)] ## resize - na.nodes.dt[right.telomeric == FALSE & (alt.right.neighbor == TRUE | loose.right.neighbor == TRUE), end := end + 1] - na.nodes.dt[left.telomeric == FALSE & (alt.left.neighbor == TRUE | loose.left.neighbor == TRUE), start := start - 1] + na.nodes.dt[right.telomeric == FALSE & (alt.right.neighbor == TRUE | loose.right.neighbor == TRUE), + end := end + 1] + na.nodes.dt[left.telomeric == FALSE & (alt.left.neighbor == TRUE | loose.left.neighbor == TRUE), + start := start - 1] ## filter by major CN (e.g. if left and right neighbors have LOH) return(dt2gr(na.nodes.dt[, .(seqnames, start, end)], seqlengths = seqlengths(gg$nodes$gr))) } +#' @name unphase_na_ranges +#' @title unphase_na_ranges +#' +#' @description +#' +#' collapse NA ranges before balance +#' +#' @param gg phased gGraph. each node must have metadata og.nodes.id, allele, nbins, cn, weight +#' @param min.bins (numeric) minimum number of bins to be marked as an NA node. default 1 +#' @param min.wdith (numeric) +#' @param phase.blocks (GRanges) ## NOT IMPLEMENTED YET +#' @param verbose (logical) verbose > 0 prints stuff. default 1. +#' +#' @return partially phased gGraph with NA ranges collapsed, and CN fixed to marginal. +unphase_na_ranges = function(gg, min.bins = 1, min.width = 5e3, phase.blocks = NULL, verbose = 1) +{ + ## make a copy of balanced graph to prevent mutation + if (verbose) { + message("Making a copy of input gGraph") + } + gg = gg$copy + + if (verbose) { + message("Disjoining input graph against unphased node GRanges") + } + + ## check whether there is an allelic CN change on either the left or right + ## merge with NA ranges + if (verbose) { + message("Identifying NA ranges") + } + na.gr = find_na_ranges(gg, min.bins = min.bins, min.width = min.width) + + ## disjoin gGraph against this GRanges + gg = gg$disjoin(na.gr, collapse = FALSE) + + ## any new edges introduced have to be straight + gg$edges[is.na(connection)]$mark(connection = "straight") + + ## fill in other metadata + n1 = gg$edges$dt[, n1] + n2 = gg$edges$dt[, n2] + + ## label n1/n2 allele and chromosome + gg$edges$mark(n1.allele = gg$nodes$dt$allele[match(n1, gg$nodes$dt$node.id)]) + gg$edges$mark(n2.allele = gg$nodes$dt$allele[match(n2, gg$nodes$dt$node.id)]) + gg$edges$mark(n1.chr = gg$nodes$dt$seqnames[match(n1, gg$nodes$dt$node.id)]) + gg$edges$mark(n2.chr = gg$nodes$dt$seqnames[match(n2, gg$nodes$dt$node.id)]) + ## reset og.node.ids and og.edge.ids + node.id.key = gg$nodes$dt[, .(seqnames, start, end, rg = paste0(seqnames, ":", start, "-", end), node.id)] + node.id.key[, rg := as.integer(factor(rg))] + gg$nodes$mark(og.node.id = node.id.key[, rg]) + + ## reset og edge ids + edge.id.key = gg$edges$dt[, .(n1, n1.side, n2, n2.side, type)] + edge.id.key[, ":="(n1.og = node.id.key$rg[match(n1, node.id.key$node.id)], + n2.og = node.id.key$rg[match(n2, node.id.key$node.id)])] + edge.id.key[, rg := paste(n1.og, n2.og, n1.side, n2.side, type)] + edge.id.key[, rg := as.integer(factor(rg))] + gg$edges$mark(og.edge.id = edge.id.key[, rg]) + + if (verbose) { + message("Identifying nodes in NA stretches") + } + ov = gg$nodes$gr[, c("og.node.id", "node.id", "allele")] %&% na.gr + ov.dt = as.data.table(ov)[, .(node.id, allele, og.node.id)] + + ## identify corresponding minor/major nodes + unphased.major.nodes = ov.dt[allele == "major", node.id] + unphased.minor.nodes = ov.dt[allele == "minor", node.id] + + ## create new data.table for nodes + new.nodes.dt = gg$nodes$dt[!(node.id %in% unphased.minor.nodes),] + + ## mark major nodes as unphased + new.nodes.dt[node.id %in% unphased.major.nodes, allele := "unphased"] + + ## reset CN to total CN + new.nodes.dt[node.id %in% unphased.major.nodes, cn := marginal.cn] + new.nodes.dt[node.id %in% unphased.major.nodes, fix := 1] + + ## reformat color + new.nodes.dt[allele == "unphased", col := alpha("gray", 0.5)] + + ## get nodes as GRanges + new.nodes.dt = new.nodes.dt %>% split(new.nodes.dt$seqnames) %>% + lapply(function(dt) {dt[order(start),]}) %>% rbindlist + new.nodes.gr = dt2gr(new.nodes.dt[, .(seqnames, start, end, + og.node.id, marginal.cn, allele, + var, nbins, weight, col, + cn, fix, old.node.id = node.id)], + seqinfo = seqinfo(gg$nodes$gr), + seqlengths = seqlengths(gg$nodes$gr)) + + ## create new data.table for edges + new.edges.dt = gg$edges$dt + + ## reset edge endpoints + ## browser() + dt = gg$nodes$dt[, .(og.node.id, allele, node.id)] %>% dcast.data.table(og.node.id ~ allele, value.var = "node.id") + new.edges.dt[(n1 %in% unphased.minor.nodes), n1 := dt$major[match(n1, dt$minor)]] + new.edges.dt[(n2 %in% unphased.minor.nodes), n2 := dt$major[match(n2, dt$minor)]] + ## check if any NA? + + ## reset all edge endpoints to new node.ids + ## browser() + new.edges.dt[, n1 := match(n1, new.nodes.gr$old.node.id)] + new.edges.dt[, n2 := match(n2, new.nodes.gr$old.node.id)] + + ## deduplicate edges + ## new.edges.dt = unique(new.edges.dt[!is.na(n1) & !is.na(n2)], by = c("n1", "n1.side", "n2", "n2.side")) + new.edges.dt = rbind(new.edges.dt[cnloh == TRUE & class == "REF",], + unique(new.edges.dt[is.na(cnloh) | cnloh == FALSE | class != "REF",], + by = c("n1", "n1.side", "n2", "n2.side"))) + + ## new.edges.dt[, tmp := paste(n1, n1.side, n2, n2.side)] + ## new.edges.dt[cnloh == TRUE,] + ## new.edges.dt[tmp == "169 right 170 left",] + ## new.edges.dt[!duplicated(tmp) & cnloh == TRUE,] + ## new.edges.dt$tmp = NULL + + if (verbose) { + message("Creating new gGraph") + } + + postprocessed.gg = gG(nodes = new.nodes.gr, edges = new.edges.dt) + return(postprocessed.gg) +} #' @name phased.postprocess #' @title phased.postprocess @@ -2641,6 +2782,7 @@ phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores = ## (major.right.cn == TRUE | minor.right.cn == TRUE) & (right.telomeric == FALSE)] ## shift the end points + ## browser() seed.dt[left.alt == TRUE & width > 1, start := start + 1] seed.dt[right.alt == TRUE & width > 1, end := end - 1] @@ -2651,11 +2793,17 @@ phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores = if (verbose) { message("Identifying NA ranges") } - na.gr = find_na_ranges(gg, min.bins = min.bins) - all.seed.gr = gr.reduce(c(seed.gr, na.gr)) + + if (min.bins) { + na.gr = find_na_ranges(gg, min.bins = min.bins) + all.seed.gr = gr.reduce(c(seed.gr, na.gr)) + } else { + na.gr = GRanges() + all.seed.gr = seed.gr + } ## disjoin gGraph against this GRanges - gg = gg$disjoin(all.seed.gr, collapse = FALSE) + gg = gg$copy$disjoin(all.seed.gr, collapse = FALSE) ## any new edges introduced have to be straight gg$edges[is.na(connection)]$mark(connection = "straight") @@ -2696,15 +2844,18 @@ phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores = message("Identifying nodes without CN imbalance") } ## browser() - og.node.balance = dcast.data.table(gg$nodes$dt[, .(og.node.id, allele, cn)], og.node.id ~ allele, value.var = "cn") %>% merge.data.table(gg$nodes$dt[, .(og.node.id, width)], by = "og.node.id", all.x = TRUE) + og.node.balance = dcast.data.table(gg$nodes$dt[allele %in% c("major", "minor"), + .(og.node.id, allele, cn)], + og.node.id ~ allele, value.var = "cn") %>% + merge.data.table(gg$nodes$dt[, .(og.node.id, width)], by = "og.node.id", all.x = TRUE) og.node.balance[, cn.imbalance := (major != minor)] og.node.balance[, cn.total := (major + minor)] - og.node.balance[, phased := ifelse(cn.imbalance == TRUE & width > 1, TRUE, FALSE)] + og.node.balance[, phased := ifelse(cn.imbalance == TRUE | width == 1, TRUE, FALSE)] ## mark these specifically as being allele-balanced - og.node.balance[, ab := phased == FALSE] + og.node.balance[, ab := (phased == FALSE)] if (verbose) { message("Identifying nodes in NA stretches") @@ -2742,7 +2893,7 @@ phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores = } } - ## identify unphased og nodes + ## identify unphased og nodes that are NEW (ignore previously marked as unphased) unphased.og.nodes = og.node.balance[phased == FALSE, og.node.id] ## identify corresponding minor/major nodes @@ -2763,10 +2914,8 @@ phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores = new.nodes.dt[node.id %in% ab.major.nodes, ab := TRUE] ## reset CN to total CN - new.nodes.dt[node.id %in% unphased.major.nodes, cn := og.node.balance$cn.total[match(og.node.id, og.node.balance$og.node.id)]] - - ## fix the CN of all of these nodes - ## new.nodes.dt[, fix := 1] + new.nodes.dt[node.id %in% unphased.major.nodes, + cn := og.node.balance$cn.total[match(og.node.id, og.node.balance$og.node.id)]] ## reformat nodes new.nodes.dt[allele == "unphased", col := alpha("gray", 0.5)] @@ -2774,20 +2923,20 @@ phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores = ## get nodes as GRanges new.nodes.dt = new.nodes.dt %>% split(new.nodes.dt$seqnames) %>% lapply(function(dt) {dt[order(start),]}) %>% rbindlist - new.nodes.gr = dt2gr(new.nodes.dt[, .(seqnames, start, end, - og.node.id, marginal.cn, allele, - var, nbins, weight, index, col, - cn.old, cn, fix, ywid, ab, - old.node.id = node.id)], - seqinfo = seqinfo(gg$nodes$gr), - seqlengths = seqlengths(gg$nodes$gr)) + sel.cols = intersect(colnames(new.nodes.dt), + c("seqnames", "start", "end", "og.node.id", "marginal.cn", "allele", + "var", "nbins", "index", "col", "cn", "cn.old", "fix", "ywid", "ab", + "node.id")) + new.nodes.dt = new.nodes.dt[, ..sel.cols] %>% setnames("node.id", "old.node.id") + new.nodes.gr = dt2gr(new.nodes.dt, seqinfo = seqinfo(gg$nodes$gr), seqlengths = seqlengths(gg$nodes$gr)) ## create new data.table for edges new.edges.dt = gg$edges$dt - + ## reset edge endpoints - ## browser() - dt = gg$nodes$dt[, .(og.node.id, allele, node.id)] %>% dcast.data.table(og.node.id ~ allele, value.var = "node.id") + dt = gg$nodes$dt[allele %in% c("major", "minor"), .(og.node.id, allele, node.id)] %>% + dcast.data.table(og.node.id ~ allele, value.var = "node.id") + new.edges.dt[(n1 %in% unphased.minor.nodes), n1 := dt$major[match(n1, dt$minor)]] new.edges.dt[(n2 %in% unphased.minor.nodes), n2 := dt$major[match(n2, dt$minor)]] @@ -2796,17 +2945,32 @@ phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores = new.edges.dt[, n2 := match(n2, new.nodes.gr$old.node.id)] ## label REF edges as straight or cross based on og.node.id - ## browser() - ## new.edges.dt[type == "REF" & cn > 0, length(unique(connection)), by = og.edge.id] %>% summary - new.edges.dt[type == "REF", orientation := .SD$connection[which(.SD$cn > 0)][1], by = og.edge.id] + new.edges.dt[type == "REF" & + n1.allele %in% c("major", "minor") & + n2.allele %in% c("major", "minor"), + orientation := .SD$connection[which(.SD$cn > 0)][1], by = og.edge.id] + + new.edges.dt[type == "REF" & + n1.allele %in% c("major", "minor") & + n2.allele %in% c("major", "minor") & + is.na(orientation), + orientation := "straight"] ## only keep REF edges in the correct orientation (regardless of CN) - ## only keep ALT edges with CN > 0 - new.edges.dt = new.edges.dt[(type == "REF" & connection == orientation) | - (type == "ALT" & cn > 0),] + new.edges.dt = new.edges.dt[(type == "REF" & + (connection == orientation | is.na(orientation))) | + (type == "ALT" & cn > 0),] ## deduplicate edges - new.edges.dt = new.edges.dt[, .(connection = connection[1], type = type[1], cn = sum(cn, na.rm = TRUE), col = col[1]), by = .(n1, n1.side, n2, n2.side)] + new.edges.dt = rbind(new.edges.dt[cnloh == TRUE & class == "REF",], + new.edges.dt[is.na(cnloh) | cnloh == FALSE | class != "REF", + .(connection = connection[1], type = type[1], + cn = sum(cn, na.rm = TRUE), + col = col[1]), + by = c("n1", "n1.side", "n2", "n2.side")], + fill = TRUE) + + ## new.edges.dt = new.edges.dt[, .(connection = connection[1], type = type[1], cn = sum(cn, na.rm = TRUE), col = col[1]), by = .(n1, n1.side, n2, n2.side)] if (verbose) { message("Creating new gGraph") @@ -2819,6 +2983,8 @@ phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores = } + + #' @name phased.binstats #' @title phased.binstats #' @description @@ -2856,7 +3022,9 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL, vbase.count.thres = 5, vbase.prop.thres = 0.9, min.bins = 3, min.var = 1e-3, max.span = 1e6, - verbose = TRUE, min.width = 1e6, mc.cores = 8) + min.width = 5e3, + fix.tiny = TRUE, + verbose = TRUE, mc.cores = 8) { if (verbose) { message("Checking inputs") @@ -2962,6 +3130,8 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL, gg$edges$mark(cnloh = FALSE) gg$edges[internal.edges]$mark(cnloh = TRUE) + browser() + if (verbose) { message("Number of internal edges marked in parent graph: ", length(internal.edges)) } @@ -2983,8 +3153,8 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL, pseudo.cnloh.junctions.dt[, span := pseudo.cnloh.junctions$span] ## compute node overlap with shadow - node.overlap = pseudo.cnloh.junctions$shadow %N% gg$nodes$gr - pseudo.cnloh.junctions.dt[, node.overlap.count := node.overlap] + ## node.overlap = pseudo.cnloh.junctions$shadow %N% gg$nodes$gr + ## pseudo.cnloh.junctions.dt[, node.overlap.count := node.overlap] ## mark candidates ## pseudo.cnloh.edges = c(pseudo.cnloh.junctions.dt[span < max.span & @@ -3262,6 +3432,34 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL, } phased.gg = gG(nodes = phased.gg.nodes, edges = phased.gg.edges) + + if (fix.tiny) { + + if (verbose) { + message("Fixing very tiny nodes") + } + + ## browser() + ## tiny.nodes = phased.gg$nodes$dt[width < min.width, node.id] + ## tiny.edges = phased.gg$edges$dt[(n1 %in% tiny.nodes | n2 %in% tiny.nodes) & + ## type == "REF" & + ## connection == "cross", edge.id] + + ## phased.gg$edges[tiny.edges]$mark(ub = 0, lb = 0) + + tmp = phased.gg$junctions$dt[, .(edge.id, type, class, connection)] + tmp[, span := phased.gg$junctions$span] + + more.tiny.edges = tmp[class != "REF" & connection == "cross" & span < min.width, edge.id] + + phased.gg$edges[more.tiny.edges]$mark(ub = 0, lb = 0) + + if (verbose) { + ## message("Number of tiny nodes: ", length(tiny.nodes)) + message("Number of tiny edges: ", length(more.tiny.edges)) + } + } + if (verbose) { message("Formatting gGraph") From 1ed8c749e411ef173d3fa74c8f76f7306f636bb8 Mon Sep 17 00:00:00 2001 From: Zi-Ning Choo Date: Wed, 1 Sep 2021 08:54:37 -0400 Subject: [PATCH 29/35] stash new balance --- R/balance.R | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 R/balance.R diff --git a/R/balance.R b/R/balance.R new file mode 100644 index 00000000..fc132db2 --- /dev/null +++ b/R/balance.R @@ -0,0 +1,63 @@ +#' @name balance2 +#' @title balance2 +#' +#' @description +#' +#' beta reimplementation of balance +#' +#' @param gg (gGraph) with node fields +#' - $cn (numeric) CN guess +#' - $weight (numeric) +#' @param lambda +#' @param marginal +#' @param phased (logical) default FALSE +#' @param ism (logical) default TRUE +#' @param M (numeric) default 1000 +#' @param epgap (numeric) default 0.1 +#' @param trelim (numeric) max size of uncompressed tree in GB +#' @param nodefileind (numeric) one of 0 (no node file) 1 (in memory compressed) 2 (on disk uncompressed) 3 (on disk compressed) default 1 +#' @return junction-balanced gGraph +balance2 = function(gg = NULL, + lambda = 100, + marginal = NULL, + phased = FALSE, + ism = TRUE, + M = 1000, + epgap = 0.1, + trelim = 16, + nodefileind = 1) { + +} + +#' @name check_balance_inputs +#' @title check_balance_inputs +#' +#' @description +#' +#' Validate input gGraph for balance +#' +#' @param gg (gGraph) +check_balance_inputs = function(gg) { +} + +#' @name create_node_variables +#' @title create_node_variables +#' +#' @description +#' create node CN, node residual, and helper variables for nodes +#' +#' @param gg +#' @return data.table +create_node_variables = function(gg) { +} + +#' @name create_edge_variables +#' @title create_edge_variables +#' +#' @description +#' edge CN, edge residual, and helper variables for edges +#' +#' @param gg +#' @return data.table +create_edge_variables = function(gg) { +} From d3ba43ba28e6f93d50449bf8d53c07267d1409fe Mon Sep 17 00:00:00 2001 From: Zi-Ning Choo Date: Wed, 1 Sep 2021 10:29:26 -0400 Subject: [PATCH 30/35] initial commit for phasing with haplotypes --- R/apps.R | 208 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 141 insertions(+), 67 deletions(-) diff --git a/R/apps.R b/R/apps.R index 60cdf063..930197f8 100644 --- a/R/apps.R +++ b/R/apps.R @@ -377,32 +377,6 @@ balance = function(gg, ## need delta plus and delta minus for nodes and edges delta.node = gg$dt[tight == FALSE, .(gid = index, cn, weight, vtype = 'C')] delta.edge = gg$sedgesdt[, .(gid = sedge.id, cn, weight, vtype = 'C')] - - ## make sure deltas obey preset upper and lower bounds for edge id - ## if ("lb" %in% colnames(gg$dt)) { - ## delta.node[, lb := gg$dt$lb[match(gid, gg$dt$index)]] - ## } else { - ## delta.node[, lb := 0] - ## } - - ## if ("ub" %in% colnames(gg$dt)) { - ## delta.node[, ub := gg$dt$ub[match(gid, gg$dt$index)]] - ## } else { - ## delta.node[, ub := M] - ## } - - ## ## make sure deltas obey preset upper and lower bounds for edge id - ## if ("lb" %in% colnames(gg$sedgesdt)) { - ## delta.edge[, lb := gg$sedgesdt$lb[match(gid, gg$sedgesdt$sedge.id)]] - ## } else { - ## delta.edge[, lb := 0] - ## } - - ## if ("ub" %in% colnames(gg$sedgesdt)) { - ## delta.edge[, ub := gg$sedgesdt$ub[match(gid, gg$sedgesdt$sedge.id)]] - ## } else { - ## delta.edge[, ub := M] - ## } deltas = rbind( delta.node[, .(gid, weight, vtype, type = "ndelta.plus")], @@ -411,11 +385,6 @@ balance = function(gg, delta.edge[, .(gid, weight, vtype, type = "edelta.minus")] ) - ## deltas[lb < 0, lb := 0] - ## deltas[ub > M, ub := M] - ## deltas[is.na(lb), lb := 0] - ## deltas[is.na(ub), ub := M] - vars = rbind( vars, deltas, @@ -448,14 +417,42 @@ balance = function(gg, vars[, ":="(allele = gg$dt$allele[node.match], og.node.id = gg$dt$og.node.id[node.match])] + + ## add ref/alt information and og.edge.id edge.match = match(vars[, sedge.id], gg$sedgesdt$sedge.id) - vars[, ":="(ref.or.alt = gg$sedgesdt$type[edge.match], ## need type info but rename column... - og.edge.id = gg$sedgesdt$og.edge.id[edge.match])] + vars[, ":="(ref.or.alt = gg$sedgesdt$type[edge.match], + connection = gg$sedgesdt$connection[edge.match], + og.edge.id = gg$sedgesdt$og.edge.id[edge.match], + n1 = gg$dt$snode.id[gg$sedgesdt$from[edge.match]], + n2 = gg$dt$snode.id[gg$sedgesdt$to[edge.match]])] + + vars[, n1.side := ifelse(n1 > 0, "right", "left")] + vars[, n2.side := ifelse(n2 > 0, "left", "right")] + vars[, n1 := abs(n1)] + vars[, n2 := abs(n2)] edge.indicator.vars = vars[type == "edge"][, type := "edge.indicator"][, vtype := "B"][, gid := sedge.id] vars = rbind(vars, edge.indicator.vars, fill = TRUE) + + ## add node haplotype indicators + ## these are binary indicators that determine whether a node belongs to H1 + ## only constrain positive-stranded nodes due to skew symmetry + haplotype.indicators = gg$dt[allele == "major" | allele == "minor" & snode.id > 0, + .(cn, snode.id, lb, ub, weight, og.node.id, + allele, gid = index, type = 'haplotype', vtype = 'B')] + vars = rbind(vars, haplotype.indicators, fill = TRUE) + + ## add H1 and H2 'AND' indicators which should have n1/n1.side/n2/n2.side metadata + h1.and.indicators = vars[sedge.id > 0 & type == "edge" & (connection == "straight" | connection == "cross"),][, vtype := "B"][, type := "h1.and.indicator"][, gid := sedge.id] + h2.and.indicators = vars[sedge.id > 0 & type == "edge" & (connection == "straight" | connection == "cross"),][, vtype := "B"][, type := "h2.and.indicator"][, gid := sedge.id] + + vars = rbind(vars, h1.and.indicators, h2.and.indicators, fill = TRUE) + + browser() + ## check h1/h2 indicators have n1/n1.side/n2/n2.side metadata + } if (ism) { @@ -515,9 +512,6 @@ balance = function(gg, straight.config = unique(vars[type == "edge.indicator" & ref.or.alt == "REF" & sedge.id > 0, ][, type := "straight.config"][, config.id := paste("straight", og.edge.id)], by = "og.edge.id") cross.config = unique(vars[type == "edge.indicator" & ref.or.alt == "REF" & sedge.id > 0, ][, type := "cross.config"][, config.id := paste("cross", og.edge.id)], by = "og.edge.id") - ## add straight/cross to REF edges - vars[type == "edge.indicator" & ref.or.alt == "REF", - connection := gg$sedgesdt$connection[match(sedge.id, gg$sedgesdt$sedge.id)]] ## add config ID's to corresponding edge indicators vars[type == "edge.indicator" & ref.or.alt == "REF" & sedge.id > 0, @@ -718,7 +712,85 @@ balance = function(gg, } if (phased) { - + + ## add haplotype indicator constraints + ## e.g. the haplotype indicators corresponding to the same og node must add up to 1 + iconstraints = vars[type == "haplotype" & snode.id > 0, + .(value = 1, id, cid = paste("haplotype.indicator", og.node.id))] + rhs = vars[type == "haplotype" & !duplicated(og.node.id), + .(value = 1, sense = "E", cid = paste("haplotype.indicator", og.node.id))] + + constraints = rbind(constraints, iconstraints, fill = TRUE) + b = rbind(b, rhs, fill = TRUE) + + ## add H1 AND constraint + h1.and.ids = merge.data.table(vars[type == "h1.and.indicator", .(n1, n2, edge.id = id, sedge.id)], + vars[type == "haplotype", .(n1.snode.id = snode.id, n1.id = id)], + by.x = "n1", + by.y = "n1.snode.id") %>% + merge.data.table(vars[type == "haplotype", .(n2.snode.id = snode.id, n2.id = id)], + by.x = "n1", + by.y = "n2.snode.id") + + h2.and.ids = merge.data.table(vars[type == "h2.and.indicator", .(n1, n2, edge.id = id, sedge.id)], + vars[type == "haplotype", .(n1.snode.id = snode.id, n1.id = id)], + by.x = "n1", + by.y = "n1.snode.id") %>% + merge.data.table(vars[type == "haplotype", .(n2.snode.id = snode.id, n2.id = id)], + by.x = "n1", + by.y = "n2.snode.id") + + ## verify only + sedge id + browser() + + ## there are four constraints that are needed to implement this first edge constraint (c1-3) + iconstraints = rbind(h1.and.ids[, .(value = 1, id = edge.id, cid = paste("h1.and.c1", sedge.id))], + h1.and.ids[, .(value = -1, id = n1.id, cid = paste("h1.and.c1", sedge.id))], + h1.and.ids[, .(value = 1, id = edge.id, cid = paste("h1.and.c2", sedge.id))], + h1.and.ids[, .(value = -1, id = n2.id, cid = paste("h1.and.c2", sedge.id))], + h1.and.ids[, .(value = 1, id = edge.id, cid = paste("h1.and.c3", sedge.id))], + h1.and.ids[, .(value = -1, id = n1.id, cid = paste("h1.and.c3", sedge.id))], + h1.and.ids[, .(value = -1, id = n2.id, cid = paste("h1.and.c3", sedge.id))]) + + rhs = rbind(h1.and.ids[, .(value = 0, sense = "L", cid = paste("h1.and.c1", sedge.id))], + h1.and.ids[, .(value = 0, sense = "L", cid = paste("h1.and.c2", sedge.id))], + h1.and.ids[, .(value = -1, sense = "G", cid = paste("h1.and.c3", sedge.id))]) + + constraints = rbind(constraints, iconstraints, fill = TRUE) + b = rbind(b, rhs, fill = TRUE) + + iconstraints = rbind(h2.and.ids[, .(value = 1, id = edge.id, cid = paste("h2.and.c1", sedge.id))], + h2.and.ids[, .(value = 1, id = n1.id, cid = paste("h2.and.c1", sedge.id))], + h2.and.ids[, .(value = 1, id = edge.id, cid = paste("h2.and.c2", sedge.id))], + h2.and.ids[, .(value = 1, id = n2.id, cid = paste("h2.and.c2", sedge.id))], + h2.and.ids[, .(value = 1, id = edge.id, cid = paste("h2.and.c3", sedge.id))], + h2.and.ids[, .(value = 1, id = n1.id, cid = paste("h2.and.c3", sedge.id))], + h2.and.ids[, .(value = 1, id = n2.id, cid = paste("h2.and.c3", sedge.id))]) + + rhs = rbind(h2.and.ids[, .(value = 1, sense = "L", cid = paste("h2.and.c1", sedge.id))], + h2.and.ids[, .(value = 1, sense = "L", cid = paste("h2.and.c2", sedge.id))], + h2.and.ids[, .(value = 1, sense = "G", cid = paste("h2.and.c3", sedge.id))]) + + constraints = rbind(constraints, iconstraints, fill = TRUE) + b = rbind(b, rhs, fill = TRUE) + + ## verify that there are no weird NA's and that there is only one set of constraints per sedge.id + + ## connect edge indicators to the haplotype configuration of connected edges + iconstraints = rbind(vars[type == "h1.and.indicator", + .(value = -1, id, cid = paste("haplotype.indicator", sedge.id))], + vars[type == "h2.and.indicator", + .(value = -1, id, cid = paste("haplotype.indicator", sedge.id))], + vars[type == "edge.indicator" & + (sedge.id %in% vars[type == "h1.and.indicator",]$sedge.id), + .(value = 1, id, cid = paste("haplotype.indicator", sedge.id))]) + rhs = unique(iconstraints[, .(value = 0, sense = "L", cid)], by = "cid") + + constraints = rbind(constraints, iconstraints, fill = TRUE) + b = rbind(b, rhs, fill = TRUE) + + ## verify that there are three of these per sedge.id! + ## add constraints that force indicators to be 1 if edge CN > 0 ## add constraints for upper bound (same setup as L0 penalty) - one per edge @@ -1143,42 +1215,44 @@ balance = function(gg, ## b = rbind(b, rhs, fill = TRUE) ## add ISM constraints for ALL REF edges (as CNLOH is now marked as ALT) + #' zchoo Wednesday, Sep 01, 2021 10:27:47 AM + ## this set of constraints is no longer necessary as node haplotypes have been added - iconstraints.from = unique( - vars[type == "edge.indicator" & ref.or.alt == "REF", ##& !(og.edge.id %in% cnloh.og.edges), - .(value = 1, id, - edge.id = abs(sedge.id), - snode.id = from, ## this is actually a misleading name because from is the row in gg$dt - cid = paste("ref.configuration.constraint.from", from))], - by = "edge.id" - ) + ## iconstraints.from = unique( + ## vars[type == "edge.indicator" & ref.or.alt == "REF", ##& !(og.edge.id %in% cnloh.og.edges), + ## .(value = 1, id, + ## edge.id = abs(sedge.id), + ## snode.id = from, ## this is actually a misleading name because from is the row in gg$dt + ## cid = paste("ref.configuration.constraint.from", from))], + ## by = "edge.id" + ## ) - iconstraints.to = unique( - vars[type == "edge.indicator" & ref.or.alt == "REF", ##& !(og.edge.id %in% cnloh.og.edges), - .(value = 1, id, - edge.id = abs(sedge.id), - snode.id = to, - cid = paste("ref.configuration.constraint.to", to))], - by = "edge.id" - ) + ## iconstraints.to = unique( + ## vars[type == "edge.indicator" & ref.or.alt == "REF", ##& !(og.edge.id %in% cnloh.og.edges), + ## .(value = 1, id, + ## edge.id = abs(sedge.id), + ## snode.id = to, + ## cid = paste("ref.configuration.constraint.to", to))], + ## by = "edge.id" + ## ) - iconstraints = rbind(iconstraints.from, iconstraints.to) + ## iconstraints = rbind(iconstraints.from, iconstraints.to) - ## sum to at most 1 if phased, unconstrained if unphased - iconstraints[, ":="(allele = gg$dt$allele[iconstraints$snode.id])] + ## ## sum to at most 1 if phased, unconstrained if unphased + ## iconstraints[, ":="(allele = gg$dt$allele[iconstraints$snode.id])] - edge.indicator.b = unique(iconstraints[allele %in% c("major", "minor"), - .(value = 1, sense = "L", cid)], - by = "cid") - - constraints = rbind( - constraints, - iconstraints[allele %in% c("major", "minor"), - .(value, id, cid)], - fill = TRUE) + ## edge.indicator.b = unique(iconstraints[allele %in% c("major", "minor"), + ## .(value = 1, sense = "L", cid)], + ## by = "cid") + + ## constraints = rbind( + ## constraints, + ## iconstraints[allele %in% c("major", "minor"), + ## .(value, id, cid)], + ## fill = TRUE) - ## add to b - b = rbind(b, edge.indicator.b, fill = TRUE) + ## ## add to b + ## b = rbind(b, edge.indicator.b, fill = TRUE) } From 63592c114449005d01b2c526dfed73758b52587d Mon Sep 17 00:00:00 2001 From: Zi-Ning Choo Date: Mon, 13 Sep 2021 14:06:44 -0400 Subject: [PATCH 31/35] stash changes for phasing --- R/apps.R | 1220 +++++++++++++++++++++++++++++------------------------- 1 file changed, 659 insertions(+), 561 deletions(-) diff --git a/R/apps.R b/R/apps.R index 930197f8..a41a431d 100644 --- a/R/apps.R +++ b/R/apps.R @@ -69,8 +69,9 @@ #' @param nodefileind (numeric) one of 0 (no node file) 1 (in memory compressed) 2 (on disk uncompressed) 3 (on disk compressed) default 1 #' @param debug (logical) returns list with names gg and sol. sol contains full RCPLEX solution. (default FALSE) #' @param gurobi (logical) use gurobi if TRUE uses gurobi else CPLEX default FALSE -#' - +#' @param force.haplotypes (logical) default TRUE +#' @param max.span (numeric) the maximum span of an edge below which both endpoints must be on the same parental haplotype default 1e9 +#' #' @return balanced gGraph maximally resembling input gg in CN while minimizing loose end penalty lambda. #' @author Marcin Imielinski #' @@ -85,9 +86,9 @@ balance = function(gg, loose.collapse = FALSE, M = 1e3, phased = FALSE, - ism = FALSE, - force.major = FALSE, - force.alt = TRUE, + ism = TRUE, + force.major = TRUE, + force.alt = FALSE, cnloh = FALSE, lp = TRUE, verbose = 1, @@ -95,6 +96,7 @@ balance = function(gg, trelim = 32e3, nodefileind = 1, epgap = 1e-3, + max.span = 1e9, ## max span in bp debug = FALSE) { if (verbose) { @@ -138,7 +140,7 @@ balance = function(gg, stop("cannot run phased balance without $allele field in nodes") } } - + if (!is.null(marginal)) { if (!inherits(marginal, 'GRanges') || is.null(marginal$cn)) { stop('marginal must be a GRanges with field $cn') @@ -176,7 +178,7 @@ balance = function(gg, emarginal$set(weight = 1) } } - + ## default local lambda: default local lambda is 1 for consistency with JaBbA if (!('lambda' %in% names(gg$nodes$dt))) gg$nodes$mark(lambda = 1) @@ -199,7 +201,7 @@ balance = function(gg, { gg$edges$mark(reward = 0) } - + ## handle parsing of efix, nfix, nrelax, erelax if (!any(deparse(substitute(nfix)) == "NULL")) ## R voodo to allow "with" style evaluation nfix = tryCatch(eval(eval(parse(text = substitute(deparse(substitute(nfix)))), parent.frame()), gg$nodes$dt, parent.frame(2)), error = function(e) NULL) @@ -265,78 +267,78 @@ balance = function(gg, if (is.null(gg$edges$dt$ub)) gg$edges$mark(ub = Inf) - if (loose.collapse) - { - if (verbose) - message('Collapsing loose ends') - - uleft = unique(gr.start(gg$nodes$gr)) - uright = unique(gr.end(gg$nodes$gr)) - - gg$nodes$mark(loose.left.id = paste0(gr.match(gr.start(gg$nodes$gr), uleft), 'l')) - gg$nodes$mark(loose.right.id = paste0(gr.match(gr.end(gg$nodes$gr), uright), 'r')) - } - else - { - gg$nodes$mark(loose.left.id = paste0(1:length(gg$nodes), 'l')) - gg$nodes$mark(loose.right.id = paste0(1:length(gg$nodes), 'r')) - } + if (loose.collapse) + { + if (verbose) + message('Collapsing loose ends') - ######## - ## VARIABLES - ######## - - ## create state space, keeping track of graph ids - vars = rbind( - gg$dt[, .(cn, snode.id, lb, ub, weight, gid = index, type = 'node', vtype = 'I')], ## signed nodes - gg$sedgesdt[, .(from, to, lb, ub, sedge.id, cn, reward, gid = sedge.id, type = 'edge', vtype = 'I')], ## signed edges - - ## for loose ends lid marks all "unique" loose ends (which if loose.collapse = TRUE - ## will be defined on the basis of coordinate overlap) - gg$dt[tight == FALSE, .(cn = NA, snode.id, lambda, gid = index, - ulid = paste0(index, 'i'), - lid = ifelse(strand == '+', loose.left.id, paste0('-', loose.right.id)), - type = 'loose.in', vtype = 'I')], ## incoming loose ends - gg$dt[tight == FALSE, .(cn = NA, snode.id, lambda, gid = index, - ulid = paste0(index, 'o'), - lid = ifelse(strand == '+', loose.right.id, paste0('-', loose.left.id)), - type = 'loose.out', vtype = 'I')], ## outgoing loose ends - gg$dt[tight == FALSE, .(gid = index, cn, weight, type = 'nresidual', vtype = 'C')], ## node residual - gg$sedgesdt[, .(gid = sedge.id, cn, weight, type = 'eresidual', vtype = 'C')], ## edge residual - fill = TRUE) - - if (L0) - { - ## loose ends are labeled with lid and ulid, lid is only relevant if loose.collapse is true - ## (i.e. we need indicator.sum and indicator.sum.indicator - if (verbose) { - message("adding l0 penalty indicator") + uleft = unique(gr.start(gg$nodes$gr)) + uright = unique(gr.end(gg$nodes$gr)) + + gg$nodes$mark(loose.left.id = paste0(gr.match(gr.start(gg$nodes$gr), uleft), 'l')) + gg$nodes$mark(loose.right.id = paste0(gr.match(gr.end(gg$nodes$gr), uright), 'r')) + } + else + { + gg$nodes$mark(loose.left.id = paste0(1:length(gg$nodes), 'l')) + gg$nodes$mark(loose.right.id = paste0(1:length(gg$nodes), 'r')) } - vars = rbind(vars, - rbind( - vars[type == 'loose.in', ][ , type := 'loose.in.indicator'][, vtype := 'B'][, gid := lid], - vars[type == 'loose.out', ][ , type := 'loose.out.indicator'][, vtype := 'B'][, gid := lid] - )) +######## + ## VARIABLES +######## + + ## create state space, keeping track of graph ids + vars = rbind( + gg$dt[, .(cn, snode.id, lb, ub, weight, gid = index, type = 'node', vtype = 'I')], ## signed nodes + gg$sedgesdt[, .(from, to, lb, ub, sedge.id, cn, reward, gid = sedge.id, type = 'edge', vtype = 'I')], ## signed edges + + ## for loose ends lid marks all "unique" loose ends (which if loose.collapse = TRUE + ## will be defined on the basis of coordinate overlap) + gg$dt[tight == FALSE, .(cn = NA, snode.id, lambda, gid = index, + ulid = paste0(index, 'i'), + lid = ifelse(strand == '+', loose.left.id, paste0('-', loose.right.id)), + type = 'loose.in', vtype = 'I')], ## incoming loose ends + gg$dt[tight == FALSE, .(cn = NA, snode.id, lambda, gid = index, + ulid = paste0(index, 'o'), + lid = ifelse(strand == '+', loose.right.id, paste0('-', loose.left.id)), + type = 'loose.out', vtype = 'I')], ## outgoing loose ends + gg$dt[tight == FALSE, .(gid = index, cn, weight, type = 'nresidual', vtype = 'C')], ## node residual + gg$sedgesdt[, .(gid = sedge.id, cn, weight, type = 'eresidual', vtype = 'C')], ## edge residual + fill = TRUE) - if (loose.collapse) + if (L0) { - ## sum will sum all the loose ends assocaited with the same lid - vars = rbind(vars, - unique(rbind( - vars[type == 'loose.in', ][ , type := 'loose.in.indicator.sum'][, vtype := 'I'][, gid := lid], - vars[type == 'loose.out', ][ , type := 'loose.out.indicator.sum'][, vtype := 'I'][, gid := lid] - ), by = 'gid')) - - ## sum.indicator is an binary indicator on the sum - vars = rbind(vars, - rbind( - vars[type == 'loose.in.indicator.sum', ][ , type := 'loose.in.indicator.sum.indicator'][, vtype := 'B'][, gid := lid], - vars[type == 'loose.out.indicator.sum', ][ , type := 'loose.out.indicator.sum.indicator'][, vtype := 'B'][, gid := lid] - )) - } - } - + ## loose ends are labeled with lid and ulid, lid is only relevant if loose.collapse is true + ## (i.e. we need indicator.sum and indicator.sum.indicator + if (verbose) { + message("adding l0 penalty indicator") + } + + vars = rbind(vars, + rbind( + vars[type == 'loose.in', ][ , type := 'loose.in.indicator'][, vtype := 'B'][, gid := lid], + vars[type == 'loose.out', ][ , type := 'loose.out.indicator'][, vtype := 'B'][, gid := lid] + )) + + if (loose.collapse) + { + ## sum will sum all the loose ends assocaited with the same lid + vars = rbind(vars, + unique(rbind( + vars[type == 'loose.in', ][ , type := 'loose.in.indicator.sum'][, vtype := 'I'][, gid := lid], + vars[type == 'loose.out', ][ , type := 'loose.out.indicator.sum'][, vtype := 'I'][, gid := lid] + ), by = 'gid')) + + ## sum.indicator is an binary indicator on the sum + vars = rbind(vars, + rbind( + vars[type == 'loose.in.indicator.sum', ][ , type := 'loose.in.indicator.sum.indicator'][, vtype := 'B'][, gid := lid], + vars[type == 'loose.out.indicator.sum', ][ , type := 'loose.out.indicator.sum.indicator'][, vtype := 'B'][, gid := lid] + )) + } + } + if (!is.null(marginal)) { ## first disjoin marginal against the nodes ## ie wee ned to create a separate residual variable for every unique @@ -410,6 +412,23 @@ balance = function(gg, } } + #' zchoo Thursday, Sep 02, 2021 11:16:57 AM + ## moved from being ISM-specific annotation as this is more generally useful information + ## add telomeric annotation + qtips = gr.end(si2gr(seqlengths(gg$nodes))) ## location of q arm tips + term.in = c(which(start(gg$nodes$gr) == 1), ## beginning of chromosome + -which(gg$nodes$gr %^% qtips)) ## flip side of chromosome end + term.out = -term.in ## out is reciprocal of in + + ## annotate loose indicators with this + vars[!is.na(snode.id), telomeric := ifelse(snode.id %in% term.in | + snode.id %in% term.out, + TRUE, + FALSE)] + + + + if (phased) { ## add allele information and og.node.id @@ -425,6 +444,7 @@ balance = function(gg, vars[, ":="(ref.or.alt = gg$sedgesdt$type[edge.match], connection = gg$sedgesdt$connection[edge.match], og.edge.id = gg$sedgesdt$og.edge.id[edge.match], + span = gg$sedgesdt$span[edge.match], ## NEED SPAN n1 = gg$dt$snode.id[gg$sedgesdt$from[edge.match]], n2 = gg$dt$snode.id[gg$sedgesdt$to[edge.match]])] @@ -436,35 +456,77 @@ balance = function(gg, edge.indicator.vars = vars[type == "edge"][, type := "edge.indicator"][, vtype := "B"][, gid := sedge.id] vars = rbind(vars, edge.indicator.vars, fill = TRUE) + #' zchoo Thursday, Sep 02, 2021 05:30:53 PM + #' moved to earlier + ## REF edge configuration constraint (added by default basically) + ## only add this if there are no unphased nodes + if (cnloh) { + + ## if allow CNLOH, the sum of edge indicators corresponding with og edge id is LEQ 2 + ## this is only allowed in constant CN regions and if breakpoint is not shared with any ALT edges + + ## penalize CNLOH edges + + if (!is.null(gg$edges$dt$cnloh)) { + cnloh.edges = gg$edges$dt[cnloh == TRUE & type == "ALT", edge.id] %>% unique + if (verbose) { + message("Number of marked CNLOH edges: ", length(cnloh.edges)) + } + + ## add CNLOH annotation to variables + ## browser() + vars[, cnloh := FALSE] + vars[(type == "edge.indicator" | type == "edge" | type == "eresidual") & + ref.or.alt == "ALT" & (abs(sedge.id) %in% cnloh.edges), + ":="(cnloh = TRUE)] + + } else { + warning("CNLOH not specified on edges. Disallowing!") + cnloh.og.edges = c() + vars[, cnloh := FALSE] + } + } else { + + cnloh.og.edges = c() + vars[, cnloh := FALSE] + + } + ## add node haplotype indicators ## these are binary indicators that determine whether a node belongs to H1 ## only constrain positive-stranded nodes due to skew symmetry - haplotype.indicators = gg$dt[allele == "major" | allele == "minor" & snode.id > 0, + haplotype.indicators = gg$dt[(allele == "major" | allele == "minor") & snode.id > 0, .(cn, snode.id, lb, ub, weight, og.node.id, allele, gid = index, type = 'haplotype', vtype = 'B')] vars = rbind(vars, haplotype.indicators, fill = TRUE) ## add H1 and H2 'AND' indicators which should have n1/n1.side/n2/n2.side metadata - h1.and.indicators = vars[sedge.id > 0 & type == "edge" & (connection == "straight" | connection == "cross"),][, vtype := "B"][, type := "h1.and.indicator"][, gid := sedge.id] - h2.and.indicators = vars[sedge.id > 0 & type == "edge" & (connection == "straight" | connection == "cross"),][, vtype := "B"][, type := "h2.and.indicator"][, gid := sedge.id] + ## only add these for low-span edges + ## and where CNLOH is FALSE + h1.and.indicators = vars[sedge.id > 0 & type == "edge" & (connection == "straight" | connection == "cross") & span < max.span & cnloh == FALSE,][, vtype := "B"][, type := "h1.and.indicator"][, gid := sedge.id] + h2.and.indicators = vars[sedge.id > 0 & type == "edge" & (connection == "straight" | connection == "cross") & span < max.span & cnloh == FALSE,][, vtype := "B"][, type := "h2.and.indicator"][, gid := sedge.id] vars = rbind(vars, h1.and.indicators, h2.and.indicators, fill = TRUE) - browser() + ## browser() + ## h1.and.indicators + ## vars[cnloh,] ## check h1/h2 indicators have n1/n1.side/n2/n2.side metadata - + } if (ism) { - ## add telomeric annotation - qtips = gr.end(si2gr(seqlengths(gg$nodes))) ## location of q arm tips - term.in = c(which(start(gg$nodes$gr) == 1), ## beginning of chromosome - -which(gg$nodes$gr %^% qtips)) ## flip side of chromosome end - term.out = -term.in ## out is reciprocal of in + #' zchoo Thursday, Sep 02, 2021 11:15:11 AM + ## moved telomeric annotation + ## ## add telomeric annotation + ## qtips = gr.end(si2gr(seqlengths(gg$nodes))) ## location of q arm tips + ## term.in = c(which(start(gg$nodes$gr) == 1), ## beginning of chromosome + ## -which(gg$nodes$gr %^% qtips)) ## flip side of chromosome end + ## term.out = -term.in ## out is reciprocal of in - ## annotate loose indicators with this - vars[!is.na(snode.id), telomeric := ifelse(snode.id %in% term.in | snode.id %in% term.out, TRUE, FALSE)] + ## ## annotate loose indicators with this + ## vars[!is.na(snode.id), telomeric := ifelse(snode.id %in% term.in | snode.id %in% term.out, TRUE, FALSE)] ## if not phased, must add edge indicators (for just the ALT edges) if (!phased) { @@ -581,7 +643,7 @@ balance = function(gg, ## vars[type %in% c('node', 'edge'), lb := ifelse(is.na(lb), 0, pmax(lb, 0, na.rm = TRUE)] ## vars[type %in% c('node', 'edge'), ub := ifelse(is.na(ub), M, pmin(ub, M, na.rm = TRUE))] vars[type %in% c('loose.in', 'loose.out'), ":="(lb = 0, ub = Inf)] - + vars[type %in% c('edge'), reward := pmax(reward, 0, na.rm = TRUE)] @@ -601,135 +663,146 @@ balance = function(gg, vars[(type %in% c('loose.in', 'loose.in.indicator')) & (snode.id %in% term.in), terminal := TRUE] vars[(type %in% c('loose.out', 'loose.out.indicator')) & (snode.id %in% term.out), terminal := TRUE] - ######## - ## CONSTRAINTS - ## the key principle behind this "melted" form of constraint building is the cid - ## (constraint id) which is the key that will group coefficients into constraints - ## when we finally build the matrices. So all we need to do is make sure that - ## that value / cid pairs make sense and that every cid has an entry in b - ######## - - ## we need one junction balance constraint per loose end - - ## constraints indexed by cid - constraints = rbind( - vars[type == 'loose.in', .(value = 1, id, cid = paste('in', gid))], - vars[type == 'edge', .(value = 1, id, cid = paste('in', to))], - vars[type == 'node', .(value = -1, id, cid = paste('in', gid))], - vars[type == 'loose.out', .(value = 1, id, cid = paste('out', gid))], - vars[type == 'edge', .(value = 1, id, cid = paste('out', from))], - vars[type == 'node', .(value = -1, id, cid = paste('out', gid))], - fill = TRUE) - - b = rbind( - vars[type == 'node', .(value = 0, sense = 'E', cid = paste('in', gid))], - vars[type == 'node', .(value = 0, sense = 'E', cid = paste('out', gid))], - fill = TRUE) - - ## add to the constraints the definitions of the node and edge - ## residuals - constraints = rbind( - constraints, - rbind( - vars[type == 'node', .(value = 1, id, cid = paste('nresidual', gid))], - vars[type == 'nresidual', .(value = -1, id, cid = paste('nresidual', gid))], - vars[type == 'edge', .(value = 1, id, cid = paste('eresidual', gid))], - vars[type == 'eresidual', .(value = -1, id, cid = paste('eresidual', gid))], - fill = TRUE) - ) - - b = rbind(b, - vars[type == 'node', .(value = cn, sense = 'E', cid = paste('nresidual', gid))], - vars[type == 'edge', .(value = cn, sense = 'E', cid = paste('eresidual', gid))], - fill = TRUE) - - ## add the reverse complement equality constraints on nodes and edges - constraints = rbind( - constraints, - rbind( ## +1 coefficient for positive nodes, -1 for negative nodes, matched by abs (snode.id) - vars[type == 'node', .(value = sign(snode.id), id, cid = paste('nrc', abs(snode.id)))], - vars[type == 'edge', .(value = sign(sedge.id), id, cid = paste('erc', abs(sedge.id)))], - fill = TRUE) - ) - - b = rbind(b, - vars[type == 'node' & snode.id>0, .(value = 0, sense = 'E', cid = paste('nrc', abs(snode.id)))], - vars[type == 'edge' & sedge.id>0, .(value = 0, sense = 'E', cid = paste('erc', abs(sedge.id)))], - fill = TRUE) + - ## if solving as LP, add deltas constraints (absolute value trick) +######## + ## CONSTRAINTS + ## the key principle behind this "melted" form of constraint building is the cid + ## (constraint id) which is the key that will group coefficients into constraints + ## when we finally build the matrices. So all we need to do is make sure that + ## that value / cid pairs make sense and that every cid has an entry in b +######## - if (lp) { - if (verbose) { - message("adding delta constraints for LP") - } + ## we need one junction balance constraint per loose end - vars[type %like% "delta.plus" | type %like% "delta.minus", ":="(ub = M, lb = 0)] + ## constraints indexed by cid + constraints = rbind( + vars[type == 'loose.in', .(value = 1, id, cid = paste('in', gid))], + vars[type == 'edge', .(value = 1, id, cid = paste('in', to))], + vars[type == 'node', .(value = -1, id, cid = paste('in', gid))], + vars[type == 'loose.out', .(value = 1, id, cid = paste('out', gid))], + vars[type == 'edge', .(value = 1, id, cid = paste('out', from))], + vars[type == 'node', .(value = -1, id, cid = paste('out', gid))], + fill = TRUE) - ## add the residual constraints - ndelta.slack = rbind( - vars[type == "nresidual", .(value = -1, id, cid = paste("ndelta.minus.slack", gid))], - vars[type == "ndelta.minus", .(value = -1, id, cid = paste("ndelta.minus.slack", gid))], - vars[type == "nresidual", .(value = 1, id, cid = paste("ndelta.plus.slack", gid))], - vars[type == "ndelta.plus", .(value = -1, id, cid = paste("ndelta.plus.slack", gid))] - ) + b = rbind( + vars[type == 'node', .(value = 0, sense = 'E', cid = paste('in', gid))], + vars[type == 'node', .(value = 0, sense = 'E', cid = paste('out', gid))], + fill = TRUE) - ndelta.slack.rhs = rbind( - vars[type == "ndelta.minus", .(value = 0, sense = "L", cid = paste("ndelta.minus.slack", gid))], - vars[type == "ndelta.plus", .(value = 0, sense = "L", cid = paste("ndelta.plus.slack", gid))] + ## add to the constraints the definitions of the node and edge + ## residuals + constraints = rbind( + constraints, + rbind( + vars[type == 'node', .(value = 1, id, cid = paste('nresidual', gid))], + vars[type == 'nresidual', .(value = -1, id, cid = paste('nresidual', gid))], + vars[type == 'edge', .(value = 1, id, cid = paste('eresidual', gid))], + vars[type == 'eresidual', .(value = -1, id, cid = paste('eresidual', gid))], + fill = TRUE) ) - edelta.slack = rbind( - vars[type == "eresidual", .(value = -1, id, cid = paste("edelta.minus.slack", gid))], - vars[type == "edelta.minus", .(value = -1, id, cid = paste("edelta.minus.slack", gid))], - vars[type == "eresidual", .(value = 1, id, cid = paste("edelta.plus.slack", gid))], - vars[type == "edelta.plus", .(value = -1, id, cid = paste("edelta.plus.slack", gid))] + b = rbind(b, + vars[type == 'node', .(value = cn, sense = 'E', cid = paste('nresidual', gid))], + vars[type == 'edge', .(value = cn, sense = 'E', cid = paste('eresidual', gid))], + fill = TRUE) + + ## add the reverse complement equality constraints on nodes and edges + constraints = rbind( + constraints, + rbind( ## +1 coefficient for positive nodes, -1 for negative nodes, matched by abs (snode.id) + vars[type == 'node', .(value = sign(snode.id), id, cid = paste('nrc', abs(snode.id)))], + vars[type == 'edge', .(value = sign(sedge.id), id, cid = paste('erc', abs(sedge.id)))], + fill = TRUE) ) + + b = rbind(b, + vars[type == 'node' & snode.id>0, .(value = 0, sense = 'E', cid = paste('nrc', abs(snode.id)))], + vars[type == 'edge' & sedge.id>0, .(value = 0, sense = 'E', cid = paste('erc', abs(sedge.id)))], + fill = TRUE) - edelta.slack.rhs = rbind( - vars[type == "edelta.minus", .(value = 0, sense = "L", cid = paste("edelta.minus.slack", gid))], - vars[type == "edelta.plus", .(value = 0, sense = "L", cid = paste("edelta.plus.slack", gid))] - ) - mdelta.slack = rbind( - vars[type == "mresidual", .(value = -1, id, cid = paste("mdelta.minus.slack", gid))], - vars[type == "mdelta.minus", .(value = -1, id, cid = paste("mdelta.minus.slack", gid))], - vars[type == "mresidual", .(value = 1, id, cid = paste("mdelta.plus.slack", gid))], - vars[type == "mdelta.plus", .(value = -1, id, cid = paste("mdelta.plus.slack", gid))] - ) + ## if solving as LP, add deltas constraints (absolute value trick) - mdelta.slack.rhs = rbind( - vars[type == "mdelta.minus", .(value = 0, sense = "L", cid = paste("mdelta.minus.slack", gid))], - vars[type == "mdelta.plus", .(value = 0, sense = "L", cid = paste("mdelta.plus.slack", gid))] - ) + if (lp) { + if (verbose) { + message("adding delta constraints for LP") + } - constraints = rbind(constraints, ndelta.slack, edelta.slack, mdelta.slack, fill = TRUE) - b = rbind(b, ndelta.slack.rhs, edelta.slack.rhs, mdelta.slack.rhs, fill = TRUE) + vars[type %like% "delta.plus" | type %like% "delta.minus", ":="(ub = M, lb = 0)] - ## browser() + ## add the residual constraints + ndelta.slack = rbind( + vars[type == "nresidual", .(value = -1, id, cid = paste("ndelta.minus.slack", gid))], + vars[type == "ndelta.minus", .(value = -1, id, cid = paste("ndelta.minus.slack", gid))], + vars[type == "nresidual", .(value = 1, id, cid = paste("ndelta.plus.slack", gid))], + vars[type == "ndelta.plus", .(value = -1, id, cid = paste("ndelta.plus.slack", gid))] + ) - } + ndelta.slack.rhs = rbind( + vars[type == "ndelta.minus", .(value = 0, sense = "L", cid = paste("ndelta.minus.slack", gid))], + vars[type == "ndelta.plus", .(value = 0, sense = "L", cid = paste("ndelta.plus.slack", gid))] + ) + + edelta.slack = rbind( + vars[type == "eresidual", .(value = -1, id, cid = paste("edelta.minus.slack", gid))], + vars[type == "edelta.minus", .(value = -1, id, cid = paste("edelta.minus.slack", gid))], + vars[type == "eresidual", .(value = 1, id, cid = paste("edelta.plus.slack", gid))], + vars[type == "edelta.plus", .(value = -1, id, cid = paste("edelta.plus.slack", gid))] + ) + + edelta.slack.rhs = rbind( + vars[type == "edelta.minus", .(value = 0, sense = "L", cid = paste("edelta.minus.slack", gid))], + vars[type == "edelta.plus", .(value = 0, sense = "L", cid = paste("edelta.plus.slack", gid))] + ) + + mdelta.slack = rbind( + vars[type == "mresidual", .(value = -1, id, cid = paste("mdelta.minus.slack", gid))], + vars[type == "mdelta.minus", .(value = -1, id, cid = paste("mdelta.minus.slack", gid))], + vars[type == "mresidual", .(value = 1, id, cid = paste("mdelta.plus.slack", gid))], + vars[type == "mdelta.plus", .(value = -1, id, cid = paste("mdelta.plus.slack", gid))] + ) + + mdelta.slack.rhs = rbind( + vars[type == "mdelta.minus", .(value = 0, sense = "L", cid = paste("mdelta.minus.slack", gid))], + vars[type == "mdelta.plus", .(value = 0, sense = "L", cid = paste("mdelta.plus.slack", gid))] + ) + + constraints = rbind(constraints, ndelta.slack, edelta.slack, mdelta.slack, fill = TRUE) + b = rbind(b, ndelta.slack.rhs, edelta.slack.rhs, mdelta.slack.rhs, fill = TRUE) + + ## browser() + + } if (phased) { ## add haplotype indicator constraints ## e.g. the haplotype indicators corresponding to the same og node must add up to 1 iconstraints = vars[type == "haplotype" & snode.id > 0, - .(value = 1, id, cid = paste("haplotype.indicator", og.node.id))] - rhs = vars[type == "haplotype" & !duplicated(og.node.id), - .(value = 1, sense = "E", cid = paste("haplotype.indicator", og.node.id))] + .(value = 1, id, cid = paste("haplotype.node", og.node.id))] + rhs = unique(vars[type == "haplotype", + .(value = 1, sense = "E", cid = paste("haplotype.node", og.node.id))], + by = "cid") constraints = rbind(constraints, iconstraints, fill = TRUE) b = rbind(b, rhs, fill = TRUE) + ## check that there are two per og edge id + ## browser() + ## tmp = iconstraints[, .(count = .N), by = cid] + ## all(tmp[, count] == 2, na.rm = TRUE) + ## length(unique(rhs[, cid])) == length(unique(iconstraints[, cid])) + ## length(unique(rhs[, cid])) == length(unique(gg$nodes$dt[allele == "major", og.node.id])) + ## add H1 AND constraint h1.and.ids = merge.data.table(vars[type == "h1.and.indicator", .(n1, n2, edge.id = id, sedge.id)], vars[type == "haplotype", .(n1.snode.id = snode.id, n1.id = id)], by.x = "n1", by.y = "n1.snode.id") %>% merge.data.table(vars[type == "haplotype", .(n2.snode.id = snode.id, n2.id = id)], - by.x = "n1", + by.x = "n2", by.y = "n2.snode.id") h2.and.ids = merge.data.table(vars[type == "h2.and.indicator", .(n1, n2, edge.id = id, sedge.id)], @@ -737,11 +810,11 @@ balance = function(gg, by.x = "n1", by.y = "n1.snode.id") %>% merge.data.table(vars[type == "haplotype", .(n2.snode.id = snode.id, n2.id = id)], - by.x = "n1", + by.x = "n2", by.y = "n2.snode.id") ## verify only + sedge id - browser() + ## browser() ## there are four constraints that are needed to implement this first edge constraint (c1-3) iconstraints = rbind(h1.and.ids[, .(value = 1, id = edge.id, cid = paste("h1.and.c1", sedge.id))], @@ -759,6 +832,12 @@ balance = function(gg, constraints = rbind(constraints, iconstraints, fill = TRUE) b = rbind(b, rhs, fill = TRUE) + ## tmp = iconstraints[, .(count = .N), by = cid] + ## all(tmp[cid %like% "c1", count] == 2) + ## all(tmp[cid %like% "c2", count] == 2) + ## all(tmp[cid %like% "c3", count] == 3) + + iconstraints = rbind(h2.and.ids[, .(value = 1, id = edge.id, cid = paste("h2.and.c1", sedge.id))], h2.and.ids[, .(value = 1, id = n1.id, cid = paste("h2.and.c1", sedge.id))], h2.and.ids[, .(value = 1, id = edge.id, cid = paste("h2.and.c2", sedge.id))], @@ -767,14 +846,19 @@ balance = function(gg, h2.and.ids[, .(value = 1, id = n1.id, cid = paste("h2.and.c3", sedge.id))], h2.and.ids[, .(value = 1, id = n2.id, cid = paste("h2.and.c3", sedge.id))]) - rhs = rbind(h2.and.ids[, .(value = 1, sense = "L", cid = paste("h2.and.c1", sedge.id))], - h2.and.ids[, .(value = 1, sense = "L", cid = paste("h2.and.c2", sedge.id))], - h2.and.ids[, .(value = 1, sense = "G", cid = paste("h2.and.c3", sedge.id))]) + rhs = unique(rbind(h2.and.ids[, .(value = 1, sense = "L", cid = paste("h2.and.c1", sedge.id))], + h2.and.ids[, .(value = 1, sense = "L", cid = paste("h2.and.c2", sedge.id))], + h2.and.ids[, .(value = 1, sense = "G", cid = paste("h2.and.c3", sedge.id))]), + by = "cid") constraints = rbind(constraints, iconstraints, fill = TRUE) b = rbind(b, rhs, fill = TRUE) ## verify that there are no weird NA's and that there is only one set of constraints per sedge.id + ## tmp = iconstraints[, .(count = .N), by = cid] + ## all(tmp[cid %like% "c1", count] == 2) + ## all(tmp[cid %like% "c2", count] == 2) + ## all(tmp[cid %like% "c3", count] == 3) ## connect edge indicators to the haplotype configuration of connected edges iconstraints = rbind(vars[type == "h1.and.indicator", @@ -790,7 +874,9 @@ balance = function(gg, b = rbind(b, rhs, fill = TRUE) ## verify that there are three of these per sedge.id! - + ## tmp = iconstraints[, .(count = .N), by = cid] + ## all(tmp[, count] == 3) + ## add constraints that force indicators to be 1 if edge CN > 0 ## add constraints for upper bound (same setup as L0 penalty) - one per edge @@ -905,7 +991,7 @@ balance = function(gg, #' zchoo Tuesday, Jun 15, 2021 11:53:15 AM #' this constraint appears to be valid even if running phasing. ## if (!phased) { - ## extremity exclusivity (relevant for ALL graphs) + ## extremity exclusivity (relevant for ALL graphs) loose.constraints = rbind( vars[type == "loose.in.indicator" & sign(snode.id) == 1 & telomeric == FALSE, .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))], @@ -927,25 +1013,27 @@ balance = function(gg, b = rbind(b, edge.b, loose.b, fill = TRUE) - ## edge.ee.ids = unique(c(vars[type == "edge.indicator", ee.id.n1], vars[type == "edge.indicator", ee.id.n2])) - ## edge.ee.ids = edge.ee.ids[!is.na(edge.ee.ids)] + ## edge.ee.ids = unique(c(vars[type == "edge.indicator", ee.id.n1], vars[type == "edge.indicator", ee.id.n2])) + ## edge.ee.ids = edge.ee.ids[!is.na(edge.ee.ids)] - ## loose.zeros = rbind( - ## vars[type == "loose.in.indicator" & sign(snode.id) == 1 & ee.id %in% edge.ee.ids, - ## .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))], - ## vars[type == "loose.out.indicator" & sign(snode.id) == 1 & ee.id %in% edge.ee.ids, - ## .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))] - ## ) + ## loose.zeros = rbind( + ## vars[type == "loose.in.indicator" & sign(snode.id) == 1 & ee.id %in% edge.ee.ids, + ## .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))], + ## vars[type == "loose.out.indicator" & sign(snode.id) == 1 & ee.id %in% edge.ee.ids, + ## .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))] + ## ) - ## loose.zeros.rhs = unique(loose.zeros[, .(cid, value = 0, sense = "E")], by = "cid") + ## loose.zeros.rhs = unique(loose.zeros[, .(cid, value = 0, sense = "E")], by = "cid") - ## constraints = rbind(constraints, loose.zeros, fill = TRUE) - ## b = rbind(b, loose.zeros.rhs, fill = TRUE) - } + ## constraints = rbind(constraints, loose.zeros, fill = TRUE) + ## b = rbind(b, loose.zeros.rhs, fill = TRUE) + } - if (phased) { - ## homologous extremity exclusivity - ## this is actually redundant with previous constraints + if (phased) { + ## homologous extremity exclusivity + ## this is actually redundant with previous constraints + + if (ism) { loose.constraints = rbind( vars[type == "loose.in.indicator" & sign(snode.id)==1 & telomeric == FALSE, .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id))], @@ -977,26 +1065,28 @@ balance = function(gg, } ## grab node ids associated with ALT edges on the left - left.og.node.ids = c(gg$edges$dt[n1.side == "left" & type == "ALT", n1], - gg$edges$dt[n2.side == "left" & type == "ALT", n2]) - right.og.node.ids = c(gg$edges$dt[n1.side == "right" & type == "ALT", n1], - gg$edges$dt[n2.side == "right" & type == "ALT", n2]) - - ## fix loose ends for these nodes to zero - vars[type == "loose.in.indicator" & (snode.id %in% left.og.node.ids), - ":="(lb = 0, ub = 0)] - vars[type == "loose.out.indicator" & (snode.id %in% right.og.node.ids), - ":="(lb = 0, ub = 0)] - vars[type == "loose.in" & (snode.id %in% left.og.node.ids), - ":="(lb = 0, ub = 0)] - vars[type == "loose.out" & (snode.id %in% right.og.node.ids), - ":="(lb = 0, ub = 0)] - - if (verbose) { - message("Number of homologous loose ends: ", - length(left.og.node.ids) + length(right.og.node.ids)) - } - + ## left.og.node.ids = c(gg$edges$dt[n1.side == "left" & type == "ALT", n1], + ## gg$edges$dt[n2.side == "left" & type == "ALT", n2]) + ## right.og.node.ids = c(gg$edges$dt[n1.side == "right" & type == "ALT", n1], + ## gg$edges$dt[n2.side == "right" & type == "ALT", n2]) + + ## ## fix loose ends for these nodes to zero + ## #' zchoo Thursday, Sep 02, 2021 11:13:44 AM + ## #' removed these constraints to see if feasibility improves + ## vars[type == "loose.in.indicator" & (snode.id %in% left.og.node.ids), + ## ":="(lb = 0, ub = 0)] + ## vars[type == "loose.out.indicator" & (snode.id %in% right.og.node.ids), + ## ":="(lb = 0, ub = 0)] + ## vars[type == "loose.in" & (snode.id %in% left.og.node.ids), + ## ":="(lb = 0, ub = 0)] + ## vars[type == "loose.out" & (snode.id %in% right.og.node.ids), + ## ":="(lb = 0, ub = 0)] + + ## if (verbose) { + ## message("Number of homologous loose ends: ", + ## length(left.og.node.ids) + length(right.og.node.ids)) + ## } + ## reciprocal homologous extremity exclusivity ## implement configuration indicators (OR constraint) config.dt = vars[type == "straight.config" | type == "cross.config",] @@ -1066,7 +1156,7 @@ balance = function(gg, .(value = 1, id, cid = paste("rhee", s))], vars[type == "loose.out.indicator" & snode.id > 0 & !is.na(c) & telomeric == FALSE, .(value = 1, id, cid = paste("rhee", c))] - ) + ) ## filter constraints to only include things with >= 4 entries (e.g. must have an ALT edge) ## rhomol.constraints[, n.entries := .N, by = cid] @@ -1083,9 +1173,6 @@ balance = function(gg, b = rbind(b, rhs, fill = TRUE) } - } - - if (phased) { ## add the edge indicator sum constraints (ISM consistency) iconstraints = unique( @@ -1127,7 +1214,7 @@ balance = function(gg, b = rbind(b, rhs, fill = TRUE) } - + ## force nonzero CN for ALT edges (because these have nonzero CN in original JaBbA output) ## can become infeasible if original graph is not compatible with ISM @@ -1159,39 +1246,6 @@ balance = function(gg, b = rbind(b, edge.indicator.b, fill = TRUE) } - ## REF edge configuration constraint (added by default basically) - ## only add this if there are no unphased nodes - if (cnloh) { - - ## if allow CNLOH, the sum of edge indicators corresponding with og edge id is LEQ 2 - ## this is only allowed in constant CN regions and if breakpoint is not shared with any ALT edges - - ## penalize CNLOH edges - - if (!is.null(gg$edges$dt$cnloh)) { - cnloh.edges = gg$edges$dt[cnloh == TRUE & type == "ALT", edge.id] %>% unique - if (verbose) { - message("Number of marked CNLOH edges: ", length(cnloh.edges)) - } - - ## add CNLOH annotation to variables - ## browser() - vars[, cnloh := FALSE] - vars[(type == "edge.indicator" | type == "edge" | type == "eresidual") & - ref.or.alt == "ALT" & (abs(sedge.id) %in% cnloh.edges), - ":="(cnloh = TRUE)] - - } else { - warning("CNLOH not specified on edges. Disallowing!") - cnloh.og.edges = c() - vars[, cnloh := FALSE] - } - } else { - - cnloh.og.edges = c() - vars[, cnloh := FALSE] - - } ## add CNLOH constraints for applicable edges @@ -1215,8 +1269,8 @@ balance = function(gg, ## b = rbind(b, rhs, fill = TRUE) ## add ISM constraints for ALL REF edges (as CNLOH is now marked as ALT) - #' zchoo Wednesday, Sep 01, 2021 10:27:47 AM - ## this set of constraints is no longer necessary as node haplotypes have been added + ## this should be no longer needed!! + ## this introduces redundant constraints for REF edges ## iconstraints.from = unique( ## vars[type == "edge.indicator" & ref.or.alt == "REF", ##& !(og.edge.id %in% cnloh.og.edges), @@ -1253,144 +1307,143 @@ balance = function(gg, ## ## add to b ## b = rbind(b, edge.indicator.b, fill = TRUE) - } - if (L0) ## add "big M" constraints - { - ## indicator constraints ie on ulids - iconstraints = rbind( - vars[type == 'loose.out', .(value = 1, id, ulid, cid = paste('loose.out.indicator.ub', ulid))], - vars[type == 'loose.in', .(value = 1, id, ulid, cid = paste('loose.in.indicator.ub', ulid))], - fill = TRUE) - - ## add the matching indicator variables, matching to the cid from above - iconstraints = rbind( - iconstraints, - vars[type %in% c('loose.out.indicator', 'loose.in.indicator'), ][ - match(iconstraints$ulid, ulid), .(value = -M, id, cid = iconstraints$cid)], - fill = TRUE) - - ## upper bounds "infinity" ie M if indicator positive, 0 otherwise - constraints = rbind( - constraints, - iconstraints, - fill = TRUE) + if (L0) ## add "big M" constraints + { + ## indicator constraints ie on ulids + iconstraints = rbind( + vars[type == 'loose.out', .(value = 1, id, ulid, cid = paste('loose.out.indicator.ub', ulid))], + vars[type == 'loose.in', .(value = 1, id, ulid, cid = paste('loose.in.indicator.ub', ulid))], + fill = TRUE) - ## upper bound sense is 'L' i.e. less than because -M on left hand side - b = rbind(b, - vars[type == 'loose.in', .(value = 0, sense = 'L', cid = paste('loose.in.indicator.ub', ulid))], - vars[type == 'loose.out', .(value = 0, sense = 'L', cid = paste('loose.out.indicator.ub', ulid))], - fill = TRUE) + ## add the matching indicator variables, matching to the cid from above + iconstraints = rbind( + iconstraints, + vars[type %in% c('loose.out.indicator', 'loose.in.indicator'), ][ + match(iconstraints$ulid, ulid), .(value = -M, id, cid = iconstraints$cid)], + fill = TRUE) + + ## upper bounds "infinity" ie M if indicator positive, 0 otherwise + constraints = rbind( + constraints, + iconstraints, + fill = TRUE) - ## lower bound 0.1 if indicator positive, 0 otherwise - iconstraints = rbind( - vars[type == 'loose.out', .(value = 1, id, ulid, cid = paste('loose.out.indicator.lb', ulid))], - vars[type == 'loose.in', .(value = 1, id, ulid, cid = paste('loose.in.indicator.lb', ulid))], - fill = TRUE) - - ## add the matching indicator variables, matching to the cid from above - iconstraints = rbind( - iconstraints, - vars[type %in% c('loose.out.indicator', 'loose.in.indicator'), ][ - match(iconstraints$ulid, ulid), .(value = -.1, id, cid = iconstraints$cid)], - fill = TRUE) - - ## upper bounds "infinity" ie M if indicator positive, 0 otherwise - constraints = rbind( - constraints, - iconstraints, - fill = TRUE) + ## upper bound sense is 'L' i.e. less than because -M on left hand side + b = rbind(b, + vars[type == 'loose.in', .(value = 0, sense = 'L', cid = paste('loose.in.indicator.ub', ulid))], + vars[type == 'loose.out', .(value = 0, sense = 'L', cid = paste('loose.out.indicator.ub', ulid))], + fill = TRUE) - ## lower bound sense is 'G' i.e. greater than because -M on left hand side - b = rbind(b, - vars[type == 'loose.in', .(value = 0, sense = 'G', cid = paste('loose.in.indicator.lb', ulid))], - vars[type == 'loose.out', .(value = 0, sense = 'G', cid = paste('loose.out.indicator.lb', ulid))], - fill = TRUE) + ## lower bound 0.1 if indicator positive, 0 otherwise + iconstraints = rbind( + vars[type == 'loose.out', .(value = 1, id, ulid, cid = paste('loose.out.indicator.lb', ulid))], + vars[type == 'loose.in', .(value = 1, id, ulid, cid = paste('loose.in.indicator.lb', ulid))], + fill = TRUE) + + ## add the matching indicator variables, matching to the cid from above + iconstraints = rbind( + iconstraints, + vars[type %in% c('loose.out.indicator', 'loose.in.indicator'), ][ + match(iconstraints$ulid, ulid), .(value = -.1, id, cid = iconstraints$cid)], + fill = TRUE) - if (loose.collapse) - { - ################## - ## loose indicator sum = sum of indicators - ################## - iconstraints = rbind( - vars[type == 'loose.out.indicator', .(value = 1, id, lid, cid = paste('loose.out.indicator.sum', lid))], - vars[type == 'loose.in.indicator', .(value = 1, id, lid, cid = paste('loose.in.indicator.sum', lid))], - fill = TRUE) - - ## indicator sum is the sum of all indicators mapping to that loose end - iconstraints = rbind( - iconstraints, - unique(vars[type %in% c('loose.out.indicator.sum', 'loose.in.indicator.sum'), ][ - match(iconstraints$lid, lid), .(value = -1, id, lid, cid = iconstraints$cid)], by = 'lid'), - fill = TRUE) - - constraints = rbind( - constraints, - iconstraints, - fill = TRUE) - - b = rbind(b, - vars[type == 'loose.in.indicator.sum', .(value = 0, sense = 'E', cid = paste('loose.in.indicator.sum', lid))], - vars[type == 'loose.out.indicator.sum', .(value = 0, sense = 'E', cid = paste('loose.out.indicator.sum', lid))], + ## upper bounds "infinity" ie M if indicator positive, 0 otherwise + constraints = rbind( + constraints, + iconstraints, + fill = TRUE) + + ## lower bound sense is 'G' i.e. greater than because -M on left hand side + b = rbind(b, + vars[type == 'loose.in', .(value = 0, sense = 'G', cid = paste('loose.in.indicator.lb', ulid))], + vars[type == 'loose.out', .(value = 0, sense = 'G', cid = paste('loose.out.indicator.lb', ulid))], + fill = TRUE) + + if (loose.collapse) + { +################## + ## loose indicator sum = sum of indicators +################## + iconstraints = rbind( + vars[type == 'loose.out.indicator', .(value = 1, id, lid, cid = paste('loose.out.indicator.sum', lid))], + vars[type == 'loose.in.indicator', .(value = 1, id, lid, cid = paste('loose.in.indicator.sum', lid))], fill = TRUE) - - ################## - ## now we make new indicator variables on the sum of the individual loose end indicators - ## upper bound bound 0.1 if indicator positive, 0 otherwise - ################## - - iconstraints = rbind( - vars[type == 'loose.out.indicator.sum', .(value = 1, id, lid, cid = paste('loose.out.indicator.sum.indicator.ub', lid))], - vars[type == 'loose.in.indicator.sum', .(value = 1, id, lid, cid = paste('loose.in.indicator.sum.indicator.ub', lid))], - fill = TRUE) + + ## indicator sum is the sum of all indicators mapping to that loose end + iconstraints = rbind( + iconstraints, + unique(vars[type %in% c('loose.out.indicator.sum', 'loose.in.indicator.sum'), ][ + match(iconstraints$lid, lid), .(value = -1, id, lid, cid = iconstraints$cid)], by = 'lid'), + fill = TRUE) + + constraints = rbind( + constraints, + iconstraints, + fill = TRUE) + + b = rbind(b, + vars[type == 'loose.in.indicator.sum', .(value = 0, sense = 'E', cid = paste('loose.in.indicator.sum', lid))], + vars[type == 'loose.out.indicator.sum', .(value = 0, sense = 'E', cid = paste('loose.out.indicator.sum', lid))], + fill = TRUE) + +################## + ## now we make new indicator variables on the sum of the individual loose end indicators + ## upper bound bound 0.1 if indicator positive, 0 otherwise +################## - ## add the matching indicator variables, matching to the cid from above - iconstraints = rbind( - iconstraints, - vars[type %in% c('loose.out.indicator.sum.indicator', 'loose.in.indicator.sum.indicator'), ][ - match(iconstraints$lid, lid), .(value = -M, id, lid, cid = iconstraints$cid)], - fill = TRUE) - - ## upper bounds "infinity" ie M if indicator positive, 0 otherwise - constraints = rbind( - constraints, - iconstraints, - fill = TRUE) + iconstraints = rbind( + vars[type == 'loose.out.indicator.sum', .(value = 1, id, lid, cid = paste('loose.out.indicator.sum.indicator.ub', lid))], + vars[type == 'loose.in.indicator.sum', .(value = 1, id, lid, cid = paste('loose.in.indicator.sum.indicator.ub', lid))], + fill = TRUE) - ## upper bound sense is 'L' i.e. less than because -M on left hand side - b = rbind(b, - vars[type == 'loose.in.indicator.sum', .(value = 0, sense = 'L', cid = paste('loose.in.indicator.sum.indicator.ub', lid))], - vars[type == 'loose.out.indicator.sum', .(value = 0, sense = 'L', cid = paste('loose.out.indicator.sum.indicator.ub', lid))], + ## add the matching indicator variables, matching to the cid from above + iconstraints = rbind( + iconstraints, + vars[type %in% c('loose.out.indicator.sum.indicator', 'loose.in.indicator.sum.indicator'), ][ + match(iconstraints$lid, lid), .(value = -M, id, lid, cid = iconstraints$cid)], + fill = TRUE) + + ## upper bounds "infinity" ie M if indicator positive, 0 otherwise + constraints = rbind( + constraints, + iconstraints, fill = TRUE) - ## lower bound 0.1 if indicator positive, 0 otherwise - iconstraints = rbind( - vars[type == 'loose.out.indicator.sum', .(value = 1, id, lid, cid = paste('loose.out.indicator.sum.indicator.lb', lid))], - vars[type == 'loose.in.indicator.sum', .(value = 1, id, lid, cid = paste('loose.in.indicator.sum.indicator.lb', lid))], - fill = TRUE) - - ## add the matching indicator variables, matching to the cid from above - iconstraints = rbind( - iconstraints, - vars[type %in% c('loose.out.indicator.sum', 'loose.in.indicator.sum'), ][ - match(iconstraints$lid, lid), .(value = -.1, id, lid, cid = iconstraints$cid)], - fill = TRUE) + ## upper bound sense is 'L' i.e. less than because -M on left hand side + b = rbind(b, + vars[type == 'loose.in.indicator.sum', .(value = 0, sense = 'L', cid = paste('loose.in.indicator.sum.indicator.ub', lid))], + vars[type == 'loose.out.indicator.sum', .(value = 0, sense = 'L', cid = paste('loose.out.indicator.sum.indicator.ub', lid))], + fill = TRUE) - ## upper bounds "infinity" ie M if indicator positive, 0 otherwise - constraints = rbind( - constraints, - iconstraints, - fill = TRUE) + ## lower bound 0.1 if indicator positive, 0 otherwise + iconstraints = rbind( + vars[type == 'loose.out.indicator.sum', .(value = 1, id, lid, cid = paste('loose.out.indicator.sum.indicator.lb', lid))], + vars[type == 'loose.in.indicator.sum', .(value = 1, id, lid, cid = paste('loose.in.indicator.sum.indicator.lb', lid))], + fill = TRUE) + + ## add the matching indicator variables, matching to the cid from above + iconstraints = rbind( + iconstraints, + vars[type %in% c('loose.out.indicator.sum', 'loose.in.indicator.sum'), ][ + match(iconstraints$lid, lid), .(value = -.1, id, lid, cid = iconstraints$cid)], + fill = TRUE) - ## lower bound sense is 'G' i.e. greater than because -M on left hand side - b = rbind(b, - vars[type == 'loose.in.indicator.sum', .(value = 0, sense = 'G', cid = paste('loose.in.indicator.sum.indicator.lb', lid))], - vars[type == 'loose.out.indicator.sum', .(value = 0, sense = 'G', cid = paste('loose.out.indicator.sum.indicator.lb', lid))], + ## upper bounds "infinity" ie M if indicator positive, 0 otherwise + constraints = rbind( + constraints, + iconstraints, fill = TRUE) - } - } + ## lower bound sense is 'G' i.e. greater than because -M on left hand side + b = rbind(b, + vars[type == 'loose.in.indicator.sum', .(value = 0, sense = 'G', cid = paste('loose.in.indicator.sum.indicator.lb', lid))], + vars[type == 'loose.out.indicator.sum', .(value = 0, sense = 'G', cid = paste('loose.out.indicator.sum.indicator.lb', lid))], + fill = TRUE) + + } + } if (!is.null(marginal) && length(dmarginal)) @@ -1432,12 +1485,12 @@ balance = function(gg, b = rbind(emb, b, fill = TRUE) } - ######## - ## MAKE MATRICES - ######## +######## + ## MAKE MATRICES +######## - ## now Rcplex time - ## remove any rows with b = NA + ## now Rcplex time + ## remove any rows with b = NA ## get rid of any constraints with NA values keep.constraints = intersect(b[!is.na(value), cid], constraints[!is.na(value), cid]) @@ -1464,50 +1517,50 @@ balance = function(gg, Amat = sparseMatrix(constraints$cid, constraints$id, x = constraints$value, dims = c(length(ucid), nrow(vars))) vars[is.na(weight), weight := 0] - if (verbose) { + if (verbose) { - message("bvec length: ", length(bvec)) - message("Amat nrow: ", nrow(Amat)) + message("bvec length: ", length(bvec)) + message("Amat nrow: ", nrow(Amat)) - } - if (any(ix <- is.infinite(vars$weight))) - { - warning('nodes with infinite weight, setting to 0, please check inputs') - vars[ix, weight := 0] - } - Qmat = vars[, weight * (type %in% c('nresidual', 'eresidual', 'mresidual'))] %>% as.numeric %>% Diagonal(x = .) %>% as('CsparseMatrix') + } + if (any(ix <- is.infinite(vars$weight))) + { + warning('nodes with infinite weight, setting to 0, please check inputs') + vars[ix, weight := 0] + } + Qmat = vars[, weight * (type %in% c('nresidual', 'eresidual', 'mresidual'))] %>% as.numeric %>% Diagonal(x = .) %>% as('CsparseMatrix') - ## set lambda to 0 at terminal or other non NA nodes - vars[is.na(lambda), lambda := 0] + ## set lambda to 0 at terminal or other non NA nodes + vars[is.na(lambda), lambda := 0] - - ## set cvec by multiplying global lambda by local lambda for non-terminal loose end - ## vars (or their indicators if L0 is TRUE) - if (L0) + + ## set cvec by multiplying global lambda by local lambda for non-terminal loose end + ## vars (or their indicators if L0 is TRUE) + if (L0) { - if (loose.collapse) - { - cvec = lambda*(vars[, lambda*(type %in% c('loose.in.indicator.sum.indicator', 'loose.out.indicator.sum.indicator') & !terminal)] %>% as.numeric) - ## cvec = lambda*(vars[, lambda*(type %in% c('loose.in.indicator.sum.indicator', 'loose.out.indicator.sum.indicator', 'loose.in.indicator', 'loose.out.indicator') & !terminal)] %>% as.numeric) + if (loose.collapse) + { + cvec = lambda*(vars[, lambda*(type %in% c('loose.in.indicator.sum.indicator', 'loose.out.indicator.sum.indicator') & !terminal)] %>% as.numeric) + ## cvec = lambda*(vars[, lambda*(type %in% c('loose.in.indicator.sum.indicator', 'loose.out.indicator.sum.indicator', 'loose.in.indicator', 'loose.out.indicator') & !terminal)] %>% as.numeric) } - else + else { - cvec = lambda*(vars[, lambda * (type %in% c('loose.in.indicator', 'loose.out.indicator') & !terminal)] %>% as.numeric) + cvec = lambda*(vars[, lambda * (type %in% c('loose.in.indicator', 'loose.out.indicator') & !terminal)] %>% as.numeric) } } else { - cvec = lambda*(vars[, lambda*(type %in% c('loose.in', 'loose.out') & !terminal)] %>% as.numeric) - } + cvec = lambda*(vars[, lambda*(type %in% c('loose.in', 'loose.out') & !terminal)] %>% as.numeric) + } - ## message("CVEC: ", length(cvec)) + ## message("CVEC: ", length(cvec)) - ## implement reward if provided - if (length(ix <- which(vars$reward!=0))) - { - if (verbose) - message('Applying reward') - cvec[ix] = -vars$reward[ix] + ## implement reward if provided + if (length(ix <- which(vars$reward!=0))) + { + if (verbose) + message('Applying reward') + cvec[ix] = -vars$reward[ix] - } + } if (lp) { ## add weights of stuff @@ -1534,12 +1587,12 @@ balance = function(gg, ## browser() ## vars[type == "edge.indicator" & cnloh == TRUE] ## vars[type == "edge.indicator" & cnloh == TRUE, .N, by = og.edge.id] - + - lb = vars$lb - ub = vars$ub + lb = vars$lb + ub = vars$ub - control = list(trace = ifelse(verbose>=2, 1, 0), tilim = tilim, epgap = epgap, round = 1, trelim = trelim, nodefileind = nodefileind) + control = list(trace = ifelse(verbose>=2, 1, 0), tilim = tilim, epgap = epgap, round = 1, trelim = trelim, nodefileind = nodefileind) ## call our wrapper for CPLEX sol = Rcplex2(cvec, @@ -1554,82 +1607,92 @@ balance = function(gg, control = control, tuning = FALSE) - vars$cvec = cvec - vars$x = sol$x + vars$cvec = cvec + vars$x = sol$x - ## for debugging - ppc = function(x) (x %>% merge.data.table(vars, by = 'id') %>% merge.data.table(b, by = 'cid.char'))[, paste(paste(round(value.x, 1), '*', paste(type, gid, sep= '_'), '(', signif(x, 2), ')', collapse = ' + '), ifelse(sense[1] == 'E', '=', ifelse(sense[1] == 'G', '>=', '<=')), round(value.y[1],2)), by = cid.char] - - ppv = function(x) {tmp = x %>% merge.data.table(constraints, by = 'id'); constraints[cid %in% tmp$cid, ] %>% ppc} + ## for debugging + ppc = function(x) (x %>% merge.data.table(vars, by = 'id') %>% merge.data.table(b, by = 'cid.char'))[, paste(paste(round(value.x, 1), '*', paste(type, gid, sep= '_'), '(', signif(x, 2), ')', collapse = ' + '), ifelse(sense[1] == 'E', '=', ifelse(sense[1] == 'G', '>=', '<=')), round(value.y[1],2)), by = cid.char] + + ppv = function(x) {tmp = x %>% merge.data.table(constraints, by = 'id'); constraints[cid %in% tmp$cid, ] %>% ppc} - .check = function(x) data.table(obs = sign(as.numeric(round(Amat %*% x - bvec))), - sense) - chk = .check(sol$x) + .check = function(x) data.table(obs = sign(as.numeric(round(Amat %*% x - bvec))), + sense) + chk = .check(sol$x) - if (any(is.na(sol$x))) - stop('Rcplex did not converge or failed to find a solution, please run with verbose = 2 to get more detailed output') + if (any(is.na(sol$x))) + stop('Rcplex did not converge or failed to find a solution, please run with verbose = 2 to get more detailed output') - if (chk[sense == 'E', any(obs != 0, na.rm = TRUE)] | - chk[sense == 'G', any(obs < 0, na.rm = TRUE)] | - chk[sense == 'L', any(obs > 0, na.rm = TRUE)]) - stop('Constraint violation likely due to M parameter being too large for problem causing CPLEX numerical instability, consider lowering M parameter') + if (chk[sense == 'E', any(obs != 0, na.rm = TRUE)] | + chk[sense == 'G', any(obs < 0, na.rm = TRUE)] | + chk[sense == 'L', any(obs > 0, na.rm = TRUE)]) + stop('Constraint violation likely due to M parameter being too large for problem causing CPLEX numerical instability, consider lowering M parameter') - ##.obj = function(x) 0.5 * rbind(x) %*% Qmat %*% cbind(x) + cvec %*% x + ##.obj = function(x) 0.5 * rbind(x) %*% Qmat %*% cbind(x) + cvec %*% x - - ## update graph - nmark = vars[type == 'node', .(nid = abs(snode.id), cn = round(x))] - emark = vars[type == 'edge', .(eid = abs(sedge.id), cn = round(x))] + + ## update graph + nmark = vars[type == 'node', .(nid = abs(snode.id), cn = round(x))] + emark = vars[type == 'edge', .(eid = abs(sedge.id), cn = round(x))] - loosei = vars[type == 'loose.in' & snode.id>0, .(cn = round(x)), keyby = snode.id] - looseo = vars[type == 'loose.out' & snode.id>0, .(cn = round(x)), keyby = snode.id] + ## add haplotype label if available + if (phased) { + ## browser() + tmp = vars[type == "haplotype", .(sum = sum(x)), by = og.node.id] + ## vars[type == "haplotype", .(sum = sum(x)), by = og.node.id][sum != 1, og.node.id] + ## vars[og.node.id == 1 & type == "haplotype"] + haplotypes = vars[type == "haplotype", .(nid = abs(snode.id), haplotype = ifelse(x == 1, "h1", "h2"))] + gg$nodes[haplotypes$nid]$mark(haplotype = haplotypes$haplotype) + } - nodes = gg$nodes[loosei$snode.id] ## need to do this to use nodes active binding settings - nodes$loose.left = loosei$cn>0 + loosei = vars[type == 'loose.in' & snode.id>0, .(cn = round(x)), keyby = snode.id] + looseo = vars[type == 'loose.out' & snode.id>0, .(cn = round(x)), keyby = snode.id] - nodes = gg$nodes[looseo$snode.id] ## need to do this to use nodes active binding settings - nodes$loose.right = looseo$cn>0 + nodes = gg$nodes[loosei$snode.id] ## need to do this to use nodes active binding settings + nodes$loose.left = loosei$cn>0 - gg$nodes$mark(loose.cn.left = 0, loose.cn.right = 0) - gg$nodes[loosei$snode.id]$mark(loose.cn.left = loosei$cn) - gg$nodes[looseo$snode.id]$mark(loose.cn.right = looseo$cn) + nodes = gg$nodes[looseo$snode.id] ## need to do this to use nodes active binding settings + nodes$loose.right = looseo$cn>0 - ## cache old cn values - gg$nodes$mark(cn.old = gg$nodes$dt$cn) - gg$edges$mark(cn.old = gg$edges$dt$cn) - gg$nodes$mark(cn = NULL) ## reset to avoid weird type casting issue - gg$edges$mark(cn = NULL) ## reset to avoid weird type casting issue - gg$nodes[nmark$nid]$mark(cn = nmark$cn) - gg$edges[emark$eid]$mark(cn = emark$cn) - gg$set(y.field = 'cn') + gg$nodes$mark(loose.cn.left = 0, loose.cn.right = 0) + gg$nodes[loosei$snode.id]$mark(loose.cn.left = loosei$cn) + gg$nodes[looseo$snode.id]$mark(loose.cn.right = looseo$cn) - gg$set(obj = sol$obj) + ## cache old cn values + gg$nodes$mark(cn.old = gg$nodes$dt$cn) + gg$edges$mark(cn.old = gg$edges$dt$cn) + gg$nodes$mark(cn = NULL) ## reset to avoid weird type casting issue + gg$edges$mark(cn = NULL) ## reset to avoid weird type casting issue + gg$nodes[nmark$nid]$mark(cn = nmark$cn) + gg$edges[emark$eid]$mark(cn = emark$cn) + gg$set(y.field = 'cn') -## fix loose ends - nodes = gg$nodes - nodes$loose.left = nodes$dt$loose.cn.left>0 - nodes$loose.right = nodes$dt$loose.cn.right>0 + gg$set(obj = sol$obj) - ## if phased, mark edges with different colors to make it easier to visualize - if (phased) { - if (verbose) { - message("formatting phased graph...") + ## fix loose ends + nodes = gg$nodes + nodes$loose.left = nodes$dt$loose.cn.left>0 + nodes$loose.right = nodes$dt$loose.cn.right>0 + + ## if phased, mark edges with different colors to make it easier to visualize + if (phased) { + if (verbose) { + message("formatting phased graph...") + } + ## edge formatting + ref.edge.col = alpha("blue", 0.2) + alt.edge.col = alpha("red", 0.4) + ref.edge.lwd = 0.5 + alt.edge.lwd = 1.0 + edge.col = ifelse(gg$edges$dt$type == "REF", ref.edge.col, alt.edge.col) + edge.lwd = ifelse(gg$edges$dt$type == "REF", ref.edge.lwd, alt.edge.lwd) + gg$edges$mark(col = edge.col, lwd = edge.lwd) + + ## mark zero cn edges + zero.cn.col = alpha("gray", 0) + zero.cn.lwd = 0.5 + zero.cn.edges = which(gg$edges$dt$cn == 0) + gg$edges[zero.cn.edges]$mark(col = zero.cn.col, lwd = zero.cn.lwd) } - ## edge formatting - ref.edge.col = alpha("blue", 0.2) - alt.edge.col = alpha("red", 0.4) - ref.edge.lwd = 0.5 - alt.edge.lwd = 1.0 - edge.col = ifelse(gg$edges$dt$type == "REF", ref.edge.col, alt.edge.col) - edge.lwd = ifelse(gg$edges$dt$type == "REF", ref.edge.lwd, alt.edge.lwd) - gg$edges$mark(col = edge.col, lwd = edge.lwd) - - ## mark zero cn edges - zero.cn.col = alpha("gray", 0) - zero.cn.lwd = 0.5 - zero.cn.edges = which(gg$edges$dt$cn == 0) - gg$edges[zero.cn.edges]$mark(col = zero.cn.col, lwd = zero.cn.lwd) - } if (debug) { return(list(gg = gg, sol = sol)) } @@ -2563,7 +2626,6 @@ find_na_ranges = function(gg, min.bins = 1, min.width = 5e3, verbose = FALSE) { ## reduce NA nodes na.nodes.gr = gg.nodes.gr %Q% (na.node == TRUE) - ## browser() ## if there are magically not any of these, return if (length(na.nodes.gr) == 0) { @@ -2572,12 +2634,6 @@ find_na_ranges = function(gg, min.bins = 1, min.width = 5e3, verbose = FALSE) { na.nodes.gr = gr.reduce(na.nodes.gr, by = "na.node") - ## na.nodes.gr = gr.val(na.nodes.gr, target = gg$nodes$gr[, "nbins"], - ## val = "nbins", - ## weighted = FALSE, - ## FUN = sum, - ## na.rm = TRUE) - ## only keep relatively large NA regions... for isolated small ROH we may not want to do this. na.nodes.gr = na.nodes.gr %Q% (width(na.nodes.gr) > min.width) @@ -2661,6 +2717,37 @@ find_na_ranges = function(gg, min.bins = 1, min.width = 5e3, verbose = FALSE) { return(dt2gr(na.nodes.dt[, .(seqnames, start, end)], seqlengths = seqlengths(gg$nodes$gr))) } +#' @name find_na_nodes_helper +#' @title find_na_nodes_helper +#' +#' @ description +#' +#' An extremely simple function that calls ROH based on gaps between hets +#' +#' @param gg (gGraph) +#' @param hets (GRanges) +#' @param min.width (numeric) minimum ROH width +#' @param phase.blocks (GRanges) NOT IMPLEMENTED YET +#' @param verbose (logical) default 1 +#' +#' @return GRanges representing nodes falling within ROH with width > min.width +find_na_nodes_helper = function(gg, hets, + min.width = 1e4, + phase.blocks = NULL, + verbose = TRUE) { + + ## get unique GRanges for hets + unique.hets = gr.stripstrand(unique(hets[, c()])) + all.gaps = gaps(unique.hets) + big.gaps = all.gaps %Q% (strand(all.gaps)=="*" & width(all.gaps) > min.width) + big.gaps$roh.id = 1:length(big.gaps) + start.gaps = gr.start(gg$nodes$gr[, c()]) %$% big.gapsn + end.gaps = gr.end(gg$nodes$gr[, c()]) %$% big.gaps + unphased.nodes = which(start.gaps$roh.id == end.gaps$roh.id) + unphased.gr = gg$nodes[unphased.nodes]$gr[, c()] + return(unphased.gr) +} + #' @name unphase_na_ranges #' @title unphase_na_ranges #' @@ -2800,6 +2887,7 @@ unphase_na_ranges = function(gg, min.bins = 1, min.width = 5e3, phase.blocks = N #' #' @param gg junction-balanced phased gGraph. each node must have metadata og.nodes.id, allele, nbins, cn #' @param min.bins (numeric) minimum number of bins to be marked as an NA node. default 1 +#' @param min.width (numeric) min width (bp) for a region to be considered a ROH #' @param phase.blocks (GRanges) granges of phase blocks from linked reads. default = NULL #' @param mc.cores (int) number of cores. default = 8. #' @param verbose (bool) verbose > 0 prints stuff. default 1. @@ -2807,7 +2895,11 @@ unphase_na_ranges = function(gg, min.bins = 1, min.width = 5e3, phase.blocks = N #' @return balanced gGraph with unphased nodes marked and compressed #' #' @export -phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores = 8, verbose = 1) +phased.postprocess = function(gg, + min.bins = 1, + min.width = 1e4, + phase.blocks = NULL, + mc.cores = 8, verbose = 1) { ## check that gg nodes and edges have og node if (!("og.node.id" %in% colnames(gg$nodes$dt)) | !("og.edge.id" %in% colnames(gg$edges$dt))) { @@ -2901,7 +2993,7 @@ phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores = } if (min.bins) { - na.gr = find_na_ranges(gg, min.bins = min.bins) + na.gr = find_na_ranges(gg, min.bins = min.bins, min.width = min.width) all.seed.gr = gr.reduce(c(seed.gr, na.gr)) } else { na.gr = GRanges() @@ -3112,7 +3204,7 @@ phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores = #' @param vbase.prop.thres (float) proportion of allele excess required to phase edges (default 0.9) #' @param min.bins (numeric) minimum number of bins for intra segment variance (default 3) #' @param min.var (numeric) min allowable variance (default 0.1) -#' @param max.span (numeric) max span before penalizing CNLOH +#' @param max.span (numeric) max span before penalizing CNLOH (NOT USED??) #' @param verbose (bool) default TRUE for debugging #' @param min.width (numeric) min allowable width for cnloh-adjacent node. default 1 Mbp #' @param mc.cores (int) number of cores @@ -3127,9 +3219,9 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL, edge.phase.dt = NULL, vbase.count.thres = 5, vbase.prop.thres = 0.9, min.bins = 3, min.var = 1e-3, - max.span = 1e6, + max.span = 0, min.width = 5e3, - fix.tiny = TRUE, + fix.tiny = FALSE, verbose = TRUE, mc.cores = 8) { if (verbose) { @@ -3236,7 +3328,7 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL, gg$edges$mark(cnloh = FALSE) gg$edges[internal.edges]$mark(cnloh = TRUE) - browser() + ## browser() if (verbose) { message("Number of internal edges marked in parent graph: ", length(internal.edges)) @@ -3245,59 +3337,59 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL, gg$edges$mark(cnloh = FALSE) } - if (verbose) { - message("Marking pseudo-CNLOH") - } - - ## pull intra-chromosomal ALT edges, because these are cnloh candidates - pseudo.cnloh.junctions = gg$junctions[class %in% c("DEL-like", "INV-like", "DUP-like")] - pseudo.cnloh.junctions.dt = pseudo.cnloh.junctions$dt - - if (nrow(pseudo.cnloh.junctions.dt)) { - - ## grab span - limit to below max.span - pseudo.cnloh.junctions.dt[, span := pseudo.cnloh.junctions$span] - - ## compute node overlap with shadow - ## node.overlap = pseudo.cnloh.junctions$shadow %N% gg$nodes$gr - ## pseudo.cnloh.junctions.dt[, node.overlap.count := node.overlap] - - ## mark candidates - ## pseudo.cnloh.edges = c(pseudo.cnloh.junctions.dt[span < max.span & - ## class == "DEL-like" & - ## node.overlap.count <= 3, edge.id], - ## pseudo.cnloh.junctions.dt[span < max.span & - ## class == "INV-like" & - ## node.overlap.count <= 2, edge.id], - ## pseudo.cnloh.junctions.dt[span < max.span & - ## class == "DUP-like" & - ## node.overlap.count <= 1, edge.id]) - - #' zchoo Friday, Jul 30, 2021 11:31:11 AM - ## changed this to include any junction with span under max.span - pseudo.cnloh.edges = c(pseudo.cnloh.junctions.dt[span < max.span & - class == "DEL-like", edge.id], - pseudo.cnloh.junctions.dt[span < max.span & - class == "INV-like", edge.id], - pseudo.cnloh.junctions.dt[span < max.span & - class == "DUP-like", edge.id]) + ## if (verbose) { + ## message("Marking pseudo-CNLOH") + ## } + + ## ## pull intra-chromosomal ALT edges, because these are cnloh candidates + ## pseudo.cnloh.junctions = gg$junctions[class %in% c("DEL-like", "INV-like", "DUP-like")] + ## pseudo.cnloh.junctions.dt = pseudo.cnloh.junctions$dt + + ## if (nrow(pseudo.cnloh.junctions.dt)) { + + ## ## grab span - limit to below max.span + ## pseudo.cnloh.junctions.dt[, span := pseudo.cnloh.junctions$span] + + ## ## compute node overlap with shadow + ## ## node.overlap = pseudo.cnloh.junctions$shadow %N% gg$nodes$gr + ## ## pseudo.cnloh.junctions.dt[, node.overlap.count := node.overlap] + + ## ## mark candidates + ## ## pseudo.cnloh.edges = c(pseudo.cnloh.junctions.dt[span < max.span & + ## ## class == "DEL-like" & + ## ## node.overlap.count <= 3, edge.id], + ## ## pseudo.cnloh.junctions.dt[span < max.span & + ## ## class == "INV-like" & + ## ## node.overlap.count <= 2, edge.id], + ## ## pseudo.cnloh.junctions.dt[span < max.span & + ## ## class == "DUP-like" & + ## ## node.overlap.count <= 1, edge.id]) + + ## #' zchoo Friday, Jul 30, 2021 11:31:11 AM + ## ## changed this to include any junction with span under max.span + ## pseudo.cnloh.edges = c(pseudo.cnloh.junctions.dt[span < max.span & + ## class == "DEL-like", edge.id], + ## pseudo.cnloh.junctions.dt[span < max.span & + ## class == "INV-like", edge.id], + ## pseudo.cnloh.junctions.dt[span < max.span & + ## class == "DUP-like", edge.id]) - ## pseudo.cnloh.edges = pseudo.cnloh[span < max.span & type == "ALT", edge.id] - gg$edges[pseudo.cnloh.edges]$mark(cnloh = TRUE) + ## ## pseudo.cnloh.edges = pseudo.cnloh[span < max.span & type == "ALT", edge.id] + ## gg$edges[pseudo.cnloh.edges]$mark(cnloh = TRUE) - if (verbose) { - message("Number of pseudo-CNLOH edges marked:", length(pseudo.cnloh.edges)) - } + ## if (verbose) { + ## message("Number of pseudo-CNLOH edges marked:", length(pseudo.cnloh.edges)) + ## } - } else { + ## } else { - gg$edges$mark(cnloh = FALSE) + ## gg$edges$mark(cnloh = FALSE) - if (verbose) { - message("No pseudo-CNLOH edges detected.") - } + ## if (verbose) { + ## message("No pseudo-CNLOH edges detected.") + ## } - } + ## } @@ -3390,6 +3482,9 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL, dt = ov[, .(mean = mean(cn, na.rm = T), var = var(cn, na.rm = T), nbins = .N), by = query.id] ov.match = match(1:length(phased.gg.nodes), dt$query.id) + ## make sure that the variance is at least as large as the mean + dt[var < mean, var := mean] + ## add information to nodes phased.gg.nodes$cn = dt$mean[ov.match] phased.gg.nodes$var = dt$var[ov.match] @@ -3402,6 +3497,14 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL, (phased.gg.nodes$nbins / (2 * sqrt(phased.gg.nodes$var))), NA) + ## unset weights if far from marginal + marginal.dt = as.data.table(phased.gg.nodes)[, .(marginal.cn, og.node.id, cn, allele)] + marginal.dt[, delta.estimate := marginal.cn - sum(.SD$cn), by = og.node.id] + bad.major.nodes = which(marginal.dt[, delta.estimate] > 5 & marginal.dt[, allele == "major"]) + ## browser() + phased.gg.nodes$weight[bad.major.nodes] = 0 + phased.gg.nodes$cn[bad.major.nodes] = NA + if (verbose) { message("Preparing phased gGraph edges") } @@ -3545,14 +3648,6 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL, message("Fixing very tiny nodes") } - ## browser() - ## tiny.nodes = phased.gg$nodes$dt[width < min.width, node.id] - ## tiny.edges = phased.gg$edges$dt[(n1 %in% tiny.nodes | n2 %in% tiny.nodes) & - ## type == "REF" & - ## connection == "cross", edge.id] - - ## phased.gg$edges[tiny.edges]$mark(ub = 0, lb = 0) - tmp = phased.gg$junctions$dt[, .(edge.id, type, class, connection)] tmp[, span := phased.gg$junctions$span] @@ -3583,6 +3678,9 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL, phased.gg$nodes$mark(col = ifelse(phased.gg$nodes$dt$allele == "major", major.node.col, minor.node.col), ywid = 0.8) + ## mark edge span + phased.gg$edges$mark(span = phased.gg$junctions$span) + return(phased.gg) } From bbc6ffdc224fb7cf9e877a735f45f946d11e7e7d Mon Sep 17 00:00:00 2001 From: Zi-Ning Choo Date: Tue, 22 Mar 2022 18:33:44 -0400 Subject: [PATCH 32/35] remove some merge conflict symbols --- R/apps.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/apps.R b/R/apps.R index 34c8faef..755cab5f 100644 --- a/R/apps.R +++ b/R/apps.R @@ -2175,7 +2175,6 @@ binstats = function(gg, bins, by = NULL, field = NULL, purity = gg$meta$purity, return(gg) } -<<<<<<< HEAD #' @name find_na_ranges #' @title find_na_ranges #' @@ -3077,7 +3076,7 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL, phased.gg.edges[n1.chr == n2.chr & n1.allele == n2.allele, connection := "straight"] phased.gg.edges[n1.chr == n2.chr & n1.allele != n2.allele, connection := "cross"] -<<<<<<< HEAD + ## add phase block information to edges (for linked reads) if (!is.null(phase.blocks)) { From 858c3a6f38d4aae5489825a17b184187d0dfdb05 Mon Sep 17 00:00:00 2001 From: Zi-Ning Choo Date: Thu, 21 Apr 2022 15:47:23 -0400 Subject: [PATCH 33/35] update binstats --- R/apps.R | 189 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 188 insertions(+), 1 deletion(-) diff --git a/R/apps.R b/R/apps.R index fb883010..7452473b 100644 --- a/R/apps.R +++ b/R/apps.R @@ -442,6 +442,14 @@ balance = function(gg, } } + ## add telomeric annotation + qtips = gr.end(si2gr(seqlengths(gg$nodes))) ## location of q arm tips + term.in = c(which(start(gg$nodes$gr) == 1), ## beginning of chromosome + -which(gg$nodes$gr %^% qtips)) ## flip side of chromosome end + term.out = -term.in ## out is reciprocal of in + + ## annotate loose indicators with this + vars[!is.na(snode.id), telomeric := ifelse(snode.id %in% term.in | snode.id %in% term.out, TRUE, FALSE)] if (phased) { ## add allele information and og.node.id @@ -2834,8 +2842,187 @@ phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores = #' Given a balanced unphased genome graph with field $cn populated with integer values #' Constructs a "melted" haplotype graph with fields $cn and $allele #' +#' In addition the following fields are populated in the node metadata +#' - cn +#' - allele +#' - weight +#' +#' And, the following fields are populated in the edge metadata: +#' - connection (either straight or cross, depending on whether the edge goes from major to minor +#' - n1.allele (either major or minor) +#' - n2.allele (either major or minor) +#' - n1.chr (n1 seqname) +#' - n2.chr (n2 seqname) +#' #' @param gg (gGraph) gGraph with field $cn populated with integer copy number -#' @param hets +#' @param bins (GRanges) with fields count.field and allele.field +#' @param purity (numeric) sample purity, default gg$meta$purity +#' @param ploidy (numeric) sample ploidy, default gg$meta$ploidy (not needed?) +#' @param allele.field (character) the field in bins giving major/minor allele, default $allele +#' @param count.field (character) the field in bins giving SNP read counts, default $count +#' @param verbose (logical) print stuff? default FALSE +#' +#' @return melted gGraph +phased.binstats = function(gg, bins, + purity = gg$meta$purity, + ploidy = gg$meta$ploidy, + count.field = "count", + allele.field = "allele", + verbose = FALSE) +{ + + ## helper function to compute ploidy at het SNPs + ## (might be slightly different from overall ploidy) + ## (due to uneven distribution of hets) + local.ploidy = function(gg, sites) { + sites = sites[, c()] %$% gg$nodes$gr[, "cn"] + return(mean(values(sites)[, "cn"], na.rm = TRUE)) + } + + ## transform relative CN (SNP counts) to absolute multiplicity + ## given purity and ploidy + reads.to.allele.cn = function(bins, count.field, purity, ploidy) { + y = values(bins)[[count.field]] + y.bar = 2 * mean(y, na.rm = TRUE) + + ## purity and ploidy + alpha = purity + tau = ploidy + + ## linear equation + denom = alpha * tau + 2 * (1 - alpha) + beta = (y.bar * alpha) / denom + gamma =(y.bar * (1 - alpha)) / denom + + cn = (y - gamma) / beta + return(cn) + } + + if (verbose) { + message("Checking inputs") + } + if (is.null(purity)) { + warning("Purity not provided, setting to 1.0") + purity = 1 + } + if (is.null(ploidy)) { + warning("Ploidy not provided, setting to 2.0") + ploidy = 2 + } + if (!inherits(bins, 'GRanges')) { + stop("bins must be GRanges") + } + if (!(count.field %in% names(values(bins)))) { + stop("count.field not found in bins metadata") + } + if (!(allele.field %in% names(values(bins)))) { + stop("allele.field not found in bins metadata") + } + allele.values = unique(values(bins)[[allele.field]]) + if (!("major" %in% allele.values) | !("minor" %in% allele.values)) { + stop("allele.field must contain labels 'major' and 'minor'") + } + + if (verbose) { message("Getting ploidy from gGraph") } + ploidy = local.ploidy(gg, bins %Q% (values(bins)[, allele.field] == "major")) + + if (verbose) { message("Performing allelic rel2abs transformation") } + allele.cn.bins = bins[, c()] + values(allele.cn.bins)[, "acn"] = reads.to.allele.cn(bins, count.field, purity, ploidy) + values(allele.cn.bins)[, "allele"] = values(bins)[, allele.field] + + ## STRETCH OUT THE BINS! + if (verbose) { message("Stretching out bins") } + allele.cn.bins = gr.stripstrand(allele.cn.bins) + bin.gaps = gaps(allele.cn.bins) + bin.gaps = bin.gaps %Q% (strand(bin.gaps) == "*") + bin.gaps = resize(bin.gaps, width = width(bin.gaps) + 1, fix = "start") + bin.gaps = gr.val(query = bin.gaps[, c()], + target = allele.cn.bins %Q% (allele == "minor"), + val = "acn", + mean = TRUE, + na.rm = TRUE) + + if (verbose) { message("Evaluating allelic bins over graph") } + cn.dt = gr.findoverlaps(query = gg$nodes$gr[, c()], + subject = bin.gaps, + return.type = "data.table") + + cn.dt[, minor.cn := values(bin.gaps)[, "acn"][subject.id]] + + ## collapse by node + collapsed.cn.dt = cn.dt[, .(minor.cn = mean(minor.cn, na.rm = TRUE), + nsites = .N, + variance = var(minor.cn, na.rm = TRUE)), + by = query.id] + + collapsed.cn.dt[, cn.total := values(gg$nodes$gr)[, "cn"][query.id]] + collapsed.cn.dt[, major.cn := cn.total - minor.cn] + collapsed.cn.dt[, corrected.variance := pmax(pmax(variance, minor.cn), 1)] + collapsed.cn.dt[, weight := nsites / sqrt(corrected.variance)] + + ## duplicate ranges to make phased gGraph nodes + og.nodes.gr = gg$nodes$gr[, c()] + values(og.nodes.gr)[, "og.node.id"] = values(gg$nodes$gr)[, "node.id"] + pmt = match(1:length(og.nodes.gr), collapsed.cn.dt[, query.id]) + values(og.nodes.gr)[, "cn.total"] = collapsed.cn.dt[, cn.total][pmt] + values(og.nodes.gr)[, "cn.high"] = collapsed.cn.dt[, major.cn][pmt] + values(og.nodes.gr)[, "cn.low"] = collapsed.cn.dt[, minor.cn][pmt] + values(og.nodes.gr)[, "weight"] = collapsed.cn.dt[, weight][pmt] + + ## prepare nodes for melted graph + phased.gg.nodes = c(og.nodes.gr, og.nodes.gr) + values(phased.gg.nodes)[, "cn"] = c(values(og.nodes.gr)[, "cn.high"], values(og.nodes.gr)[, "cn.low"]) + values(phased.gg.nodes)[, "allele"] = c(rep("major", length(og.nodes.gr)), rep("minor", length(og.nodes.gr))) + + ## prepare edges for melted graph + phased.gg.edges = rbind( + gg$edges$dt[, .(n1, n2, n1.side, n2.side, type, + og.edge.id = edge.id, + n1.allele = "major", + n2.allele = "major")], + gg$edges$dt[, .(n1 = n1 + length(og.nodes.gr), n2 = n2 + length(og.nodes.gr), type, + n1.side, n2.side, + og.edge.id = edge.id, + n1.allele = "minor", + n2.allele = "minor")], + gg$edges$dt[, .(n1, n2 = n2 + length(og.nodes.gr), type, + n1.side, n2.side, + og.edge.id = edge.id, + n1.allele = "major", + n2.allele = "minor")], + gg$edges$dt[, .(n1 = n1 + length(og.nodes.gr), n2, type, + n1.side, n2.side, + og.edge.id = edge.id, + n1.allele = "minor", + n2.allele = "major")] + ) + + ## add n1/n2 chromosome information + phased.gg.edges[, ":="(n1.chr = seqnames(phased.gg.nodes)[n1] %>% as.character, + n2.chr = seqnames(phased.gg.nodes)[n2] %>% as.character)] + + ## add edge connection type (straight/cross) + phased.gg.edges[n1.chr == n2.chr & n1.allele == n2.allele, connection := "straight"] + phased.gg.edges[n1.chr == n2.chr & n1.allele != n2.allele, connection := "cross"] + + phased.gg = gG(nodes = phased.gg.nodes, edges = phased.gg.edges) + + ref.edge.col = alpha("blue", 0.3) + alt.edge.col = alpha("red", 0.3) + ref.edge.lwd = 0.5 + alt.edge.lwd = 1.0 + phased.gg$edges$mark(col = ifelse(phased.gg$edges$dt$type == "REF", ref.edge.col, alt.edge.col), + lwd = ifelse(phased.gg$edges$dt$type == "REF", ref.edge.lwd, alt.edge.lwd)) + + major.node.col = alpha("red", 0.5) + minor.node.col = alpha("blue", 0.5) + phased.gg$nodes$mark(col = ifelse(phased.gg$nodes$dt$allele == "major", major.node.col, minor.node.col), + ywid = 0.8) + + return(phased.gg) +} + #' @name phased.binstats.legacy From a8929eb8642449d9ad4189467647d3e87a3eaf03 Mon Sep 17 00:00:00 2001 From: Zi-Ning Choo Date: Mon, 30 May 2022 16:29:44 -0400 Subject: [PATCH 34/35] check in changes for phasing --- R/apps.R | 106 ++++++++++++++++++++++--------------------------------- 1 file changed, 42 insertions(+), 64 deletions(-) diff --git a/R/apps.R b/R/apps.R index ca14d751..5e9755a6 100644 --- a/R/apps.R +++ b/R/apps.R @@ -509,7 +509,7 @@ balance = function(gg, ## browser() vars[, cnloh := FALSE] vars[(type == "edge.indicator" | type == "edge" | type == "eresidual") & - ref.or.alt == "ALT" & (abs(sedge.id) %in% cnloh.edges), + ref.or.alt == "ALT" & (abs(sedge.id) %in% cnloh.edges) & (sedge.id > 0), ":="(cnloh = TRUE)] } else { @@ -674,9 +674,9 @@ balance = function(gg, ## vars[type %in% c('node', 'edge'), lb := ifelse(is.na(lb), 0, pmax(lb, 0, na.rm = TRUE)] ## vars[type %in% c('node', 'edge'), ub := ifelse(is.na(ub), M, pmin(ub, M, na.rm = TRUE))] vars[type %in% c('loose.in', 'loose.out'), ":="(lb = 0, ub = Inf)] - - vars[type %in% c('edge'), reward := pmax(reward, 0, na.rm = TRUE)] + ## reward shouldn't have to be positive + ## vars[type %in% c('edge'), reward := pmax(reward, 0, na.rm = TRUE)] ## figure out junctions and nodes to fix @@ -1089,42 +1089,44 @@ balance = function(gg, ## if (!phased) { ## extremity exclusivity (relevant for ALL graphs) - loose.constraints = rbind( - vars[type == "loose.in.indicator" & sign(snode.id) == 1 & telomeric == FALSE, - .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))], - vars[type == "loose.out.indicator" & sign(snode.id) == 1 & telomeric == FALSE, - .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))] - ) + if (ism) { + loose.constraints = rbind( + vars[type == "loose.in.indicator" & sign(snode.id) == 1 & telomeric == FALSE, + .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))], + vars[type == "loose.out.indicator" & sign(snode.id) == 1 & telomeric == FALSE, + .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))] + ) - edge.constraints = rbind( - vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id) == 1, - .(value = 1, id, cid = paste("extremity.exclusivity", ee.id.n1))], - vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id) == 1, - .(value = 1, id, cid = paste("extremity.exclusivity", ee.id.n2))] - ) + edge.constraints = rbind( + vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id) == 1, + .(value = 1, id, cid = paste("extremity.exclusivity", ee.id.n1))], + vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id) == 1, + .(value = 1, id, cid = paste("extremity.exclusivity", ee.id.n2))] + ) - constraints = rbind(constraints, loose.constraints, edge.constraints, fill = TRUE) + constraints = rbind(constraints, loose.constraints, edge.constraints, fill = TRUE) - loose.b = unique(loose.constraints[, .(cid, value = 1, sense = "L")], by = "cid") - edge.b = unique(edge.constraints[, .(cid, value = 1, sense = "L")], by = "cid") + loose.b = unique(loose.constraints[, .(cid, value = 1, sense = "L")], by = "cid") + edge.b = unique(edge.constraints[, .(cid, value = 1, sense = "L")], by = "cid") - b = rbind(b, edge.b, loose.b, fill = TRUE) + b = rbind(b, edge.b, loose.b, fill = TRUE) - ## fix loose ends at zero if they coincide with a called junction - edge.ee.ids = unique(c(vars[type == "edge.indicator", ee.id.n1], vars[type == "edge.indicator", ee.id.n2])) - edge.ee.ids = edge.ee.ids[!is.na(edge.ee.ids)] + ## fix loose ends at zero if they coincide with a called junction + edge.ee.ids = unique(c(vars[type == "edge.indicator", ee.id.n1], vars[type == "edge.indicator", ee.id.n2])) + edge.ee.ids = edge.ee.ids[!is.na(edge.ee.ids)] - loose.zeros = rbind( - vars[type == "loose.in.indicator" & sign(snode.id) == 1 & ee.id %in% edge.ee.ids, - .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))], - vars[type == "loose.out.indicator" & sign(snode.id) == 1 & ee.id %in% edge.ee.ids, - .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))] - ) + loose.zeros = rbind( + vars[type == "loose.in.indicator" & sign(snode.id) == 1 & ee.id %in% edge.ee.ids, + .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))], + vars[type == "loose.out.indicator" & sign(snode.id) == 1 & ee.id %in% edge.ee.ids, + .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))] + ) - loose.zeros.rhs = unique(loose.zeros[, .(cid, value = 0, sense = "E")], by = "cid") + loose.zeros.rhs = unique(loose.zeros[, .(cid, value = 0, sense = "E")], by = "cid") - constraints = rbind(constraints, loose.zeros, fill = TRUE) - b = rbind(b, loose.zeros.rhs, fill = TRUE) + constraints = rbind(constraints, loose.zeros, fill = TRUE) + b = rbind(b, loose.zeros.rhs, fill = TRUE) + } } if (phased) { @@ -1162,31 +1164,6 @@ balance = function(gg, nrow(rhs)) } - ## grab node ids associated with ALT edges on the left - ## left.og.node.ids = c(gg$edges$dt[n1.side == "left" & type == "ALT", n1], - ## gg$edges$dt[n2.side == "left" & type == "ALT", n2]) - ## right.og.node.ids = c(gg$edges$dt[n1.side == "right" & type == "ALT", n1], - ## gg$edges$dt[n2.side == "right" & type == "ALT", n2]) - - ## ## fix loose ends for these nodes to zero - ## #' zchoo Thursday, Sep 02, 2021 11:13:44 AM - ## #' removed these constraints to see if feasibility improves - ## vars[type == "loose.in.indicator" & (snode.id %in% left.og.node.ids), - ## ":="(lb = 0, ub = 0)] - ## vars[type == "loose.out.indicator" & (snode.id %in% right.og.node.ids), - ## ":="(lb = 0, ub = 0)] - ## vars[type == "loose.in" & (snode.id %in% left.og.node.ids), - ## ":="(lb = 0, ub = 0)] - ## vars[type == "loose.out" & (snode.id %in% right.og.node.ids), - ## ":="(lb = 0, ub = 0)] - - ## if (verbose) { - ## message("Number of homologous loose ends: ", - ## length(left.og.node.ids) + length(right.og.node.ids)) - ## } - - ## reciprocal homologous extremity exclusivity - ## implement configuration indicators (OR constraint) config.dt = vars[type == "straight.config" | type == "cross.config",] config.constraints.lt = rbind( @@ -1256,11 +1233,6 @@ balance = function(gg, .(value = 1, id, cid = paste("rhee", c))] ) - ## filter constraints to only include things with >= 4 entries (e.g. must have an ALT edge) - ## rhomol.constraints[, n.entries := .N, by = cid] - ## remove this filter! due to some loose end violations! - ## rhomol.constraints = rhomol.constraints[n.entries > 3, .(value, id, cid)] - rhs = unique(rhomol.constraints[, .(value = 2, sense = "L", cid)], by = "cid") if (verbose) { @@ -1323,7 +1295,7 @@ balance = function(gg, } iconstraints = unique( - vars[type == "edge.indicator" & ref.or.alt == "ALT" & cnloh != TRUE, + vars[type == "edge.indicator" & ref.or.alt == "ALT" & cnloh != TRUE & sedge.id > 0, .(value = 1, id, og.edge.id, edge.id = abs(sedge.id), cid = paste("edge.indicator.sum.lb", og.edge.id))], @@ -1336,7 +1308,7 @@ balance = function(gg, fill = TRUE) edge.indicator.b = unique( - vars[type == "edge.indicator" & ref.or.alt == "ALT" & cnloh != TRUE, + vars[type == "edge.indicator" & ref.or.alt == "ALT" & cnloh != TRUE & sedge.id > 0, .(value = 1, sense = "G", cid = paste("edge.indicator.sum.lb", og.edge.id))], by = "cid" ) @@ -1551,7 +1523,6 @@ balance = function(gg, ## now Rcplex time ## remove any rows with b = NA - ## get rid of any constraints with NA values keep.constraints = intersect(b[!is.na(value), cid], constraints[!is.na(value), cid]) b = b[cid %in% keep.constraints,] @@ -3140,6 +3111,13 @@ phased.binstats = function(gg, bins, n2.allele = "major")] ) + ## transfer edge metadata + cols = intersect(setdiff(names(gg$edges$dt), names(phased.gg.edges)), c("cnloh")) + if (length(cols)) { + pmt = match(phased.gg.edges[, og.edge.id], gg$edges$dt[, edge.id]) + phased.gg.edges = cbind(phased.gg.edges, gg$edges$dt[pmt, ..cols]) + } + ## add n1/n2 chromosome information phased.gg.edges[, ":="(n1.chr = seqnames(phased.gg.nodes)[n1] %>% as.character, n2.chr = seqnames(phased.gg.nodes)[n2] %>% as.character)] From b9359b1768419483d66e4a76e8de0197a35e341f Mon Sep 17 00:00:00 2001 From: SplitInf Date: Tue, 24 May 2022 23:56:00 -0400 Subject: [PATCH 35/35] Update converters.R add break condition when reached the end of bedpe file --- R/converters.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/converters.R b/R/converters.R index f406049b..89587950 100755 --- a/R/converters.R +++ b/R/converters.R @@ -937,6 +937,7 @@ read.juncs = function(rafile, while (grepl("^((#)|(chrom)|(chr))", thisline)) { headers = c(headers, thisline) thisline = readLines(f, 1) + if(length(thisline)==0) break } ln = sum(length(headers), length(thisline)) while (length(thisline) > 0) {