From c1342d4d0fc2ed5e35854a9bccd4bc8a6206cc7b Mon Sep 17 00:00:00 2001
From: Zi-Ning Choo <zining01@gmail.com>
Date: Fri, 30 Apr 2021 16:00:10 -0400
Subject: [PATCH 01/35] cleanup

---
 .gitignore                     |   1 +
 src/Makevars                   |   3 -
 src/Makevars.win               |   7 -
 src/Rcplex2.c                  | 375 ---------------------------------
 src/Rcplex2.h                  |  28 ---
 src/Rcplex_params.c            | 261 -----------------------
 src/Rcplex_utils.c             | 102 ---------
 srcs/src.blank/Makevars        |   3 -
 srcs/src.blank/Makevars.in     |   3 -
 srcs/src.blank/Makevars.win    |   7 -
 srcs/src.blank/Rcplex2.c       |  23 --
 srcs/src.blank/Rcplex2.h       |   6 -
 srcs/src.rcplex/Rcplex2.c      |   2 +
 srcs/src.rcplex/Rcplex2.h      |  10 +-
 srcs/src.rcplex/Rcplex_utils.c |   5 +
 15 files changed, 13 insertions(+), 823 deletions(-)
 delete mode 100644 src/Makevars
 delete mode 100644 src/Makevars.win
 delete mode 100644 src/Rcplex2.c
 delete mode 100644 src/Rcplex2.h
 delete mode 100644 src/Rcplex_params.c
 delete mode 100644 src/Rcplex_utils.c
 delete mode 100644 srcs/src.blank/Makevars
 delete mode 100644 srcs/src.blank/Makevars.in
 delete mode 100644 srcs/src.blank/Makevars.win
 delete mode 100644 srcs/src.blank/Rcplex2.c
 delete mode 100644 srcs/src.blank/Rcplex2.h

diff --git a/.gitignore b/.gitignore
index eb8b3afc..05fc351c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -71,6 +71,7 @@ docs/tutorial_cache/html/unnamed-chunk-3_a81070afea2cd322305d016d8dbace33.RData
 docs/tutorial_cache/html/unnamed-chunk-3_a81070afea2cd322305d016d8dbace33.rdb
 docs/debug.html
 docs/tutorial_2_files/figure-html/duplications and pyrgo-1.png
+src/
 tests/testthat/circos.pdf
 tests/testthat/test.json
 tests/testthat/test.pdf
diff --git a/src/Makevars b/src/Makevars
deleted file mode 100644
index 9d19401f..00000000
--- a/src/Makevars
+++ /dev/null
@@ -1,3 +0,0 @@
-PKG_CFLAGS=-m64 -fPIC -fno-strict-aliasing
-PKG_CPPFLAGS=-I${CPLEX_DIR}/cplex/include
-PKG_LIBS=-L${CPLEX_DIR}/cplex/lib/x86-64_linux/static_pic -lcplex -lm -lpthread
diff --git a/src/Makevars.win b/src/Makevars.win
deleted file mode 100644
index 5c9fb6d9..00000000
--- a/src/Makevars.win
+++ /dev/null
@@ -1,7 +0,0 @@
-ifeq "$(WIN)" "64"
-PKG_LIBS = -L"${CPLEX_DIR}/bin/x64_win64" -lcplex1263 -lm
-PKG_CPPFLAGS = -D_LP64 -I"${CPLEX_DIR}/include"
-else
-PKG_LIBS = -L"${CPLEX_DIR}/bin/x86_win32" -lcplex1263 -lm
-PKG_CPPFLAGS = -I"${CPLEX_DIR}/include"
-endif
diff --git a/src/Rcplex2.c b/src/Rcplex2.c
deleted file mode 100644
index 83c922b3..00000000
--- a/src/Rcplex2.c
+++ /dev/null
@@ -1,375 +0,0 @@
-// The actual solving procedure is called using the function Rcplex 
-#include "Rcplex2.h"
-
-int max_numcalls;
-
-SEXP Rcplex2(SEXP numcols_p,
-	     SEXP numrows_p,
-	     SEXP objsen_p,
-	     SEXP cvec,
-	     SEXP bvec,
-	     SEXP Amat,
-	     SEXP Qmat,
-	     SEXP lb_p,
-	     SEXP ub_p,
-	     SEXP Rsense,
-	     SEXP Rvtype,
-	     SEXP isQP_p,
-	     SEXP isMIP_p,
-	     SEXP num_poplim,
-	     SEXP control,
-	     SEXP maxcalls,
-	     SEXP tuning)
-{
-  char *probname = "Rcplex";
-  int numcols    = INTEGER(numcols_p)[0]; 
-  int numrows    = INTEGER(numrows_p)[0];
-  int objsen     = INTEGER(objsen_p)[0];
-  double *lb     = REAL(lb_p); 
-  double *ub     = REAL(ub_p);
-
-  char sense[numrows];
-  char vtype[numcols];
-  //  double dj[numrows];
-  int isQP       = INTEGER(isQP_p)[0];
-  int isMIP      = INTEGER(isMIP_p)[0];
-  int trace      = INTEGER(getListElement(control,"trace"))[0];
-
-  SEXP res = NULL; /* set to avoid uninitialized warning */
-  SEXP xopt;
-  SEXP epgap;
-  SEXP obj;
-  SEXP solstat;
-  SEXP extra;
-  SEXP lambda;
-  SEXP slack;
-  SEXP nodecnt;
-
-  int status;
-  int i, j;
-  int cur_numrows, cur_numcols;
-
-  int tstat;
-  /*
-   * solution pools not supprted until cplex 11.0
-   */
-#if CPX_VERSION >= 1100 
-  int num_sol = 1;
-  SEXP tmp;
-#endif
-
-  /* set maxnumcalls before init */
-  max_numcalls = INTEGER(maxcalls)[0];
-  /* initialize CPLEX environment */
-  Rcplex_init();
-
-  if(trace) Rprintf("Rcplex: num variables=%d num constraints=%d\n",numcols,numrows);
-
-  /*
-   * solution pools not supported until cplex 11.0
-   */
-#if CPX_VERSION < 1100
-  if (INTEGER(num_poplim)[0] > 1) {
-    warning("Multiple solutions not supported in CPLEX version");
-    INTEGER(num_poplim)[0] = 1;
-  }
-#endif 
-
-  /* lb and ub */
-  for (j = 0; j < numcols; ++j) {
-    lb[j] = R_finite(lb[j]) ? lb[j] : -CPX_INFBOUND;
-    ub[j] = R_finite(ub[j]) ? ub[j] : CPX_INFBOUND;
-  }
-
-  /* set constraint inequality directions */
-  for (i = 0; i < numrows; ++i) {
-    sense[i] = CHAR(STRING_ELT(Rsense, i))[0];
-  }
-
-  /* set variable types */
-  if (isMIP) {
-    for (i = 0; i < numcols; ++i) {
-      vtype[i] = CHAR(STRING_ELT(Rvtype, i))[0];
-    }
-  }
-
-  /* set parameters given in control list */
-  setparams(env, control, isQP, isMIP);
-  lp = CPXcreateprob(env, &status, probname);
-
-
-  /* check memory problems */
-  if (lp == NULL) {
-    my_error(("Failed to create LP.\n"));
-  }
-
-  /* copy problem data into lp */
-  status = CPXcopylp(env, lp, numcols, numrows, objsen,REAL(cvec),
-		     REAL(bvec), sense,
-		     INTEGER(VECTOR_ELT(Amat, 0)), 
-		     INTEGER(VECTOR_ELT(Amat, 1)),
-		     INTEGER(VECTOR_ELT(Amat, 2)),
-		     REAL(VECTOR_ELT(Amat, 3)),
-		     lb, ub, NULL);
-  if (status) {
-    my_error(("Failed to copy problem data.\n"));
-  }
-
-  if (isQP) {
-    status = CPXcopyquad(env, lp,
-			 INTEGER(VECTOR_ELT(Qmat, 0)),
-			 INTEGER(VECTOR_ELT(Qmat, 1)),
-			 INTEGER(VECTOR_ELT(Qmat, 2)),
-			 REAL(VECTOR_ELT(Qmat, 3)));
-    if (status) {
-      my_error(("Failed to copy quadratic term of problem data.\n"));
-    }
-  }
-
-  if (isMIP) {
-    status = CPXcopyctype(env, lp, vtype);
-    if (status) {
-      my_error(("Failed to copy vtype.\n"));
-    }
-  }
-
-  /* solve problem */
-  if(isMIP) {
-
-    /* MARCIN ADDED set starts */
-    setstarts(env, lp, control, isMIP);
-
-    int do_tuning = asLogical(tuning);
-    if (do_tuning){
-      /* set the tuning total tilim to hard-setted value 600s */
-      SEXP tuning_control = setListElement(control, "tilim", 600);
-      /* temporarily change the tilim*/
-      setparams(env, tuning_control, isQP, isMIP);
-      
-      /* tune the hidden parameters */
-      status = CPXtuneparam(env, lp,
-			    0, NULL, NULL, // zero fixed int par
-			    0, NULL, NULL,
-			    0, NULL, NULL, &tstat);
-      if (status) {
-	my_error(("Failed to tune params."));
-      }
-      /* Write the optimized parameters to file */
-      status = CPXwriteparam (env, "tuned.prm");
-
-      setparams(env, control, isQP, isMIP);
-      /* MARCIN ADDED set starts */
-      setstarts(env, lp, control, isMIP);
-    } else {
-      /* Write the optimized parameters to file */
-      status = CPXwriteparam (env, "raw.prm");
-    }
-
-    status = CPXmipopt(env, lp);
-    
-    /*
-     * solutions pool not supported for versions of cplex < 11
-     * e.g., CPX_PARAM_POPULATELIM is not defined
-     */
-
-#if CPX_VERSION >= 1100
-    if(INTEGER(num_poplim)[0] > 1){
-      /* in MIPs it is possible to have more than 1 solution. If
-	 num_poplim > 1 we now try to find these solutions.
-	 For populating the solution pool a couple of parameters are
-	 relevant:
-	 1. the 'absolute gap' solution pool parameter
-	 CPX_PARAM_SOLNPOOLAGAP (see Rcplex C control parameters), 
-	 2. the 'solution pool intensity' parameter
-	 CPX_PARAM_SOLNPOOLINTENSITY (see Rcplex C control parameters),
-	 3. the 'limit on number of solutions generated' for solution
-	 pool CPX_PARAM_POPULATELIM */
-      status = CPXsetintparam(env, CPX_PARAM_POPULATELIM, 
-			      INTEGER(num_poplim)[0] - 1);
-      if (status) {
-	my_error(("Failed to set 'populate limit' parameter.\n"));
-      }
-      /* now populate the solutions pool */
-      status = CPXpopulate(env, lp);
-    }
-#endif
-  }
-  else if (isQP) {
-    status = CPXqpopt(env, lp);
-  }
-  else {
-    status = CPXlpopt(env, lp);
-  }
-  
-  /* a status value of zero does not necessarily mean that an optimal
-     solution exists. Examples of an exit status of non-zero here
-     include CPXERR_NO_PROBLEM, CPXERR_NO_MEMORY, ... */
-  if (status) {
-    my_error(("Failed to optimize problem."));
-  }
-  
-  PROTECT(solstat = allocVector(INTSXP,  1));
-
-  /* retrieve status of optimization */
-  *INTEGER(solstat) = CPXgetstat(env, lp);
-
-  if (isMIP) {
-    /* MIP optimization */
-    /* in MIPs it is possible to have more than 1 solution
-       FIXME: use solution pool by default? */
-    
-    cur_numrows = CPXgetnumrows(env, lp);
-    cur_numcols = CPXgetnumcols(env, lp);
-
-    /* if the 'n' parameter is set in R we always return a list with
-       a number of elements equal to the number of solutions */
-    if(INTEGER(num_poplim)[0] > 1){
-      /*
-       * solution pools not supported until cplex 11.0
-       */
-#if CPX_VERSION >= 1100 
-      num_sol = CPXgetsolnpoolnumsolns(env, lp);
-
-      /* MIP optimization results -> more than 1 solution */
-      PROTECT(res = allocVector(VECSXP, num_sol));
-
-      /* now retrieve multiple solutions if any */
-      for( i = 0; i < num_sol; i++){
-	PROTECT(tmp  = allocVector(VECSXP, 4));
-    	PROTECT(xopt = allocVector(REALSXP,  cur_numcols));
-    	status = CPXgetsolnpoolx(env, lp, i, REAL(xopt), 0, cur_numcols - 1);
-    	
-	SET_VECTOR_ELT(tmp, 0, xopt);
-
-	PROTECT(obj   = allocVector(REALSXP, 1));
-	status = CPXgetsolnpoolobjval(env, lp, i, REAL(obj));
-	/* if no integer solution exists, return NA */
-	if (status) {
-	  *REAL(obj) = NA_REAL;
-	}
-	SET_VECTOR_ELT(tmp, 1, obj);
-	
-	/* extra info */
-    	PROTECT(slack  = allocVector(REALSXP,  cur_numrows));
-    	status = CPXgetmipslack(env, lp, REAL(slack), 0, cur_numrows - 1);
-    	PROTECT(nodecnt = allocVector(INTSXP, 1));
-	*INTEGER(nodecnt) = CPXgetnodecnt(env, lp);
-
-	SET_VECTOR_ELT(tmp, 2, solstat); 
-
-	PROTECT(extra   = allocVector(VECSXP,  2));
-	SET_VECTOR_ELT(extra, 0, nodecnt);
-	SET_VECTOR_ELT(extra, 1, slack);
-	SET_VECTOR_ELT(tmp, 3, extra);
-	
-	/* add solution to return vector */
-	SET_VECTOR_ELT(res, i, tmp);
-
-    	UNPROTECT(6);
-      } /* END FOR */
-#endif /* end #if CPX_VERSION >= 1100 */
-
-    } /* END multiple solutions */
-    else {
-      /* MIP optimization 1 solution */
-      PROTECT(res   = allocVector(VECSXP,  5));
-      PROTECT(obj   = allocVector(REALSXP, 1));
-      PROTECT(xopt  = allocVector(REALSXP, numcols));
-      PROTECT(epgap  = allocVector(REALSXP, 1)); /* added by Marcin */
-      PROTECT(extra = allocVector(VECSXP,  2));
-      PROTECT(slack = allocVector(REALSXP, numrows));
-
-      status = CPXgetmipobjval(env, lp, REAL(obj));
-      /* if no integer solution exists, return NA */
-      if (status) {
-        *REAL(obj) = NA_REAL;
-      }
-
-      status = CPXgetmipx(env, lp, REAL(xopt), 0, cur_numcols - 1);
-      if (status) {
-	for(i = 0; i < cur_numcols; i++)
-	  REAL(xopt)[i] = NA_REAL;
-      }
-      
-      status = CPXgetmipslack(env, lp, REAL(slack), 0, cur_numrows - 1);
-      if (status) {
-	for(i = 0; i < cur_numrows; i++)
-	  REAL(slack)[i] = NA_REAL;
-      }
-
-      status = CPXgetmiprelgap(env, lp, REAL(epgap)); /* added by Marcin */
-
-      /* Provide some little extra information */
-      PROTECT(nodecnt = allocVector(INTSXP, 1));
-      *INTEGER(nodecnt) = CPXgetnodecnt(env, lp);
-      SET_VECTOR_ELT(extra, 0, nodecnt);
-      SET_VECTOR_ELT(extra, 1, slack);
-    
-      /* Create return vector for MIP 1 solution*/
-      SET_VECTOR_ELT(res, 0, xopt); 
-      SET_VECTOR_ELT(res, 1, obj); 
-      SET_VECTOR_ELT(res, 2, solstat); 
-      SET_VECTOR_ELT(res, 3, extra);
-      SET_VECTOR_ELT(res, 4, epgap); /* added by Marcin */
-      
-      UNPROTECT(6);
-    } /* END MIP optimization 1 solution */
-  } /* END MIP optimization */
-  else {
-    /* continuous optimization */
-    PROTECT(obj   = allocVector(REALSXP, 1));
-    PROTECT(xopt  = allocVector(REALSXP, numcols));
-    PROTECT(extra = allocVector(VECSXP,  2));
-    PROTECT(slack = allocVector(REALSXP, numrows));
-
-    status = CPXgetobjval(env, lp, REAL(obj));
-    if (status) {
-      *REAL(obj) = NA_REAL;
-    }
-
-    cur_numrows = CPXgetnumrows(env, lp);
-    cur_numcols = CPXgetnumcols(env, lp);
-    status = CPXgetx(env, lp, REAL(xopt), 0, cur_numcols - 1);
-    if (status) {
-      for(i = 0; i < cur_numcols; i++)
-	REAL(xopt)[i] = NA_REAL;
-    }
-
-    status = CPXgetslack(env, lp, REAL(slack), 0, cur_numrows - 1);
-    if (status) {
-      for(i = 0; i < cur_numrows; i++)
-	REAL(slack)[i] = NA_REAL;
-    }
-
-    /* Provide some little extra information */
-    PROTECT(lambda = allocVector(REALSXP, numrows));
-    status = CPXgetpi(env, lp, REAL(lambda), 0, cur_numrows - 1);
-    if (status) {
-      for(i = 0; i < cur_numrows; i++)
-	REAL(lambda)[i] = NA_REAL;
-    }
-    SET_VECTOR_ELT(extra, 0, lambda);
-    SET_VECTOR_ELT(extra, 1, slack);
-
-    /* Create return vector for continuous solution */
-    PROTECT(res = allocVector(VECSXP,4));
-    SET_VECTOR_ELT(res, 0, xopt); 
-    SET_VECTOR_ELT(res, 1, obj); 
-    SET_VECTOR_ELT(res, 2, solstat); 
-    SET_VECTOR_ELT(res, 3, extra);
-    
-    UNPROTECT(5);
-  } /* END IF continuous optimization */
-  
-
-  UNPROTECT(2); /* unprotect return-vector res and solstat */
-  
-  /* Reset all CPLEX parameters to default values */
-  status = CPXsetdefaults(env);
-  if (status) {
-    Rprintf("Status: %d", status);
-    my_error(("Failed to set parameters to default.\n"));
-  }
-  return(res);
-}
-
diff --git a/src/Rcplex2.h b/src/Rcplex2.h
deleted file mode 100644
index f155a60d..00000000
--- a/src/Rcplex2.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef _RCPLEX_H
-#define _RCPLEX_H
-
-#include <ilcplex/cplex.h>
-#include <stdlib.h>
-#include <R.h>
-#include <Rinternals.h>
-#include <unistd.h>
-
-#define my_error(x) { forceCplxClose = 1; error x; }
-
-/* Global Variables */
-extern CPXENVptr env;
-extern CPXLPptr lp;
-extern int numcalls;
-extern int max_numcalls;
-extern int forceCplxClose;
-
-/* Function definitions */
-SEXP getListElement( SEXP, char* );
-SEXP setListElement( SEXP, char*, double);
-void setparams( CPXENVptr, SEXP, int, int );
-void setstarts( CPXENVptr, CPXLPptr, SEXP, int );
-void Rcplex_init( void );
-void Rcplex_close( void );
-void Rcplex_free( void );
-
-#endif
diff --git a/src/Rcplex_params.c b/src/Rcplex_params.c
deleted file mode 100644
index 45874ed6..00000000
--- a/src/Rcplex_params.c
+++ /dev/null
@@ -1,261 +0,0 @@
-#include "Rcplex2.h"
-
-void setparams(CPXENVptr env, SEXP control, int isQP, int isMIP) {
-  int i, status, value;
-  const char *cur_parm;
-  SEXP names;
-  
-  /* get list names */
-  PROTECT(names = getAttrib(control, R_NamesSymbol));
-
-  status = 1; /* avoid warning */
-
-  /* for each element in 'control' try to set the corresponding
-     parameter */
-  for (i = 0; i < length(control); i++) {
-
-    cur_parm = CHAR(STRING_ELT(names, i));
-    
-    /* trace - CPX_PARAM_SCRIND */
-    if(strcmp(cur_parm, "trace") == 0) {
-      status = CPXsetintparam(env, CPX_PARAM_SCRIND, 
-		  *INTEGER(VECTOR_ELT(control, i)) ? CPX_ON : CPX_OFF);
-    }
-    /* method */
-    else if(strcmp(cur_parm, "method") == 0) {
-      switch((value = *INTEGER(VECTOR_ELT(control, i)))) {
-      case 0:
-	value = CPX_ALG_AUTOMATIC;
-	break;
-      case 1:
-	value = CPX_ALG_PRIMAL;
-	break;
-      case 2:
-	value = CPX_ALG_DUAL;
-	break;
-      case 3:
-	value = CPX_ALG_NET;
-	break;
-      case 4:
-	value = CPX_ALG_BARRIER;
-	break;
-      default:
-	warning("Unknown optimization method %d, using default\n", value);
-	value = CPX_ALG_AUTOMATIC;
-      }
-      /* Do we have a QP or a LP?*/
-      if (isQP)
-	status = CPXsetintparam(env, CPX_PARAM_QPMETHOD, value);
-      else
-	status = CPXsetintparam(env, CPX_PARAM_LPMETHOD, value);
-    }
-    else if(strcmp(cur_parm, "preind") == 0) {
-      status = CPXsetintparam(env, CPX_PARAM_PREIND,
-		  *INTEGER(VECTOR_ELT(control, i)) ? CPX_ON : CPX_OFF);
-    }
-    else if(strcmp(cur_parm, "aggind") == 0) {
-      status = CPXsetintparam(env, CPX_PARAM_AGGIND,
-			      *INTEGER(VECTOR_ELT(control, i)));
-    }
-    else if(strcmp(cur_parm, "itlim") == 0) {
-      status = CPXsetintparam(env, CPX_PARAM_ITLIM, 
-			      *INTEGER(VECTOR_ELT(control, i)));
-    }
-    else if(strcmp(cur_parm, "epgap") == 0) {
-      status = CPXsetdblparam(env, CPX_PARAM_EPGAP,
-			      *REAL(VECTOR_ELT(control, i)));
-    }
-    else if(strcmp(cur_parm, "epagap") == 0) {
-      status = CPXsetdblparam(env, CPX_PARAM_EPAGAP, 
-    			      *REAL(VECTOR_ELT(control, i)));
-    }
-    else if(strcmp(cur_parm,"tilim") == 0) {
-      status = CPXsetdblparam(env, CPX_PARAM_TILIM, 
-			      *REAL(VECTOR_ELT(control, i)));
-    }
-    else if(strcmp(cur_parm, "mipemphasis") == 0) {
-      switch((value = *INTEGER(VECTOR_ELT(control, i)))) {
-      case 0:
-	value = CPX_MIPEMPHASIS_BALANCED;
-	break;
-      case 1:
-	value = CPX_MIPEMPHASIS_FEASIBILITY;
-	break;
-      case 2:
-	value = CPX_MIPEMPHASIS_OPTIMALITY;
-	break;
-      case 3:
-	value = CPX_MIPEMPHASIS_BESTBOUND;
-	break;
-      case 4:
-	value = CPX_MIPEMPHASIS_HIDDENFEAS;
-	break;
-      default:
-	warning("Unknown mip emphasis setting %d, using default\n", value);
-	value = CPX_MIPEMPHASIS_BALANCED;
-      }
-      status = CPXsetintparam(env, CPX_PARAM_MIPEMPHASIS, value);
-    }
-    else if(strcmp(cur_parm, "disjcuts") == 0) {
-      status = CPXsetintparam(env, CPX_PARAM_DISJCUTS, 
-			      *INTEGER(VECTOR_ELT(control,i)));
-    }
-    else if(strcmp(cur_parm, "cliques") == 0) {
-      status = CPXsetintparam(env, CPX_PARAM_CLIQUES, 
-			      *INTEGER(VECTOR_ELT(control, i)));
-    }
-    else if(strcmp(cur_parm,"nodesel") == 0) {
-      switch((value = *INTEGER(VECTOR_ELT(control, i)))) {
-      case 0:
-	value = CPX_NODESEL_DFS;
-	break;
-      case 1:
-	value = CPX_NODESEL_BESTBOUND;
-	break;
-      case 2:
-	value = CPX_NODESEL_BESTEST;
-	break;
-      case 3:
-	value = CPX_NODESEL_BESTEST_ALT;
-	break;
-      default:
-	warning("Unknown node selection strategy %d, using default\n", value);
-	value = CPX_NODESEL_BESTBOUND;
-      }
-      status = CPXsetintparam(env, CPX_PARAM_NODESEL, value);
-    }
-    else if(strcmp(cur_parm, "probe") == 0) {
-      status = CPXsetintparam(env, CPX_PARAM_PROBE,
-			      *INTEGER(VECTOR_ELT(control, i)));
-    }
-    else if(strcmp(cur_parm, "varsel") == 0) {
-      switch((value = *INTEGER(VECTOR_ELT(control, i)))) {
-      case -1:
-	value = CPX_VARSEL_MININFEAS;
-	break;
-      case 0:
-	value = CPX_VARSEL_DEFAULT;
-	break;
-      case 1:
-	value = CPX_VARSEL_MAXINFEAS;
-	break;
-      case 2:
-	value = CPX_VARSEL_PSEUDO;
-	break;
-      case 3:
-	value = CPX_VARSEL_STRONG;
-	break;
-      case 4:
-	value = CPX_VARSEL_PSEUDOREDUCED;
-	break;
-      default:
-	warning("Unknown variable selection strategy %d, using default\n", 
-		value);
-	value = CPX_VARSEL_DEFAULT;
-      }
-      status = CPXsetintparam(env, CPX_PARAM_VARSEL, value);
-    }
-    else if(strcmp(cur_parm, "flowcovers") == 0) {
-      status = CPXsetintparam(env, CPX_PARAM_FLOWCOVERS,
-			      *INTEGER(VECTOR_ELT(control, i)));
-    }
-    else if(strcmp(cur_parm, "mipstart") == 0) {
-    }
-    else if(strcmp(cur_parm, "solnpoolagap") == 0){
-      /* solution pool parameters */
-      #if CPX_VERSION >= 1100 
-      status = CPXsetdblparam(env, CPX_PARAM_SOLNPOOLAGAP,
-      			      *REAL(VECTOR_ELT(control, i)));
-      #endif
-    }
-    else if(strcmp(cur_parm, "solnpoolgap") == 0){
-      /* solution pool parameters */
-      #if CPX_VERSION >= 1100
-      status = CPXsetdblparam(env, CPX_PARAM_SOLNPOOLGAP,
-			      *REAL(VECTOR_ELT(control, i)));
-      #endif
-    }
-    else if(strcmp(cur_parm, "solnpoolintensity") == 0){
-      #if CPX_VERSION >= 1100
-      status = CPXsetintparam(env, CPX_PARAM_SOLNPOOLINTENSITY, 
-    			      *INTEGER(VECTOR_ELT(control, i)));
-      #endif
-    }
-    else if (strcmp(cur_parm, "breaksymmetry") == 0){ /*ADDED BY XT*/
-      #if CPX_VERSION >= 1100
-      status = CPXsetintparam(env, CPX_PARAM_SYMMETRY,
-    			      *INTEGER(VECTOR_ELT(control, i)));
-      #endif
-    } else if (strcmp(cur_parm, "tuning.display") == 0){
-      #if CPX_VERSION >= 1100
-      status = CPXsetintparam(env, CPX_PARAM_TUNINGDISPLAY,
-    			      *INTEGER(VECTOR_ELT(control, i)));
-      #endif
-    } else if (strcmp(cur_parm, "tuning.rep") == 0){
-      #if CPX_VERSION >= 1100
-      status = CPXsetintparam(env, CPX_PARAM_TUNINGREPEAT,
-    			      *INTEGER(VECTOR_ELT(control, i)));
-      #endif
-    } else if (strcmp(cur_parm, "tuning.tilim") == 0){
-      #if CPX_VERSION >= 1100
-      status = CPXsetdblparam(env, CPX_PARAM_TUNINGTILIM,
-    			      *REAL(VECTOR_ELT(control, i)));
-      #endif
-    }
-    else {
-      /* If parameter not known print a warning */
-      warning("Unknown CPLEX parameter %s. Ignoring it.\n", cur_parm);
-    }
-
-    if (status)
-      my_error(("Failure to set parameter %s, error %d.\n", cur_parm, status));
-  }
-  
-  UNPROTECT(1);
-}
-
-/* ADDED BY MARCIN!!
- * 
- */
-void setstarts(CPXENVptr env, CPXLPptr lp, SEXP control, int isMIP) {
-  int i, status, value;
-  const char *cur_parm;
-  SEXP names;
-  
-  /* get list names */
-  PROTECT(names = getAttrib(control, R_NamesSymbol));
-
-  status = 1; /* avoid warning */
-
-  /* for each element in 'control' try to set the corresponding
-     parameter */
-  if (isMIP) {
-    for (i = 0; i < length(control); i++) {
-
-      cur_parm = CHAR(STRING_ELT(names, i));
-
-      if(strcmp(cur_parm, "mipstart") == 0) {
-        SEXP values_vec = VECTOR_ELT(control, i);
-        double* values = REAL(values_vec);
-        int mcnt = 1;
-        int beg = 0;
-        int nzcnt = LENGTH(values_vec);
-        SEXP tmp = allocVector(INTSXP, nzcnt);
-        int *varindices = INTEGER(tmp);
-        int effortlevel = CPX_MIPSTART_AUTO;
-        char *mipstartname = "start";
-
-        for (int i = 0; i < nzcnt; i++) {
-          varindices[i] = i;
-        }
-
-        status = CPXaddmipstarts(env, lp, mcnt, nzcnt, &beg, varindices, values, &effortlevel, &mipstartname);
-
-        if (status)
-          my_error(("Failure to set parameter %s in setstarts, error %d.\n", cur_parm, status));
-      }
-    }
-  }
-  
-  UNPROTECT(1);
-}
diff --git a/src/Rcplex_utils.c b/src/Rcplex_utils.c
deleted file mode 100644
index b71f1a91..00000000
--- a/src/Rcplex_utils.c
+++ /dev/null
@@ -1,102 +0,0 @@
-// Utility functions like CPLEX initialization/closing routines and
-// list accessors
-
-#include "Rcplex2.h"
-#include <time.h>
-
-CPXENVptr env;
-CPXLPptr lp;
-int numcalls;
-int forceCplxClose;
-
-void Rcplex_wait (int seconds) {
-  clock_t endwait;
-  endwait = clock() + seconds * CLOCKS_PER_SEC;
-  while (clock() < endwait) {}
-}
-
-void Rcplex_init(void) {
-  int numtries = 10;
-  int status;
-  char errmsg[1024];
-
-  /* Initialize CPLEX environment */
-  if (env == NULL) {
-    env = CPXopenCPLEX (&status);
-    while(env == NULL && numtries > 0) {
-      Rcplex_wait(30);
-      numtries--;
-    }
-    if (env == NULL) {
-      CPXgeterrorstring(env, status, errmsg);
-      error("Could not open CPLEX environment.\n%s\n",errmsg);
-    }
-    REprintf("CPLEX environment opened\n");
-    numcalls = max_numcalls;
-  } 
-  else {
-    numcalls--;
-  }
-  forceCplxClose = 0;
-}
-
-void Rcplex_close(void) {
-  forceCplxClose = 1;
-  Rcplex_free();
-}
-
-void Rcplex_free(void) {
-  int status1, status2;
-  char errmsg[1024];
-
-  status1 = status2 = 0;
-  if (lp != NULL) {
-    status1 = CPXfreeprob(env,&lp);
-    /*    REprintf("Freed CPLEX problem\n");*/
-    lp = NULL;
-  }
-
-  if (env != NULL && (numcalls == 0 || forceCplxClose)) {
-    status2 = CPXcloseCPLEX(&env);
-    REprintf("Closed CPLEX environment\n");
-    env = NULL;
-  }
-
-  if (status1 || status2) {
-    status2 ? strcpy(errmsg,"env close ok") : CPXgeterrorstring(env,status2,errmsg);
-    error("Rcplex_free failed: free problem code: %d\nClose environment msg: %s\n",status1,errmsg);
-  }
-  return;
-}
-
-SEXP getListElement(SEXP list, char *str) {
-  SEXP element = R_NilValue, names = getAttrib(list, R_NamesSymbol);
-  int i;
-
-  for (i=0; i < length(list); i++) {
-    if (strcmp(CHAR(STRING_ELT(names,i)), str) == 0) {
-      element = VECTOR_ELT(list,i);
-      break;
-    }
-  }
-  return element;
-}
-
-SEXP setListElement(SEXP list, char* str, double value) {
-  /* SEXP element = R_NilValue, */
-  SEXP out = allocVector(VECSXP, length(list));
-  SEXP names = getAttrib(list, R_NamesSymbol);
-
-  setAttrib(out, R_NamesSymbol, names);
-  int i;
-
-  for (i=0; i < length(list); i++) {
-    if (strcmp(CHAR(STRING_ELT(names,i)), str) == 0) {
-      /* element = VECTOR_ELT(list,i); */
-      SET_VECTOR_ELT(out, i, ScalarReal(value));
-    } else {
-      SET_VECTOR_ELT(out, i, VECTOR_ELT(list, i));
-    }
-  }
-  return out;
-}
diff --git a/srcs/src.blank/Makevars b/srcs/src.blank/Makevars
deleted file mode 100644
index 912a1707..00000000
--- a/srcs/src.blank/Makevars
+++ /dev/null
@@ -1,3 +0,0 @@
-PKG_CFLAGS=-m64 -fPIC -fno-strict-aliasing
-PKG_CPPFLAGS=-I${CPLEX_DIR}/cplex/include
-PKG_LIBS=-lm -lpthread
diff --git a/srcs/src.blank/Makevars.in b/srcs/src.blank/Makevars.in
deleted file mode 100644
index f03b4d39..00000000
--- a/srcs/src.blank/Makevars.in
+++ /dev/null
@@ -1,3 +0,0 @@
-PKG_CFLAGS=@PKG_CFLAGS@
-PKG_CPPFLAGS=@PKG_CPPFLAGS@
-PKG_LIBS=@PKG_LIBS@
diff --git a/srcs/src.blank/Makevars.win b/srcs/src.blank/Makevars.win
deleted file mode 100644
index 5c9fb6d9..00000000
--- a/srcs/src.blank/Makevars.win
+++ /dev/null
@@ -1,7 +0,0 @@
-ifeq "$(WIN)" "64"
-PKG_LIBS = -L"${CPLEX_DIR}/bin/x64_win64" -lcplex1263 -lm
-PKG_CPPFLAGS = -D_LP64 -I"${CPLEX_DIR}/include"
-else
-PKG_LIBS = -L"${CPLEX_DIR}/bin/x86_win32" -lcplex1263 -lm
-PKG_CPPFLAGS = -I"${CPLEX_DIR}/include"
-endif
diff --git a/srcs/src.blank/Rcplex2.c b/srcs/src.blank/Rcplex2.c
deleted file mode 100644
index f649d71b..00000000
--- a/srcs/src.blank/Rcplex2.c
+++ /dev/null
@@ -1,23 +0,0 @@
-// The actual solving procedure is called using the function Rcplex 
-#include "Rcplex2.h"
-
-SEXP Rcplex2(SEXP numcols_p,
-	    SEXP numrows_p,
-	    SEXP objsen_p,
-	    SEXP cvec,
-	    SEXP bvec,
-	    SEXP Amat,
-	    SEXP Qmat,
-	    SEXP lb_p,
-	    SEXP ub_p,
-	    SEXP Rsense,
-	    SEXP Rvtype,
-	    SEXP isQP_p,
-	    SEXP isMIP_p,
-	    SEXP num_poplim,
-	    SEXP control,
-	    SEXP maxcalls)
-{
-  return(numcols_p);
-}
-
diff --git a/srcs/src.blank/Rcplex2.h b/srcs/src.blank/Rcplex2.h
deleted file mode 100644
index 2062eb2e..00000000
--- a/srcs/src.blank/Rcplex2.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#include <stdlib.h>
-#include <R.h>
-#include <Rinternals.h>
-#include <unistd.h>
-
-
diff --git a/srcs/src.rcplex/Rcplex2.c b/srcs/src.rcplex/Rcplex2.c
index 0862d38a..83c922b3 100644
--- a/srcs/src.rcplex/Rcplex2.c
+++ b/srcs/src.rcplex/Rcplex2.c
@@ -1,6 +1,8 @@
 // The actual solving procedure is called using the function Rcplex 
 #include "Rcplex2.h"
 
+int max_numcalls;
+
 SEXP Rcplex2(SEXP numcols_p,
 	     SEXP numrows_p,
 	     SEXP objsen_p,
diff --git a/srcs/src.rcplex/Rcplex2.h b/srcs/src.rcplex/Rcplex2.h
index 33bf5aa6..f155a60d 100644
--- a/srcs/src.rcplex/Rcplex2.h
+++ b/srcs/src.rcplex/Rcplex2.h
@@ -10,11 +10,11 @@
 #define my_error(x) { forceCplxClose = 1; error x; }
 
 /* Global Variables */
-CPXENVptr env;
-CPXLPptr lp;
-int numcalls;
-int max_numcalls;
-int forceCplxClose;
+extern CPXENVptr env;
+extern CPXLPptr lp;
+extern int numcalls;
+extern int max_numcalls;
+extern int forceCplxClose;
 
 /* Function definitions */
 SEXP getListElement( SEXP, char* );
diff --git a/srcs/src.rcplex/Rcplex_utils.c b/srcs/src.rcplex/Rcplex_utils.c
index 4f15ed4c..b71f1a91 100644
--- a/srcs/src.rcplex/Rcplex_utils.c
+++ b/srcs/src.rcplex/Rcplex_utils.c
@@ -4,6 +4,11 @@
 #include "Rcplex2.h"
 #include <time.h>
 
+CPXENVptr env;
+CPXLPptr lp;
+int numcalls;
+int forceCplxClose;
+
 void Rcplex_wait (int seconds) {
   clock_t endwait;
   endwait = clock() + seconds * CLOCKS_PER_SEC;

From c7744ee994026007300580d7851ae0c8b4083832 Mon Sep 17 00:00:00 2001
From: Zi-Ning Choo <zining01@gmail.com>
Date: Fri, 30 Apr 2021 16:09:21 -0400
Subject: [PATCH 02/35] get rid of unused parameters

---
 R/apps.R | 2 --
 1 file changed, 2 deletions(-)

diff --git a/R/apps.R b/R/apps.R
index 96d8fda8..1530b528 100644
--- a/R/apps.R
+++ b/R/apps.R
@@ -60,7 +60,6 @@
 #' @param verbose (integer)scalar specifying whether to do verbose output, value 2 will spit out MIP (1)
 #' @param tilim (numeric) time limit on MIP in seconds (10)
 #' @param epgap (numeric) relative optimality gap threshhold between 0 and 1 (default 1e-3)
-#' @param nsol (integer) number of solutions (default 1)
 #' @param debug (logical) returns list with names gg and sol. sol contains full RCPLEX solution. (default FALSE)
 #' 
 #' @return balanced gGraph maximally resembling input gg in CN while minimizing loose end penalty lambda.
@@ -82,7 +81,6 @@ balance = function(gg,
                    verbose = 1,
                    tilim = 10,
                    epgap = 1e-3,
-                   nsol = 1,
                    debug = FALSE)
 {
     if (verbose) {

From a25e97ebdec661e8c0f88453fdc4f5326b06a3c5 Mon Sep 17 00:00:00 2001
From: Zi-Ning Choo <zining01@gmail.com>
Date: Sat, 1 May 2021 19:08:18 -0400
Subject: [PATCH 03/35] initial commit apps.R

---
 R/apps.R | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/R/apps.R b/R/apps.R
index 1530b528..7d7daac7 100644
--- a/R/apps.R
+++ b/R/apps.R
@@ -61,6 +61,7 @@
 #' @param tilim (numeric) time limit on MIP in seconds (10)
 #' @param epgap (numeric) relative optimality gap threshhold between 0 and 1 (default 1e-3)
 #' @param debug (logical) returns list with names gg and sol. sol contains full RCPLEX solution. (default FALSE)
+#' @param gurobi (logical) use gurobi if TRUE uses gurobi else CPLEX default FALSE
 #' 
 #' @return balanced gGraph maximally resembling input gg in CN while minimizing loose end penalty lambda.
 #' @author Marcin Imielinski
@@ -81,7 +82,8 @@ balance = function(gg,
                    verbose = 1,
                    tilim = 10,
                    epgap = 1e-3,
-                   debug = FALSE)
+                   debug = FALSE,
+                   gurobi = FALSE)
 {
     if (verbose) {
         message("creating copy of input gGraph")

From f582cd40f3ff9af5f4f7d639e80629c1398a573f Mon Sep 17 00:00:00 2001
From: Zi-Ning Choo <zining01@gmail.com>
Date: Sun, 2 May 2021 18:51:03 -0400
Subject: [PATCH 04/35] initial commit gurobi wrapper

---
 R/utils.R | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/R/utils.R b/R/utils.R
index 8d63ab00..0d824287 100755
--- a/R/utils.R
+++ b/R/utils.R
@@ -1,6 +1,68 @@
 ## appease R CMD check vs data.table 
 sid=side1=side2=side_1=side_2=silent=snid=splice.variant=splicevar=str1=str2=strand_1=strand_2=subject.id=suffix=tag=threep.cc=threep.coord=threep.exon=threep.frame=threep.pc=threep.sc=threep.sc.frame=to=transcript.id.x=transcript.id.y=transcript_associated=transcript_id=twidth=tx.cc=tx.ec=tx_strand=tx_strand.x=tx_strand.y=txid=type=uid=uids=val=walk.id=walk.iid=walk.iid.x=walk.iid.y=wkid=NULL
 
+#' @name run.gurobi
+#' @title run.gurobi
+#'
+#' @description
+#' wrapper to run gurobi with CPLEX-like function call for easy switching bw optimizers
+#'
+#' @param cvec (numeric)
+#' @param Amat (sparse matrix)
+#' @param bvec (numeric)
+#' @param Qmat (sparse matrix)
+#' @param lb (numeric)
+#' @param ub (numeric)
+#' @param sense (character)
+#' @param vtype (variable type)
+#' @param objsense
+#' @param control
+#' 
+#' @return sol - list with names $x, $epgap, $status
+run.gurobi = function(cvec = NULL,
+                      Amat = NULL,
+                      bvec = NULL
+                      Qmat = NULL,
+                      lb = NULL,
+                      ub = NULL,
+                      sense = NULL,
+                      vtype = NULL,
+                      objsense = NULL,
+                      control = NULL) {
+
+    ## build model
+    model = list(
+        obj = cvec,
+        A = Amat,
+        rhs = bvec,
+        Q = Qmat,
+        lb = lb,
+        ub = ub,
+        vtype = vtype,
+        sense = c("E"="=", "G"=">", "L"="<")[sense] ## inequalities are leq, geq (e.g. not strict)
+        modelsense = objsense)
+
+    ## params
+    params = list()
+    if (!is.null(control$epgap)) {
+        params$MIPGap = epgap
+    }
+    if (!is.null(control$tilim)) {
+        params$TimeLimit = tilim
+    }
+
+    ## TODO: set up env list for running on compute cluster
+
+    ## run gurobi
+    sol = gurobi::gurobi(model = model, params = params)
+
+    ## make solution consistent with Rcplex output
+    ## but return all the things
+    sol$epgap = sol$mipgap
+
+    return(sol)
+}
+
 
 #' @name duplicated.matrix
 #' @title R-3.5.1 version of duplicated.matrix

From c1360765a1a102012296eeb232d974093435b333 Mon Sep 17 00:00:00 2001
From: Zi-Ning Choo <zining01@gmail.com>
Date: Sun, 2 May 2021 21:30:03 -0400
Subject: [PATCH 05/35] expose threads parameter

---
 R/utils.R | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/R/utils.R b/R/utils.R
index 0d824287..75e7cc0b 100755
--- a/R/utils.R
+++ b/R/utils.R
@@ -15,20 +15,24 @@ sid=side1=side2=side_1=side_2=silent=snid=splice.variant=splicevar=str1=str2=str
 #' @param ub (numeric)
 #' @param sense (character)
 #' @param vtype (variable type)
-#' @param objsense
-#' @param control
+#' @param objsense (character) default min
+#' @param control (list) should have epgap, tilim, trace, ideally
+#' @param threads (numeric) 
 #' 
 #' @return sol - list with names $x, $epgap, $status
 run.gurobi = function(cvec = NULL,
                       Amat = NULL,
-                      bvec = NULL
+                      bvec = NULL,
                       Qmat = NULL,
                       lb = NULL,
                       ub = NULL,
                       sense = NULL,
                       vtype = NULL,
-                      objsense = NULL,
-                      control = NULL) {
+                      objsense = 'min',
+                      control = list(epgap = 1e-2, tilim = 360, trace = 2),
+                      threads = 32) {
+
+    browser()
 
     ## build model
     model = list(
@@ -39,7 +43,7 @@ run.gurobi = function(cvec = NULL,
         lb = lb,
         ub = ub,
         vtype = vtype,
-        sense = c("E"="=", "G"=">", "L"="<")[sense] ## inequalities are leq, geq (e.g. not strict)
+        sense = c("E"="=", "G"=">", "L"="<")[sense], ## inequalities are leq, geq (e.g. not strict)
         modelsense = objsense)
 
     ## params
@@ -50,6 +54,9 @@ run.gurobi = function(cvec = NULL,
     if (!is.null(control$tilim)) {
         params$TimeLimit = tilim
     }
+    if (!is.null(control$trace)) {
+        params$LogToConsole = ifelse(control$trace > 0, 1, 0)
+    }
 
     ## TODO: set up env list for running on compute cluster
 

From a9313a53fc3c4aebf69c0c9c143e413ae6bd3b28 Mon Sep 17 00:00:00 2001
From: Zi-Ning Choo <zining01@gmail.com>
Date: Sun, 2 May 2021 21:45:47 -0400
Subject: [PATCH 06/35] add gurobi option to balance

---
 R/apps.R | 39 +++++++++++++++++++++++++++------------
 1 file changed, 27 insertions(+), 12 deletions(-)

diff --git a/R/apps.R b/R/apps.R
index 7d7daac7..08aca630 100644
--- a/R/apps.R
+++ b/R/apps.R
@@ -1302,18 +1302,33 @@ balance = function(gg,
   control = list(trace = ifelse(verbose>=2, 1, 0), tilim = tilim, epgap = epgap, round = 1)
   ## sol = Rcplex::Rcplex(cvec = cvec, Amat = Amat, bvec = bvec, Qmat = Qmat, lb = lb, ub = ub, sense = sense, vtype = vars$vtype, objsense = 'min', control = control)
 
-    ## call our wrapper for CPLEX
-    sol =  Rcplex2(cvec,
-                   Amat,
-                   bvec,
-                   Qmat = Qmat,
-                   lb = lb,
-                   ub = ub,
-                   sense = sense,
-                   vtype = vars$vtype,
-                   objsense = "min",
-                   control = control,
-                   tuning = FALSE)
+    if (gurobi) {
+        sol = run.gurobi(
+            cvec = cvec,
+            Amat = Amat,
+            bvec = bvec,
+            Qmat = Qmat,
+            lb = lb,
+            ub = ub,
+            sense = sense,
+            vtype = vars$vtype,
+            objsense = 'min',
+            control = control,
+            threads = 32)
+    } else {
+        ## call our wrapper for CPLEX
+        sol =  Rcplex2(cvec,
+                       Amat,
+                       bvec,
+                       Qmat = Qmat,
+                       lb = lb,
+                       ub = ub,
+                       sense = sense,
+                       vtype = vars$vtype,
+                       objsense = "min",
+                       control = control,
+                       tuning = FALSE)
+    }
     
   vars$cvec = cvec
   vars$x = sol$x

From ce35a7d32473bfda96f83fc686c9732ba33c37ae Mon Sep 17 00:00:00 2001
From: Zi-Ning Choo <zining01@gmail.com>
Date: Sun, 2 May 2021 21:46:00 -0400
Subject: [PATCH 07/35] bug fix in setting up parameters

---
 R/utils.R | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/R/utils.R b/R/utils.R
index 75e7cc0b..96da4ce4 100755
--- a/R/utils.R
+++ b/R/utils.R
@@ -32,8 +32,6 @@ run.gurobi = function(cvec = NULL,
                       control = list(epgap = 1e-2, tilim = 360, trace = 2),
                       threads = 32) {
 
-    browser()
-
     ## build model
     model = list(
         obj = cvec,
@@ -49,10 +47,10 @@ run.gurobi = function(cvec = NULL,
     ## params
     params = list()
     if (!is.null(control$epgap)) {
-        params$MIPGap = epgap
+        params$MIPGap = control$epgap
     }
     if (!is.null(control$tilim)) {
-        params$TimeLimit = tilim
+        params$TimeLimit = control$tilim
     }
     if (!is.null(control$trace)) {
         params$LogToConsole = ifelse(control$trace > 0, 1, 0)

From c3351550a3e22e312f57e5150de6d1cd1ab3aee2 Mon Sep 17 00:00:00 2001
From: Zi-Ning Choo <zining01@gmail.com>
Date: Mon, 3 May 2021 14:54:09 -0400
Subject: [PATCH 08/35] UDKJF

---
 .gitignore          |   1 -
 src/Makevars        |   3 +
 src/Makevars.win    |   7 +
 src/Rcplex2.c       | 375 ++++++++++++++++++++++++++++++++++++++++++++
 src/Rcplex2.h       |  28 ++++
 src/Rcplex_params.c | 261 ++++++++++++++++++++++++++++++
 src/Rcplex_utils.c  | 102 ++++++++++++
 7 files changed, 776 insertions(+), 1 deletion(-)
 create mode 100644 src/Makevars
 create mode 100644 src/Makevars.win
 create mode 100644 src/Rcplex2.c
 create mode 100644 src/Rcplex2.h
 create mode 100644 src/Rcplex_params.c
 create mode 100644 src/Rcplex_utils.c

diff --git a/.gitignore b/.gitignore
index 05fc351c..eb8b3afc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -71,7 +71,6 @@ docs/tutorial_cache/html/unnamed-chunk-3_a81070afea2cd322305d016d8dbace33.RData
 docs/tutorial_cache/html/unnamed-chunk-3_a81070afea2cd322305d016d8dbace33.rdb
 docs/debug.html
 docs/tutorial_2_files/figure-html/duplications and pyrgo-1.png
-src/
 tests/testthat/circos.pdf
 tests/testthat/test.json
 tests/testthat/test.pdf
diff --git a/src/Makevars b/src/Makevars
new file mode 100644
index 00000000..9d19401f
--- /dev/null
+++ b/src/Makevars
@@ -0,0 +1,3 @@
+PKG_CFLAGS=-m64 -fPIC -fno-strict-aliasing
+PKG_CPPFLAGS=-I${CPLEX_DIR}/cplex/include
+PKG_LIBS=-L${CPLEX_DIR}/cplex/lib/x86-64_linux/static_pic -lcplex -lm -lpthread
diff --git a/src/Makevars.win b/src/Makevars.win
new file mode 100644
index 00000000..5c9fb6d9
--- /dev/null
+++ b/src/Makevars.win
@@ -0,0 +1,7 @@
+ifeq "$(WIN)" "64"
+PKG_LIBS = -L"${CPLEX_DIR}/bin/x64_win64" -lcplex1263 -lm
+PKG_CPPFLAGS = -D_LP64 -I"${CPLEX_DIR}/include"
+else
+PKG_LIBS = -L"${CPLEX_DIR}/bin/x86_win32" -lcplex1263 -lm
+PKG_CPPFLAGS = -I"${CPLEX_DIR}/include"
+endif
diff --git a/src/Rcplex2.c b/src/Rcplex2.c
new file mode 100644
index 00000000..83c922b3
--- /dev/null
+++ b/src/Rcplex2.c
@@ -0,0 +1,375 @@
+// The actual solving procedure is called using the function Rcplex 
+#include "Rcplex2.h"
+
+int max_numcalls;
+
+SEXP Rcplex2(SEXP numcols_p,
+	     SEXP numrows_p,
+	     SEXP objsen_p,
+	     SEXP cvec,
+	     SEXP bvec,
+	     SEXP Amat,
+	     SEXP Qmat,
+	     SEXP lb_p,
+	     SEXP ub_p,
+	     SEXP Rsense,
+	     SEXP Rvtype,
+	     SEXP isQP_p,
+	     SEXP isMIP_p,
+	     SEXP num_poplim,
+	     SEXP control,
+	     SEXP maxcalls,
+	     SEXP tuning)
+{
+  char *probname = "Rcplex";
+  int numcols    = INTEGER(numcols_p)[0]; 
+  int numrows    = INTEGER(numrows_p)[0];
+  int objsen     = INTEGER(objsen_p)[0];
+  double *lb     = REAL(lb_p); 
+  double *ub     = REAL(ub_p);
+
+  char sense[numrows];
+  char vtype[numcols];
+  //  double dj[numrows];
+  int isQP       = INTEGER(isQP_p)[0];
+  int isMIP      = INTEGER(isMIP_p)[0];
+  int trace      = INTEGER(getListElement(control,"trace"))[0];
+
+  SEXP res = NULL; /* set to avoid uninitialized warning */
+  SEXP xopt;
+  SEXP epgap;
+  SEXP obj;
+  SEXP solstat;
+  SEXP extra;
+  SEXP lambda;
+  SEXP slack;
+  SEXP nodecnt;
+
+  int status;
+  int i, j;
+  int cur_numrows, cur_numcols;
+
+  int tstat;
+  /*
+   * solution pools not supprted until cplex 11.0
+   */
+#if CPX_VERSION >= 1100 
+  int num_sol = 1;
+  SEXP tmp;
+#endif
+
+  /* set maxnumcalls before init */
+  max_numcalls = INTEGER(maxcalls)[0];
+  /* initialize CPLEX environment */
+  Rcplex_init();
+
+  if(trace) Rprintf("Rcplex: num variables=%d num constraints=%d\n",numcols,numrows);
+
+  /*
+   * solution pools not supported until cplex 11.0
+   */
+#if CPX_VERSION < 1100
+  if (INTEGER(num_poplim)[0] > 1) {
+    warning("Multiple solutions not supported in CPLEX version");
+    INTEGER(num_poplim)[0] = 1;
+  }
+#endif 
+
+  /* lb and ub */
+  for (j = 0; j < numcols; ++j) {
+    lb[j] = R_finite(lb[j]) ? lb[j] : -CPX_INFBOUND;
+    ub[j] = R_finite(ub[j]) ? ub[j] : CPX_INFBOUND;
+  }
+
+  /* set constraint inequality directions */
+  for (i = 0; i < numrows; ++i) {
+    sense[i] = CHAR(STRING_ELT(Rsense, i))[0];
+  }
+
+  /* set variable types */
+  if (isMIP) {
+    for (i = 0; i < numcols; ++i) {
+      vtype[i] = CHAR(STRING_ELT(Rvtype, i))[0];
+    }
+  }
+
+  /* set parameters given in control list */
+  setparams(env, control, isQP, isMIP);
+  lp = CPXcreateprob(env, &status, probname);
+
+
+  /* check memory problems */
+  if (lp == NULL) {
+    my_error(("Failed to create LP.\n"));
+  }
+
+  /* copy problem data into lp */
+  status = CPXcopylp(env, lp, numcols, numrows, objsen,REAL(cvec),
+		     REAL(bvec), sense,
+		     INTEGER(VECTOR_ELT(Amat, 0)), 
+		     INTEGER(VECTOR_ELT(Amat, 1)),
+		     INTEGER(VECTOR_ELT(Amat, 2)),
+		     REAL(VECTOR_ELT(Amat, 3)),
+		     lb, ub, NULL);
+  if (status) {
+    my_error(("Failed to copy problem data.\n"));
+  }
+
+  if (isQP) {
+    status = CPXcopyquad(env, lp,
+			 INTEGER(VECTOR_ELT(Qmat, 0)),
+			 INTEGER(VECTOR_ELT(Qmat, 1)),
+			 INTEGER(VECTOR_ELT(Qmat, 2)),
+			 REAL(VECTOR_ELT(Qmat, 3)));
+    if (status) {
+      my_error(("Failed to copy quadratic term of problem data.\n"));
+    }
+  }
+
+  if (isMIP) {
+    status = CPXcopyctype(env, lp, vtype);
+    if (status) {
+      my_error(("Failed to copy vtype.\n"));
+    }
+  }
+
+  /* solve problem */
+  if(isMIP) {
+
+    /* MARCIN ADDED set starts */
+    setstarts(env, lp, control, isMIP);
+
+    int do_tuning = asLogical(tuning);
+    if (do_tuning){
+      /* set the tuning total tilim to hard-setted value 600s */
+      SEXP tuning_control = setListElement(control, "tilim", 600);
+      /* temporarily change the tilim*/
+      setparams(env, tuning_control, isQP, isMIP);
+      
+      /* tune the hidden parameters */
+      status = CPXtuneparam(env, lp,
+			    0, NULL, NULL, // zero fixed int par
+			    0, NULL, NULL,
+			    0, NULL, NULL, &tstat);
+      if (status) {
+	my_error(("Failed to tune params."));
+      }
+      /* Write the optimized parameters to file */
+      status = CPXwriteparam (env, "tuned.prm");
+
+      setparams(env, control, isQP, isMIP);
+      /* MARCIN ADDED set starts */
+      setstarts(env, lp, control, isMIP);
+    } else {
+      /* Write the optimized parameters to file */
+      status = CPXwriteparam (env, "raw.prm");
+    }
+
+    status = CPXmipopt(env, lp);
+    
+    /*
+     * solutions pool not supported for versions of cplex < 11
+     * e.g., CPX_PARAM_POPULATELIM is not defined
+     */
+
+#if CPX_VERSION >= 1100
+    if(INTEGER(num_poplim)[0] > 1){
+      /* in MIPs it is possible to have more than 1 solution. If
+	 num_poplim > 1 we now try to find these solutions.
+	 For populating the solution pool a couple of parameters are
+	 relevant:
+	 1. the 'absolute gap' solution pool parameter
+	 CPX_PARAM_SOLNPOOLAGAP (see Rcplex C control parameters), 
+	 2. the 'solution pool intensity' parameter
+	 CPX_PARAM_SOLNPOOLINTENSITY (see Rcplex C control parameters),
+	 3. the 'limit on number of solutions generated' for solution
+	 pool CPX_PARAM_POPULATELIM */
+      status = CPXsetintparam(env, CPX_PARAM_POPULATELIM, 
+			      INTEGER(num_poplim)[0] - 1);
+      if (status) {
+	my_error(("Failed to set 'populate limit' parameter.\n"));
+      }
+      /* now populate the solutions pool */
+      status = CPXpopulate(env, lp);
+    }
+#endif
+  }
+  else if (isQP) {
+    status = CPXqpopt(env, lp);
+  }
+  else {
+    status = CPXlpopt(env, lp);
+  }
+  
+  /* a status value of zero does not necessarily mean that an optimal
+     solution exists. Examples of an exit status of non-zero here
+     include CPXERR_NO_PROBLEM, CPXERR_NO_MEMORY, ... */
+  if (status) {
+    my_error(("Failed to optimize problem."));
+  }
+  
+  PROTECT(solstat = allocVector(INTSXP,  1));
+
+  /* retrieve status of optimization */
+  *INTEGER(solstat) = CPXgetstat(env, lp);
+
+  if (isMIP) {
+    /* MIP optimization */
+    /* in MIPs it is possible to have more than 1 solution
+       FIXME: use solution pool by default? */
+    
+    cur_numrows = CPXgetnumrows(env, lp);
+    cur_numcols = CPXgetnumcols(env, lp);
+
+    /* if the 'n' parameter is set in R we always return a list with
+       a number of elements equal to the number of solutions */
+    if(INTEGER(num_poplim)[0] > 1){
+      /*
+       * solution pools not supported until cplex 11.0
+       */
+#if CPX_VERSION >= 1100 
+      num_sol = CPXgetsolnpoolnumsolns(env, lp);
+
+      /* MIP optimization results -> more than 1 solution */
+      PROTECT(res = allocVector(VECSXP, num_sol));
+
+      /* now retrieve multiple solutions if any */
+      for( i = 0; i < num_sol; i++){
+	PROTECT(tmp  = allocVector(VECSXP, 4));
+    	PROTECT(xopt = allocVector(REALSXP,  cur_numcols));
+    	status = CPXgetsolnpoolx(env, lp, i, REAL(xopt), 0, cur_numcols - 1);
+    	
+	SET_VECTOR_ELT(tmp, 0, xopt);
+
+	PROTECT(obj   = allocVector(REALSXP, 1));
+	status = CPXgetsolnpoolobjval(env, lp, i, REAL(obj));
+	/* if no integer solution exists, return NA */
+	if (status) {
+	  *REAL(obj) = NA_REAL;
+	}
+	SET_VECTOR_ELT(tmp, 1, obj);
+	
+	/* extra info */
+    	PROTECT(slack  = allocVector(REALSXP,  cur_numrows));
+    	status = CPXgetmipslack(env, lp, REAL(slack), 0, cur_numrows - 1);
+    	PROTECT(nodecnt = allocVector(INTSXP, 1));
+	*INTEGER(nodecnt) = CPXgetnodecnt(env, lp);
+
+	SET_VECTOR_ELT(tmp, 2, solstat); 
+
+	PROTECT(extra   = allocVector(VECSXP,  2));
+	SET_VECTOR_ELT(extra, 0, nodecnt);
+	SET_VECTOR_ELT(extra, 1, slack);
+	SET_VECTOR_ELT(tmp, 3, extra);
+	
+	/* add solution to return vector */
+	SET_VECTOR_ELT(res, i, tmp);
+
+    	UNPROTECT(6);
+      } /* END FOR */
+#endif /* end #if CPX_VERSION >= 1100 */
+
+    } /* END multiple solutions */
+    else {
+      /* MIP optimization 1 solution */
+      PROTECT(res   = allocVector(VECSXP,  5));
+      PROTECT(obj   = allocVector(REALSXP, 1));
+      PROTECT(xopt  = allocVector(REALSXP, numcols));
+      PROTECT(epgap  = allocVector(REALSXP, 1)); /* added by Marcin */
+      PROTECT(extra = allocVector(VECSXP,  2));
+      PROTECT(slack = allocVector(REALSXP, numrows));
+
+      status = CPXgetmipobjval(env, lp, REAL(obj));
+      /* if no integer solution exists, return NA */
+      if (status) {
+        *REAL(obj) = NA_REAL;
+      }
+
+      status = CPXgetmipx(env, lp, REAL(xopt), 0, cur_numcols - 1);
+      if (status) {
+	for(i = 0; i < cur_numcols; i++)
+	  REAL(xopt)[i] = NA_REAL;
+      }
+      
+      status = CPXgetmipslack(env, lp, REAL(slack), 0, cur_numrows - 1);
+      if (status) {
+	for(i = 0; i < cur_numrows; i++)
+	  REAL(slack)[i] = NA_REAL;
+      }
+
+      status = CPXgetmiprelgap(env, lp, REAL(epgap)); /* added by Marcin */
+
+      /* Provide some little extra information */
+      PROTECT(nodecnt = allocVector(INTSXP, 1));
+      *INTEGER(nodecnt) = CPXgetnodecnt(env, lp);
+      SET_VECTOR_ELT(extra, 0, nodecnt);
+      SET_VECTOR_ELT(extra, 1, slack);
+    
+      /* Create return vector for MIP 1 solution*/
+      SET_VECTOR_ELT(res, 0, xopt); 
+      SET_VECTOR_ELT(res, 1, obj); 
+      SET_VECTOR_ELT(res, 2, solstat); 
+      SET_VECTOR_ELT(res, 3, extra);
+      SET_VECTOR_ELT(res, 4, epgap); /* added by Marcin */
+      
+      UNPROTECT(6);
+    } /* END MIP optimization 1 solution */
+  } /* END MIP optimization */
+  else {
+    /* continuous optimization */
+    PROTECT(obj   = allocVector(REALSXP, 1));
+    PROTECT(xopt  = allocVector(REALSXP, numcols));
+    PROTECT(extra = allocVector(VECSXP,  2));
+    PROTECT(slack = allocVector(REALSXP, numrows));
+
+    status = CPXgetobjval(env, lp, REAL(obj));
+    if (status) {
+      *REAL(obj) = NA_REAL;
+    }
+
+    cur_numrows = CPXgetnumrows(env, lp);
+    cur_numcols = CPXgetnumcols(env, lp);
+    status = CPXgetx(env, lp, REAL(xopt), 0, cur_numcols - 1);
+    if (status) {
+      for(i = 0; i < cur_numcols; i++)
+	REAL(xopt)[i] = NA_REAL;
+    }
+
+    status = CPXgetslack(env, lp, REAL(slack), 0, cur_numrows - 1);
+    if (status) {
+      for(i = 0; i < cur_numrows; i++)
+	REAL(slack)[i] = NA_REAL;
+    }
+
+    /* Provide some little extra information */
+    PROTECT(lambda = allocVector(REALSXP, numrows));
+    status = CPXgetpi(env, lp, REAL(lambda), 0, cur_numrows - 1);
+    if (status) {
+      for(i = 0; i < cur_numrows; i++)
+	REAL(lambda)[i] = NA_REAL;
+    }
+    SET_VECTOR_ELT(extra, 0, lambda);
+    SET_VECTOR_ELT(extra, 1, slack);
+
+    /* Create return vector for continuous solution */
+    PROTECT(res = allocVector(VECSXP,4));
+    SET_VECTOR_ELT(res, 0, xopt); 
+    SET_VECTOR_ELT(res, 1, obj); 
+    SET_VECTOR_ELT(res, 2, solstat); 
+    SET_VECTOR_ELT(res, 3, extra);
+    
+    UNPROTECT(5);
+  } /* END IF continuous optimization */
+  
+
+  UNPROTECT(2); /* unprotect return-vector res and solstat */
+  
+  /* Reset all CPLEX parameters to default values */
+  status = CPXsetdefaults(env);
+  if (status) {
+    Rprintf("Status: %d", status);
+    my_error(("Failed to set parameters to default.\n"));
+  }
+  return(res);
+}
+
diff --git a/src/Rcplex2.h b/src/Rcplex2.h
new file mode 100644
index 00000000..f155a60d
--- /dev/null
+++ b/src/Rcplex2.h
@@ -0,0 +1,28 @@
+#ifndef _RCPLEX_H
+#define _RCPLEX_H
+
+#include <ilcplex/cplex.h>
+#include <stdlib.h>
+#include <R.h>
+#include <Rinternals.h>
+#include <unistd.h>
+
+#define my_error(x) { forceCplxClose = 1; error x; }
+
+/* Global Variables */
+extern CPXENVptr env;
+extern CPXLPptr lp;
+extern int numcalls;
+extern int max_numcalls;
+extern int forceCplxClose;
+
+/* Function definitions */
+SEXP getListElement( SEXP, char* );
+SEXP setListElement( SEXP, char*, double);
+void setparams( CPXENVptr, SEXP, int, int );
+void setstarts( CPXENVptr, CPXLPptr, SEXP, int );
+void Rcplex_init( void );
+void Rcplex_close( void );
+void Rcplex_free( void );
+
+#endif
diff --git a/src/Rcplex_params.c b/src/Rcplex_params.c
new file mode 100644
index 00000000..45874ed6
--- /dev/null
+++ b/src/Rcplex_params.c
@@ -0,0 +1,261 @@
+#include "Rcplex2.h"
+
+void setparams(CPXENVptr env, SEXP control, int isQP, int isMIP) {
+  int i, status, value;
+  const char *cur_parm;
+  SEXP names;
+  
+  /* get list names */
+  PROTECT(names = getAttrib(control, R_NamesSymbol));
+
+  status = 1; /* avoid warning */
+
+  /* for each element in 'control' try to set the corresponding
+     parameter */
+  for (i = 0; i < length(control); i++) {
+
+    cur_parm = CHAR(STRING_ELT(names, i));
+    
+    /* trace - CPX_PARAM_SCRIND */
+    if(strcmp(cur_parm, "trace") == 0) {
+      status = CPXsetintparam(env, CPX_PARAM_SCRIND, 
+		  *INTEGER(VECTOR_ELT(control, i)) ? CPX_ON : CPX_OFF);
+    }
+    /* method */
+    else if(strcmp(cur_parm, "method") == 0) {
+      switch((value = *INTEGER(VECTOR_ELT(control, i)))) {
+      case 0:
+	value = CPX_ALG_AUTOMATIC;
+	break;
+      case 1:
+	value = CPX_ALG_PRIMAL;
+	break;
+      case 2:
+	value = CPX_ALG_DUAL;
+	break;
+      case 3:
+	value = CPX_ALG_NET;
+	break;
+      case 4:
+	value = CPX_ALG_BARRIER;
+	break;
+      default:
+	warning("Unknown optimization method %d, using default\n", value);
+	value = CPX_ALG_AUTOMATIC;
+      }
+      /* Do we have a QP or a LP?*/
+      if (isQP)
+	status = CPXsetintparam(env, CPX_PARAM_QPMETHOD, value);
+      else
+	status = CPXsetintparam(env, CPX_PARAM_LPMETHOD, value);
+    }
+    else if(strcmp(cur_parm, "preind") == 0) {
+      status = CPXsetintparam(env, CPX_PARAM_PREIND,
+		  *INTEGER(VECTOR_ELT(control, i)) ? CPX_ON : CPX_OFF);
+    }
+    else if(strcmp(cur_parm, "aggind") == 0) {
+      status = CPXsetintparam(env, CPX_PARAM_AGGIND,
+			      *INTEGER(VECTOR_ELT(control, i)));
+    }
+    else if(strcmp(cur_parm, "itlim") == 0) {
+      status = CPXsetintparam(env, CPX_PARAM_ITLIM, 
+			      *INTEGER(VECTOR_ELT(control, i)));
+    }
+    else if(strcmp(cur_parm, "epgap") == 0) {
+      status = CPXsetdblparam(env, CPX_PARAM_EPGAP,
+			      *REAL(VECTOR_ELT(control, i)));
+    }
+    else if(strcmp(cur_parm, "epagap") == 0) {
+      status = CPXsetdblparam(env, CPX_PARAM_EPAGAP, 
+    			      *REAL(VECTOR_ELT(control, i)));
+    }
+    else if(strcmp(cur_parm,"tilim") == 0) {
+      status = CPXsetdblparam(env, CPX_PARAM_TILIM, 
+			      *REAL(VECTOR_ELT(control, i)));
+    }
+    else if(strcmp(cur_parm, "mipemphasis") == 0) {
+      switch((value = *INTEGER(VECTOR_ELT(control, i)))) {
+      case 0:
+	value = CPX_MIPEMPHASIS_BALANCED;
+	break;
+      case 1:
+	value = CPX_MIPEMPHASIS_FEASIBILITY;
+	break;
+      case 2:
+	value = CPX_MIPEMPHASIS_OPTIMALITY;
+	break;
+      case 3:
+	value = CPX_MIPEMPHASIS_BESTBOUND;
+	break;
+      case 4:
+	value = CPX_MIPEMPHASIS_HIDDENFEAS;
+	break;
+      default:
+	warning("Unknown mip emphasis setting %d, using default\n", value);
+	value = CPX_MIPEMPHASIS_BALANCED;
+      }
+      status = CPXsetintparam(env, CPX_PARAM_MIPEMPHASIS, value);
+    }
+    else if(strcmp(cur_parm, "disjcuts") == 0) {
+      status = CPXsetintparam(env, CPX_PARAM_DISJCUTS, 
+			      *INTEGER(VECTOR_ELT(control,i)));
+    }
+    else if(strcmp(cur_parm, "cliques") == 0) {
+      status = CPXsetintparam(env, CPX_PARAM_CLIQUES, 
+			      *INTEGER(VECTOR_ELT(control, i)));
+    }
+    else if(strcmp(cur_parm,"nodesel") == 0) {
+      switch((value = *INTEGER(VECTOR_ELT(control, i)))) {
+      case 0:
+	value = CPX_NODESEL_DFS;
+	break;
+      case 1:
+	value = CPX_NODESEL_BESTBOUND;
+	break;
+      case 2:
+	value = CPX_NODESEL_BESTEST;
+	break;
+      case 3:
+	value = CPX_NODESEL_BESTEST_ALT;
+	break;
+      default:
+	warning("Unknown node selection strategy %d, using default\n", value);
+	value = CPX_NODESEL_BESTBOUND;
+      }
+      status = CPXsetintparam(env, CPX_PARAM_NODESEL, value);
+    }
+    else if(strcmp(cur_parm, "probe") == 0) {
+      status = CPXsetintparam(env, CPX_PARAM_PROBE,
+			      *INTEGER(VECTOR_ELT(control, i)));
+    }
+    else if(strcmp(cur_parm, "varsel") == 0) {
+      switch((value = *INTEGER(VECTOR_ELT(control, i)))) {
+      case -1:
+	value = CPX_VARSEL_MININFEAS;
+	break;
+      case 0:
+	value = CPX_VARSEL_DEFAULT;
+	break;
+      case 1:
+	value = CPX_VARSEL_MAXINFEAS;
+	break;
+      case 2:
+	value = CPX_VARSEL_PSEUDO;
+	break;
+      case 3:
+	value = CPX_VARSEL_STRONG;
+	break;
+      case 4:
+	value = CPX_VARSEL_PSEUDOREDUCED;
+	break;
+      default:
+	warning("Unknown variable selection strategy %d, using default\n", 
+		value);
+	value = CPX_VARSEL_DEFAULT;
+      }
+      status = CPXsetintparam(env, CPX_PARAM_VARSEL, value);
+    }
+    else if(strcmp(cur_parm, "flowcovers") == 0) {
+      status = CPXsetintparam(env, CPX_PARAM_FLOWCOVERS,
+			      *INTEGER(VECTOR_ELT(control, i)));
+    }
+    else if(strcmp(cur_parm, "mipstart") == 0) {
+    }
+    else if(strcmp(cur_parm, "solnpoolagap") == 0){
+      /* solution pool parameters */
+      #if CPX_VERSION >= 1100 
+      status = CPXsetdblparam(env, CPX_PARAM_SOLNPOOLAGAP,
+      			      *REAL(VECTOR_ELT(control, i)));
+      #endif
+    }
+    else if(strcmp(cur_parm, "solnpoolgap") == 0){
+      /* solution pool parameters */
+      #if CPX_VERSION >= 1100
+      status = CPXsetdblparam(env, CPX_PARAM_SOLNPOOLGAP,
+			      *REAL(VECTOR_ELT(control, i)));
+      #endif
+    }
+    else if(strcmp(cur_parm, "solnpoolintensity") == 0){
+      #if CPX_VERSION >= 1100
+      status = CPXsetintparam(env, CPX_PARAM_SOLNPOOLINTENSITY, 
+    			      *INTEGER(VECTOR_ELT(control, i)));
+      #endif
+    }
+    else if (strcmp(cur_parm, "breaksymmetry") == 0){ /*ADDED BY XT*/
+      #if CPX_VERSION >= 1100
+      status = CPXsetintparam(env, CPX_PARAM_SYMMETRY,
+    			      *INTEGER(VECTOR_ELT(control, i)));
+      #endif
+    } else if (strcmp(cur_parm, "tuning.display") == 0){
+      #if CPX_VERSION >= 1100
+      status = CPXsetintparam(env, CPX_PARAM_TUNINGDISPLAY,
+    			      *INTEGER(VECTOR_ELT(control, i)));
+      #endif
+    } else if (strcmp(cur_parm, "tuning.rep") == 0){
+      #if CPX_VERSION >= 1100
+      status = CPXsetintparam(env, CPX_PARAM_TUNINGREPEAT,
+    			      *INTEGER(VECTOR_ELT(control, i)));
+      #endif
+    } else if (strcmp(cur_parm, "tuning.tilim") == 0){
+      #if CPX_VERSION >= 1100
+      status = CPXsetdblparam(env, CPX_PARAM_TUNINGTILIM,
+    			      *REAL(VECTOR_ELT(control, i)));
+      #endif
+    }
+    else {
+      /* If parameter not known print a warning */
+      warning("Unknown CPLEX parameter %s. Ignoring it.\n", cur_parm);
+    }
+
+    if (status)
+      my_error(("Failure to set parameter %s, error %d.\n", cur_parm, status));
+  }
+  
+  UNPROTECT(1);
+}
+
+/* ADDED BY MARCIN!!
+ * 
+ */
+void setstarts(CPXENVptr env, CPXLPptr lp, SEXP control, int isMIP) {
+  int i, status, value;
+  const char *cur_parm;
+  SEXP names;
+  
+  /* get list names */
+  PROTECT(names = getAttrib(control, R_NamesSymbol));
+
+  status = 1; /* avoid warning */
+
+  /* for each element in 'control' try to set the corresponding
+     parameter */
+  if (isMIP) {
+    for (i = 0; i < length(control); i++) {
+
+      cur_parm = CHAR(STRING_ELT(names, i));
+
+      if(strcmp(cur_parm, "mipstart") == 0) {
+        SEXP values_vec = VECTOR_ELT(control, i);
+        double* values = REAL(values_vec);
+        int mcnt = 1;
+        int beg = 0;
+        int nzcnt = LENGTH(values_vec);
+        SEXP tmp = allocVector(INTSXP, nzcnt);
+        int *varindices = INTEGER(tmp);
+        int effortlevel = CPX_MIPSTART_AUTO;
+        char *mipstartname = "start";
+
+        for (int i = 0; i < nzcnt; i++) {
+          varindices[i] = i;
+        }
+
+        status = CPXaddmipstarts(env, lp, mcnt, nzcnt, &beg, varindices, values, &effortlevel, &mipstartname);
+
+        if (status)
+          my_error(("Failure to set parameter %s in setstarts, error %d.\n", cur_parm, status));
+      }
+    }
+  }
+  
+  UNPROTECT(1);
+}
diff --git a/src/Rcplex_utils.c b/src/Rcplex_utils.c
new file mode 100644
index 00000000..b71f1a91
--- /dev/null
+++ b/src/Rcplex_utils.c
@@ -0,0 +1,102 @@
+// Utility functions like CPLEX initialization/closing routines and
+// list accessors
+
+#include "Rcplex2.h"
+#include <time.h>
+
+CPXENVptr env;
+CPXLPptr lp;
+int numcalls;
+int forceCplxClose;
+
+void Rcplex_wait (int seconds) {
+  clock_t endwait;
+  endwait = clock() + seconds * CLOCKS_PER_SEC;
+  while (clock() < endwait) {}
+}
+
+void Rcplex_init(void) {
+  int numtries = 10;
+  int status;
+  char errmsg[1024];
+
+  /* Initialize CPLEX environment */
+  if (env == NULL) {
+    env = CPXopenCPLEX (&status);
+    while(env == NULL && numtries > 0) {
+      Rcplex_wait(30);
+      numtries--;
+    }
+    if (env == NULL) {
+      CPXgeterrorstring(env, status, errmsg);
+      error("Could not open CPLEX environment.\n%s\n",errmsg);
+    }
+    REprintf("CPLEX environment opened\n");
+    numcalls = max_numcalls;
+  } 
+  else {
+    numcalls--;
+  }
+  forceCplxClose = 0;
+}
+
+void Rcplex_close(void) {
+  forceCplxClose = 1;
+  Rcplex_free();
+}
+
+void Rcplex_free(void) {
+  int status1, status2;
+  char errmsg[1024];
+
+  status1 = status2 = 0;
+  if (lp != NULL) {
+    status1 = CPXfreeprob(env,&lp);
+    /*    REprintf("Freed CPLEX problem\n");*/
+    lp = NULL;
+  }
+
+  if (env != NULL && (numcalls == 0 || forceCplxClose)) {
+    status2 = CPXcloseCPLEX(&env);
+    REprintf("Closed CPLEX environment\n");
+    env = NULL;
+  }
+
+  if (status1 || status2) {
+    status2 ? strcpy(errmsg,"env close ok") : CPXgeterrorstring(env,status2,errmsg);
+    error("Rcplex_free failed: free problem code: %d\nClose environment msg: %s\n",status1,errmsg);
+  }
+  return;
+}
+
+SEXP getListElement(SEXP list, char *str) {
+  SEXP element = R_NilValue, names = getAttrib(list, R_NamesSymbol);
+  int i;
+
+  for (i=0; i < length(list); i++) {
+    if (strcmp(CHAR(STRING_ELT(names,i)), str) == 0) {
+      element = VECTOR_ELT(list,i);
+      break;
+    }
+  }
+  return element;
+}
+
+SEXP setListElement(SEXP list, char* str, double value) {
+  /* SEXP element = R_NilValue, */
+  SEXP out = allocVector(VECSXP, length(list));
+  SEXP names = getAttrib(list, R_NamesSymbol);
+
+  setAttrib(out, R_NamesSymbol, names);
+  int i;
+
+  for (i=0; i < length(list); i++) {
+    if (strcmp(CHAR(STRING_ELT(names,i)), str) == 0) {
+      /* element = VECTOR_ELT(list,i); */
+      SET_VECTOR_ELT(out, i, ScalarReal(value));
+    } else {
+      SET_VECTOR_ELT(out, i, VECTOR_ELT(list, i));
+    }
+  }
+  return out;
+}

From 403831289f85c295f37878c0ab256e2323d317e0 Mon Sep 17 00:00:00 2001
From: Zi-Ning Choo <zining01@gmail.com>
Date: Mon, 3 May 2021 19:17:00 -0400
Subject: [PATCH 09/35] remove Rcplex dependency

---
 R/apps.R   |   38 +-
 R/apps.txt | 1856 ----------------------------------------------------
 R/gGnome.R |   37 +-
 3 files changed, 36 insertions(+), 1895 deletions(-)
 delete mode 100644 R/apps.txt

diff --git a/R/apps.R b/R/apps.R
index 08aca630..526415d2 100644
--- a/R/apps.R
+++ b/R/apps.R
@@ -1300,7 +1300,6 @@ balance = function(gg,
   ub = vars$ub
 
   control = list(trace = ifelse(verbose>=2, 1, 0), tilim = tilim, epgap = epgap, round = 1)
-  ## sol = Rcplex::Rcplex(cvec = cvec, Amat = Amat, bvec = bvec, Qmat = Qmat, lb = lb, ub = ub, sense = sense, vtype = vars$vtype, objsense = 'min', control = control)
 
     if (gurobi) {
         sol = run.gurobi(
@@ -2846,21 +2845,20 @@ fitcn = function (gw, cn.field = "cn", trim = TRUE, weight = NULL, obs.mat = NUL
     ##     ub = rep(ub, length.out = pmax(len(lb), len(ub)))
     ## }
     ## TODO: implement lb and ub of walk CNs
-    sol = Rcplex::Rcplex(
-        cvec = c,
-        Amat = A,
-        bvec = b,
-        sense = sense, 
-        Qmat = NULL,
-        lb = lb,
-        ub = ub,
-        n = n.sol,
-        objsense = "min", 
-        vtype = vtype,
-        control = list(
-            trace = ifelse(verbose >= 1, 1, 0),
-            tilim = 100,
-            epgap = 1))
+    sol = Rcplex2(cvec = c,
+                  Amat = A,
+                  bvec = b,
+                  sense = sense, 
+                  Qmat = NULL,
+                  lb = lb,
+                  ub = ub,
+                  n = n.sol,
+                  objsense = "min", 
+                  vtype = vtype,
+                  control = list(
+                      trace = ifelse(verbose >= 1, 1, 0),
+                      tilim = 100,
+                      epgap = 1))
     
     if (!is.null(sol$xopt)) {
         sol = list(sol)
@@ -2878,10 +2876,10 @@ fitcn = function (gw, cn.field = "cn", trim = TRUE, weight = NULL, obs.mat = NUL
         Ahat = rbind(A, P)
         bhat = c(b, p)
         sensehat = c(sense, rep("L", length(p)))
-        sol.new = Rcplex::Rcplex(cvec = c, Amat = Ahat, bvec = bhat, 
-            sense = sensehat, Qmat = NULL, lb = lb, ub = ub, 
-            n = n.sol, objsense = "min", vtype = vtype, control = list(trace = ifelse(verbose >= 
-                1, 1, 0), tilim = 100, epgap = 1))
+        sol.new = Rcplex2(cvec = c, Amat = Ahat, bvec = bhat, 
+                          sense = sensehat, Qmat = NULL, lb = lb, ub = ub, 
+                          n = n.sol, objsense = "min", vtype = vtype, control = list(trace = ifelse(verbose >= 
+                                                                                                    1, 1, 0), tilim = 100, epgap = 1))
         if (length(sol.new) == 0) {
             rerun = F
         }
diff --git a/R/apps.txt b/R/apps.txt
deleted file mode 100644
index 21c7f517..00000000
--- a/R/apps.txt
+++ /dev/null
@@ -1,1856 +0,0 @@
-#' applications of gGraph
-
-#' @name balance
-#' @title balance gGnome graphs 
-#' @description
-#'
-#' Here we analyze gGraphs with "cn" (copy number) field to enforce integer
-#' cn and junction balance, ie sum of incoming (or outgoing) edge
-#' cn should be equal to node copy cn.
-#'
-#' The goal is to find a balaned assignment of "cn" to the nodes and edges of the gGraph
-#' that maximally resemble the input weights while minimizing the loose end penalty.
-#' The similarity / distance function can be weighted by optional node / edge
-#' metadata field $weight (when weighted = TRUE). 
-#'
-#' To output this gGraph, we design a MIP with
-#' (1) objective function that minimizes (weighted sum of square) distance of fit node and junction copy number to provided values in
-#'     $cn field 
-#' (2) and lambda* the sum of copy number at non terminal loose ends subject to 
-#' (3) junction balance constraint
-#' (4) fixing copy number of a subset of nodes and junctions
-#'
-#' Objective weight can be modulated at nodes and edges with $weight metadata
-#' field (default node weight is node width, and edge weight is 1).
-#' These fields will then set the penalty incurred to a fit of x to that node / edge
-#' with copy number c and weight w as (x-c)^2/w.
-#' 
-#' Lambda can be modulated at nodes with $lambda node metadata field (default 1)
-#'
-#' For "haplographs" ie graphs that have more than one node overlapping a given location, it may
-#' be important to constrain total copy number using a haploid read depth signal.
-#' The marginal parameter enables this through a GRanges annotated with $cn and optionally $weight
-#' field that provides a target total copy number that the optimization will attempt to satisfy.
-#' This provided copy number c and weight w (default 1) will be evaluated against the
-#' sum s of the fit copy numbers of all nodes overlapping that location by adding a penalty
-#' of (c-s)^2/w to the corresponding solution. marginal can also have an optional logical field
-#' $fix that will actually constrain the marginal copy number to be equal to the provided value
-#' (note: that the optimization may be infeasible, and function will error out)
-#' 
-#' Additional controls can be inputted by changing the graph metadata - e.g. adding fields
-#' $lb and $ub to nodes and edges will constrain their fit copy number to those bounds.
-#' Adding $reward field to edges will add a reward for each copy of that edge in the solution.
-#' 
-#' 
-#' @param gg gGraph with field $cn, can be NA for some nodes and edges, optional field $weight which will adjust the quadratic penalty on the fit to x as (x-$cn)^2/weight
-#' @param lambda positive number specifying default loose end penalty (100), note if gg$node metadata contain $lambda field then this lambda will be multiplied by the node level lambda
-#' @param marginal GRanges with field $cn and optional $weight field will be used to fit the summed values at each base of the genome to optimally fit the marginal value, optional field $fix will actually constrain the marginal to be the provided value
-#' @param tight indices or epxression on node metadata specifying at which nodes to disallow loose ensd
-#' @param nfix indices or expression on node metadata specifying which node cn to fix
-#' @param efix indices or expression on edge metadata specifying which edge cn to fix
-#' @param nrelax indices or expression on node metadata specifying which nodes cn to relax
-#' @param erelax  indices or expression on edge metadata specifying which edges cn to relax
-#' @param L0  flag whether to apply loose end penalty as L1 (TRUE)
-#' @param loose.collapse (parameter only relevant if L0 = TRUE) will count all unique (by coordinate) instances of loose ends in the graph as the loose end penalty, rather than each instance alone ... useful for fitting a metagenome graph   (FALSE)
-#' @param phased (bool) indicates whether to run phased/unphased. default = FALSE
-#' @param M  big M constraint for L0 norm loose end penalty, should be >1000
-#' @param verbose integer scalar specifying whether to do verbose output, value 2 will spit out MIP (1)
-#' @param tilim time limit on MIP in seconds (10)
-#' @param epgap relative optimality gap threshhold between 0 and 1 (0.01)
-#' @return balanced gGraph maximally resembling input gg in CN while minimizing loose end penalty lambda.
-#' @author Marcin Imielinski
-#' @export 
-balance = function(gg,
-                   lambda = 0.1,
-                   marginal = NULL,
-                   tight = NULL,
-                   nfix = NULL, efix = NULL, nrelax = NULL, erelax = NULL,
-                   L0 = TRUE,
-                   loose.collapse = FALSE,
-                   M = 1e2,
-                   phased = FALSE,
-                   verbose = 1,
-                   tilim = 10,
-                   epgap = 0.01)
-{
-  if (!('cn' %in% names(gg$nodes$dt)))
-  {
-    warning('cn field not defined on nodes, setting to NA')    
-    gg$nodes$mark(cn = NA_real_)
-  }
-
-  if (!('cn' %in% names(gg$edges$dt)))
-  {
-    warning('cn not defined on edges, providing NA')    
-    gg$edges$mark(cn = NA_real_)
-  }
-
-  gg = gg$copy
-  
-  ## default local lambda lambda is node width
-  if (!('lambda' %in% names(gg$nodes$dt)))
-    gg$nodes$mark(lambda = 1)
-#    gg$nodes$mark(lambda = width(gg$nodes$gr))
-
-  ## default node weight is its width
-  if (!('weight' %in% names(gg$nodes$dt)))
-  {
-      gg$nodes$mark(weight = width(gg$nodes$gr))
-  }
-
-  ## default edge weight is its width
-  if (!('weight' %in% names(gg$edges$dt)))
-  {
-    gg$edges$mark(weight = 1)
-  }
-
-  ## default reward is 0 
-  if (!('reward' %in% names(gg$edges$dt)))
-  {
-    gg$edges$mark(reward = 0)
-  }
-  
-  ## handle parsing of efix, nfix, nrelax, erelax
-  if (!any(deparse(substitute(nfix)) == "NULL")) ## R voodo to allow "with" style evaluation 
-    nfix = tryCatch(eval(eval(parse(text = substitute(deparse(substitute(nfix)))), parent.frame()), gg$nodes$dt, parent.frame(2)), error = function(e) NULL)
-
-  if (!any(deparse(substitute(nrelax)) == "NULL")) ## R voodo to allow "with" style evaluation 
-    nrelax = tryCatch(eval(eval(parse(text = substitute(deparse(substitute(nrelax)))), parent.frame()), gg$nodes$dt, parent.frame(2)), error = function(e) NULL)
-
-  if (!any(deparse(substitute(efix)) == "NULL")) ## R voodo to allow "with" style evaluation 
-    efix = tryCatch(eval(eval(parse(text = substitute(deparse(substitute(efix)))), parent.frame()), gg$edges$dt, parent.frame(2)), error = function(e) NULL)
-
-  if (!any(deparse(substitute(erelax)) == "NULL")) ## R voodo to allow "with" style evaluation 
-    erelax = tryCatch(eval(eval(parse(text = substitute(deparse(substitute(erelax)))), parent.frame()), gg$edges$dt, parent.frame(2)), error = function(e) NULL)
-
-  if (!any(deparse(substitute(tight)) == "NULL")) ## R voodo to allow "with" style evaluation 
-    tight = tryCatch(eval(eval(parse(text = substitute(deparse(substitute(tight)))), parent.frame()), gg$nodes$dt, parent.frame(2)), error = function(e) NULL)
-
-
-  if (is.logical(nfix))
-    nfix = which(nfix)
-
-  if (is.logical(efix))
-    efix = which(efix)
-  
-  if (is.logical(nrelax))
-    nrelax = which(nrelax)
-
-  if (is.logical(erelax))
-    erelax = which(erelax)
-
-  if (length(nfix) & verbose)
-    message('Fixing ', length(nfix), ' nodes')
-
-  if (length(efix) & verbose)
-    message('Fixing ', length(efix), ' edges')
-
-  if (length(nrelax) & verbose)
-    message('Relaxing ', length(nrelax), ' nodes')
-
-  gg$nodes[nrelax]$mark(weight = 0)
-  
-  if (length(erelax) & verbose)
-    message('Relaxing ', length(erelax), ' edges')
-
-  gg$nodes[erelax]$mark(weight = 0)
-
-  if (!is.logical(tight))
-    tight = 1:length(gg$nodes) %in% tight
-
-  if (any(tight) & verbose)
-    message('Leaving ', sum(tight), ' nodes tight')
-  
-  gg$nodes$mark(tight = tight)
-
-  if (is.null(gg$nodes$dt$lb))
-    gg$nodes$mark(lb = 0)
-
-  if (is.null(gg$nodes$dt$ub))
-    gg$nodes$mark(ub = Inf)
-
-  if (is.null(gg$edges$dt$lb))
-    gg$edges$mark(lb = 0)
-
-  if (is.null(gg$edges$dt$ub))
-    gg$edges$mark(ub = Inf)
-
-  if (loose.collapse)
-  {
-    if (verbose)
-      message('Collapsing loose ends')
-
-    uleft = unique(gr.start(gg$nodes$gr))
-    uright = unique(gr.end(gg$nodes$gr))
-    
-    gg$nodes$mark(loose.left.id = paste0(gr.match(gr.start(gg$nodes$gr), uleft), 'l'))
-    gg$nodes$mark(loose.right.id = paste0(gr.match(gr.end(gg$nodes$gr), uright), 'r'))      
-  }
-  else
-  {
-    gg$nodes$mark(loose.left.id = paste0(1:length(gg$nodes), 'l'))
-    gg$nodes$mark(loose.right.id = paste0(1:length(gg$nodes), 'r'))
-  }  
-  
-  ########
-  ## VARIABLES
-  ########
-
-  ## create state space, keeping track of graph ids
-  vars = rbind(
-    gg$dt[, .(cn, snode.id, lb, ub, weight, gid = index, type = 'node', vtype = 'I')], ## signed nodes
-    gg$sedgesdt[, .(from, to, lb, ub, sedge.id,  cn, reward, gid = sedge.id, type = 'edge', vtype = 'I')], ## signed edges
-
-    ## for loose ends lid marks all "unique" loose ends (which if loose.collapse = TRUE
-    ## will be defined on the basis of coordinate overlap)
-    gg$dt[tight == FALSE, .(cn = NA, snode.id, lambda, gid = index,
-                            ulid = paste0(index, 'i'),
-                            lid = ifelse(strand == '+', loose.left.id, paste0('-', loose.right.id)),
-                            type = 'loose.in', vtype = 'I')], ## incoming loose ends
-    gg$dt[tight == FALSE, .(cn = NA, snode.id, lambda, gid = index,
-                            ulid = paste0(index, 'o'),
-                            lid = ifelse(strand == '+', loose.right.id, paste0('-', loose.left.id)),
-                            type = 'loose.out', vtype = 'I')], ## outgoing loose ends
-    gg$dt[tight == FALSE, .(gid = index, cn, weight, type = 'nresidual', vtype = 'C')], ## node residual 
-    gg$sedgesdt[, .(gid = sedge.id, cn, weight, type = 'eresidual', vtype = 'C')], ## edge residual 
-    fill = TRUE)
-
-  if (phased) {
-    if (verbose) {
-      message("adding indicator variables for edge CN")
-    }
-
-    ## add og.edge.id information for each edge
-    sedge.to.og.dt = gg$edges$dt[,
-                                 .(sedge.id, og.edge.id, ## map sedge.id to og.edge.id
-                                   ref.or.alt = type, ## get REF or ALT annotations (important for constraints)
-                                   connection) ## get straight/cross annotations
-                                 ]
-    setkey(sedge.to.og.dt, "sedge.id")
-
-    ## add binary indicator variables for each edge
-    edge.indicator.vars = vars[type == "edge",][, type := "edge.indicator"][, vtype := "B"][, gid := sedge.id]
-    setkey(edge.indicator.vars, "sedge.id")
-
-    ## use sedge.id as a key to join with edge metadata
-    edge.indicator.vars = edge.indicator.vars[sedge.to.og.dt]
-
-    if (verbose) {
-      message("adding indicator sum variables for edge CN")
-    }
-
-    edge.indicator.sum.vars = vars[type == "edge",][, type := "edge.indicator.sum"][, vtype := "I"]
-    setkey(edge.indicator.sum.vars, "sedge.id")
-
-    ## make sure there is one edge indicator sum variable for each og.edge.id
-    edge.indicator.sum.vars = edge.indicator.sum.vars[sedge.to.og.dt]
-    edge.indicator.sum.vars[, gid := og.edge.id]
-
-    ## add one indicator sum variable per og edge ID to vars table
-    vars = rbind(vars, edge.indicator.vars, fill = TRUE)
-    vars = rbind(vars,
-                 unique(edge.indicator.sum.vars, by = "gid"),
-                 fill = TRUE) ## fill is TRUE because og.edge.id and ref.or.alt added
-
-    if (verbose) {
-      message("adding major/minor allele CN and og.node.id to vars")
-    }
-
-    ## idea here is to add variables that force major allele CN to be at least as large as minor CN
-
-    ## create data table where keys are node ids, and og.node.id/allele can be easily found
-    snode.to.og.dt = gg$nodes$dt[, .(snode.id, og.node.id, allele)]
-    setkey(snode.to.og.dt, "snode.id")
-
-    ## merge og.node.id and allele information into vars
-    vars = merge(vars, snode.to.og.dt, by="snode.id", all.x = TRUE)
-  }
-
-  if (L0)
-  {
-    ## loose ends are labeled with lid and ulid, lid is only relevant if loose.collapse is true
-    ## (i.e. we need indicator.sum and indicator.sum.indicator
-    vars = rbind(vars, 
-                 rbind( 
-                   vars[type == 'loose.in', ][ , type := 'loose.in.indicator'][, vtype := 'B'][, gid := lid],
-                   vars[type == 'loose.out', ][ , type := 'loose.out.indicator'][, vtype := 'B'][, gid := lid]
-                 ))
-
-    if (loose.collapse)
-    {
-      ## sum will sum all the loose ends assocaited with the same lid
-      vars = rbind(vars, 
-                   unique(rbind( 
-                     vars[type == 'loose.in', ][ , type := 'loose.in.indicator.sum'][, vtype := 'I'][, gid := lid],
-                     vars[type == 'loose.out', ][ , type := 'loose.out.indicator.sum'][, vtype := 'I'][, gid := lid]
-                   ), by = 'gid'))
-      
-      ## sum.indicator is an binary indicator on the sum
-      vars = rbind(vars, 
-                   rbind( 
-                     vars[type == 'loose.in.indicator.sum', ][ , type := 'loose.in.indicator.sum.indicator'][, vtype := 'B'][, gid := lid],
-                     vars[type == 'loose.out.indicator.sum', ][ , type := 'loose.out.indicator.sum.indicator'][, vtype := 'B'][, gid := lid]
-                   ))        
-    }        
-  }
-  
-  setkeyv(vars, c('type', 'gid'))
-
-  ## add marginal copy number residual if specified
-  vars$mfix = NA
-  if (!is.null(marginal))
-  {
-    if (verbose) {
-      message("adding marginal CN variables")
-    }
-
-    if (!inherits(marginal, 'GRanges') || is.null(marginal$cn))
-    {
-      stop('marginal must be a GRanges with field $cn')
-    }
-    
-    if (is.null(marginal$weight))
-      marginal$weight = 1
-
-    if (is.null(marginal$fix))
-      marginal$fix = FALSE
-    
-    ## first disjoin marginal against the nodes
-    ## ie wee ned to create a separate residual variable for every unique
-    ## disjoint overlap of marginal with the nodes
-    dmarginal = gg$nodes$gr %>% gr.stripstrand %*% grbind(marginal %>% gr.stripstrand) %>% disjoin %$% marginal[, c('cn', 'weight', 'fix')] %Q% (!is.na(cn)) %Q% (!is.na(weight)) %Q% (!is.infinite(weight))
-
-    vars = rbind(vars,
-                 gr2dt(dmarginal)[, .(cn, weight, mfix = fix>0, rid = 1:.N, type = 'mresidual', vtype = 'C')],
-                 fill = TRUE
-                 )
-    message("Done adding marginal vars")
-  }
-
-  vars[, id := 1:.N] ## set id in the optimization
-  vars[is.na(lb), lb := -Inf]
-  vars[is.na(ub), ub := Inf]
-  vars[, relax := FALSE][, fix := FALSE]
-  vars[type == 'mresidual' & mfix == TRUE, ":="(lb = 0, ub = 0)]
-  vars[type %in% c('node', 'edge'), lb := pmax(lb, 0, na.rm = TRUE)]
-  vars[type %in% c('loose.in', 'loose.out'), ":="(lb = 0, ub = Inf)]
-  vars[type %in% c('edge'), reward := pmax(reward, 0, na.rm = TRUE)]
-
-
-  ## figure out junctions and nodes to fix
-  vars[!is.na(cn) & type == 'node' & abs(snode.id) %in% nfix, ":="(lb = cn, ub = cn, fix = TRUE)]
-  vars[!is.na(cn) & type == 'edge' & abs(sedge.id) %in% efix, ":="(lb = cn, ub = cn, fix = TRUE)]
-
-  ## figure out terminal node sides for in and out loose ends
-  ## these will not have loose ends penalized
-  qtips = gr.end(si2gr(seqlengths(gg$nodes))) ## location of q arm tips
-  term.in = c(which(start(gg$nodes$gr) == 1), ## beginning of chromosome
-              -which(gg$nodes$gr %^% qtips)) ## flip side of chromosome end
-  term.out = -term.in
-  vars$terminal = FALSE
-  vars[type %in% c('loose.in', 'loose.in.indicator') & snode.id %in% term.in, terminal := TRUE]
-  vars[type %in% c('loose.out', 'loose.out.indicator') & snode.id %in% term.out, terminal := TRUE]
-
-  ########
-  ## CONSTRAINTS
-  ## the key principle behind this "melted" form of constraint building is the cid 
-  ## (constraint id) which is the key that will group coefficients into constraints
-  ## when we finally build the matrices.  So all we need to do is make sure that
-  ## that value / cid pairs make sense and that every cid has an entry in b
-  ########
-
-  ## we need one junction balance constraint per loose end
-
-  ## constraints indexed by cid
-  constraints = rbind(
-    vars[type == 'loose.in', .(value = 1, id, cid = paste('in', gid))],
-    vars[type == 'edge', .(value = 1, id, cid = paste('in', to))],
-    vars[type == 'node', .(value = -1, id, cid = paste('in', gid))],
-    vars[type == 'loose.out', .(value = 1, id, cid = paste('out', gid))],
-    vars[type == 'edge', .(value = 1, id, cid = paste('out', from))],
-    vars[type == 'node', .(value = -1, id, cid = paste('out', gid))],
-   fill = TRUE)
-
-  b = rbind(
-    vars[type == 'node', .(value = 0, sense = 'E', cid = paste('in', gid))],
-    vars[type == 'node', .(value = 0, sense = 'E', cid = paste('out', gid))],
-    fill = TRUE)
-
-  ## add to the constraints the definitions of the node and edge
-  ## residuals
-  constraints = rbind(
-    constraints,
-    rbind(
-      vars[type == 'node', .(value = 1, id, cid = paste('nresidual', gid))],
-      vars[type == 'nresidual', .(value = -1, id, cid = paste('nresidual', gid))],
-      vars[type == 'edge', .(value = 1, id, cid = paste('eresidual', gid))],
-      vars[type == 'eresidual', .(value = -1, id, cid = paste('eresidual', gid))],
-      fill = TRUE)
-  )
-
-  b = rbind(b,
-            vars[type == 'node', .(value = cn, sense = 'E', cid = paste('nresidual', gid))],
-            vars[type == 'edge', .(value = cn, sense = 'E', cid = paste('eresidual', gid))],
-            fill = TRUE)
-
-  ## add the reverse complement equality constraints on nodes and edges
-  constraints = rbind(
-    constraints,
-    rbind( ## +1 coefficient for positive nodes, -1 for negative nodes, matched by abs (snode.id)
-      vars[type == 'node', .(value = sign(snode.id), id, cid = paste('nrc', abs(snode.id)))],
-      vars[type == 'edge', .(value = sign(sedge.id), id, cid = paste('erc', abs(sedge.id)))],
-      fill = TRUE)
-  )
-  
-  b = rbind(b,
-            vars[type == 'node' & snode.id>0, .(value = 0, sense = 'E', cid = paste('nrc', abs(snode.id)))],
-            vars[type == 'edge' & sedge.id>0, .(value = 0, sense = 'E', cid = paste('erc', abs(sedge.id)))],
-            fill = TRUE)
-
-  if (phased)
-  {
-    #'#########################
-    ## add constraints forcing major CN to be larger than minor CN
-    #'#########################
-    if (!("allele" %in% colnames(vars)) | !("og.node.id" %in% colnames(vars))) {
-      stop("allele field needs to be added to vars")
-    }
-
-    ## major allele coefficient is 1, minor allele coefficient is -1
-    ## make sure that there's only one per node.id (abs of snode.id)
-    allele.constraints = rbind(
-      unique(
-        vars[type == "node" & allele == "major",
-             .(value = 1, id, node.id = abs(snode.id),
-               cid = paste("allele.constraint", og.node.id))],
-        by = "node.id"),
-      unique(
-        vars[type == "node" & allele == "minor",
-             .(value = -1, id, node.id = abs(snode.id),
-               cid = paste("allele.constraint", og.node.id))],
-        by = "node.id"),
-      fill = TRUE)
-
-    ## add these constraints
-    constraints = rbind(constraints,
-                        allele.constraints[, .(value, id, cid)],
-                        fill = TRUE)
-
-    ## RHS: force (major CN - minor CN) to be >= 0
-    allele.rhs = unique(
-      vars[type == "node",
-           .(value = 0, sense = "G", cid = paste("allele.constraint", og.node.id))],
-      by = "cid")
-
-    b = rbind(b, allele.rhs, fill = TRUE)
-
-    #'#########################
-    ## add constraints that force indicators to be 1 if edge CN > 0
-    ## TODO: fix this so that only one indicator per edge.id (instead of one per sedge.id)
-    #'#########################
-
-    ## add constraints for upper bound (same setup as L0 penalty) - one per edge
-    iconstraints = vars[type == "edge", .(value = 1, id,
-                                          sedge.id, 
-                                          cid = paste("edge.indicator.ub", sedge.id))]
-
-    ## add matching indicator variables, matching by cid
-    iconstraints = rbind(
-      iconstraints,
-      vars[type == "edge.indicator", ][
-        sedge.id %in% iconstraints$sedge.id, .(value = -M, id, cid = iconstraints$cid, sedge.id)],
-      fill = TRUE)
-
-    ## upper bound is M if indicator is positive, and zero otherwise
-    ## we may want to change this later to JaBbA CN instead of (potentially large) M for stability?
-    constraints = rbind(
-      constraints,
-      iconstraints,
-      fill = TRUE)
-
-    ## add the RHS of this constraint (upper bound)
-    b = rbind(
-      b,
-      vars[type == "edge", .(value = 0, sense = "L", cid = paste("edge.indicator.ub", sedge.id))],
-      fill = TRUE
-    )
-
-    ## add constraints for the lower bound
-    iconstraints = vars[type == "edge", .(value = 1, id, sedge.id, cid = paste("edge.indicator.lb", sedge.id))]
-
-    ## add matching indicator variables for LB
-    iconstraints = rbind(
-      iconstraints,
-      vars[type == "edge.indicator", ][
-        sedge.id %in% iconstraints$sedge.id, .(value = -0.1, id, cid = iconstraints$cid, sedge.id)],
-      fill = TRUE)
-
-    constraints = rbind(
-      constraints,
-      iconstraints,
-      fill = TRUE)
-
-    ## add the RHS of this constraint (upper bound)
-    b = rbind(
-      b,
-      vars[type == "edge", .(value = 0, sense = "G", cid = paste("edge.indicator.lb", sedge.id))],
-      fill = TRUE
-    )
-
-    ###################
-    ## add the edge indicator sum constraints
-    ###################
-
-    ## ALT edges: only one of four edges can have nonzero CN
-    ## set upper bound (no need to set lower bound because these are binary variables and cannot be negative)
-    iconstraints = unique(
-      vars[type == "edge.indicator" & ref.or.alt == "ALT",
-           .(value = 1, id, og.edge.id,
-             edge.id = abs(sedge.id),
-             cid = paste("edge.indicator.sum.ub", og.edge.id))],
-      by = "edge.id"
-    )
-
-    constraints = rbind(
-      constraints,
-      iconstraints[, .(value, id, cid)],
-      fill = TRUE)
-
-    b = rbind(b,
-              vars[type == "edge.indicator.sum" & ref.or.alt == "ALT",
-                   .(value = 1, sense = "L", 
-                     cid = paste("edge.indicator.sum.ub", og.edge.id))],
-              fill = TRUE)
-
-
-
-    ## force nonzero CN for ALT edges (because these have nonzero CN in original JaBbA output)
-    iconstraints = unique(
-      vars[type == "edge.indicator" & ref.or.alt == "ALT",
-           .(value = 1, id, og.edge.id,
-             edge.id = abs(sedge.id),
-             cid = paste("edge.indicator.sum.lb", og.edge.id))],
-      by = "edge.id"
-    )
-
-    constraints = rbind(
-      constraints,
-      iconstraints[, .(value, id, cid)],
-      fill = TRUE)
-
-    b = rbind(b,
-              vars[type == "edge.indicator.sum" & ref.or.alt == "ALT",
-                   .(value = 1, sense = "G", 
-                     cid = paste("edge.indicator.sum.lb", og.edge.id))],
-              fill = TRUE)
-
-    ## REF edges: up to two of four edges can have nonzero CN (easiest to implement...)
-    iconstraints = unique(
-      vars[type == "edge.indicator" & ref.or.alt == "REF",
-           .(value = 1, id,
-             edge.id = abs(sedge.id),
-             cid = paste("edge.indicator.sum.ub", og.edge.id))],
-      by = "edge.id"
-    )
-
-    constraints = rbind(
-      constraints,
-      iconstraints[, .(value, id, cid)],
-      fill = TRUE)
-
-    b = rbind(b,
-              vars[type == "edge.indicator.sum" & ref.or.alt == "REF",
-                   .(value = 2, sense = "L", 
-                     cid = paste("edge.indicator.sum.ub", og.edge.id))],
-              fill = TRUE)
-  }
-  
-
-
-  if (L0) ## add "big M" constraints
-  {
-    ## indicator constraints ie on ulids 
-    iconstraints = rbind(
-      vars[type == 'loose.out', .(value = 1, id, ulid, cid = paste('loose.out.indicator.ub', ulid))],
-      vars[type == 'loose.in', .(value = 1, id, ulid, cid = paste('loose.in.indicator.ub', ulid))],
-      fill = TRUE)
-
-    ## add the matching indicator variables, matching to the cid from above
-    iconstraints = rbind(
-      iconstraints,
-      vars[type %in% c('loose.out.indicator', 'loose.in.indicator'), ][
-        match(iconstraints$ulid, ulid), .(value = -M, id, cid = iconstraints$cid)],
-      fill = TRUE)
-                         
-    ## upper bounds "infinity" ie M if indicator positive, 0 otherwise
-    constraints = rbind(
-      constraints,
-      iconstraints,
-      fill = TRUE)
-
-    ## upper bound sense is 'L' i.e. less than because -M on left hand side
-    b = rbind(b,
-              vars[type == 'loose.in', .(value = 0, sense = 'L', cid = paste('loose.in.indicator.ub', ulid))],
-              vars[type == 'loose.out', .(value = 0, sense = 'L', cid = paste('loose.out.indicator.ub', ulid))],
-              fill = TRUE)
-
-    ## lower bound 0.1 if indicator positive, 0 otherwise
-    iconstraints = rbind(
-      vars[type == 'loose.out', .(value = 1, id, ulid, cid = paste('loose.out.indicator.lb', ulid))],
-      vars[type == 'loose.in', .(value = 1, id, ulid, cid = paste('loose.in.indicator.lb', ulid))],
-      fill = TRUE)
-    
-    ## add the matching indicator variables, matching to the cid from above
-    iconstraints = rbind(
-      iconstraints,
-      vars[type %in% c('loose.out.indicator', 'loose.in.indicator'), ][
-        match(iconstraints$ulid, ulid), .(value = -.1, id, cid = iconstraints$cid)],
-      fill = TRUE)
-
-    ## upper bounds "infinity" ie M if indicator positive, 0 otherwise
-    constraints = rbind(
-      constraints,
-      iconstraints,
-      fill = TRUE)
-
-    ## lower bound sense is 'G' i.e. greater than because -M on left hand side
-    b = rbind(b,
-              vars[type == 'loose.in', .(value = 0, sense = 'G', cid = paste('loose.in.indicator.lb', ulid))],
-              vars[type == 'loose.out', .(value = 0, sense = 'G', cid = paste('loose.out.indicator.lb', ulid))],
-              fill = TRUE)
-
-    if (loose.collapse)
-    {
-      ##################
-      ## loose indicator sum  = sum of indicators
-      ##################
-      iconstraints = rbind(
-        vars[type == 'loose.out.indicator', .(value = 1, id, lid, cid = paste('loose.out.indicator.sum', lid))],
-        vars[type == 'loose.in.indicator', .(value = 1, id, lid, cid = paste('loose.in.indicator.sum', lid))],
-        fill = TRUE)
-      
-      ## indicator sum is the sum of all indicators mapping to that loose end
-      iconstraints = rbind(
-          iconstraints,
-        unique(vars[type %in% c('loose.out.indicator.sum', 'loose.in.indicator.sum'), ][
-          match(iconstraints$lid, lid), .(value = -1, id, lid, cid = iconstraints$cid)], by = 'lid'),
-        fill = TRUE)
-      
-      constraints = rbind(
-        constraints,
-        iconstraints,
-        fill = TRUE)
-      
-      b = rbind(b,
-                vars[type == 'loose.in.indicator.sum', .(value = 0, sense = 'E', cid = paste('loose.in.indicator.sum', lid))],
-                vars[type == 'loose.out.indicator.sum', .(value = 0, sense = 'E', cid = paste('loose.out.indicator.sum', lid))],
-                fill = TRUE)
-     
-      ##################
-      ## now we make new indicator variables on the sum of the individual loose end indicators      
-      ## upper bound bound 0.1 if indicator positive, 0 otherwise
-      ##################
-
-      iconstraints = rbind(
-        vars[type == 'loose.out.indicator.sum', .(value = 1, id, lid, cid = paste('loose.out.indicator.sum.indicator.ub', lid))],
-        vars[type == 'loose.in.indicator.sum', .(value = 1, id, lid, cid = paste('loose.in.indicator.sum.indicator.ub', lid))],
-        fill = TRUE)
-
-      ## add the matching indicator variables, matching to the cid from above
-      iconstraints = rbind(
-        iconstraints,
-        vars[type %in% c('loose.out.indicator.sum.indicator', 'loose.in.indicator.sum.indicator'), ][
-          match(iconstraints$lid, lid), .(value = -M, id, lid, cid = iconstraints$cid)],
-        fill = TRUE)
-      
-      ## upper bounds "infinity" ie M if indicator positive, 0 otherwise
-      constraints = rbind(
-        constraints,
-        iconstraints,
-        fill = TRUE)
-
-      ## upper bound sense is 'L' i.e. less than because -M on left hand side
-      b = rbind(b,
-                vars[type == 'loose.in.indicator.sum', .(value = 0, sense = 'L', cid = paste('loose.in.indicator.sum.indicator.ub', lid))],
-                vars[type == 'loose.out.indicator.sum', .(value = 0, sense = 'L', cid = paste('loose.out.indicator.sum.indicator.ub', lid))],
-                fill = TRUE)
-
-      ## lower bound 0.1 if indicator positive, 0 otherwise
-      iconstraints = rbind(
-        vars[type == 'loose.out.indicator.sum', .(value = 1, id, lid, cid = paste('loose.out.indicator.sum.indicator.lb', lid))],
-        vars[type == 'loose.in.indicator.sum', .(value = 1, id, lid, cid = paste('loose.in.indicator.sum.indicator.lb', lid))],
-        fill = TRUE)
-      
-      ## add the matching indicator variables, matching to the cid from above
-      iconstraints = rbind(
-        iconstraints,
-        vars[type %in% c('loose.out.indicator.sum', 'loose.in.indicator.sum'), ][
-          match(iconstraints$lid, lid), .(value = -.1, id, lid, cid = iconstraints$cid)],
-        fill = TRUE)
-
-      ## upper bounds "infinity" ie M if indicator positive, 0 otherwise
-      constraints = rbind(
-        constraints,
-        iconstraints,
-        fill = TRUE)
-
-      ## lower bound sense is 'G' i.e. greater than because -M on left hand side
-      b = rbind(b,
-                vars[type == 'loose.in.indicator.sum', .(value = 0, sense = 'G', cid = paste('loose.in.indicator.sum.indicator.lb', lid))],
-                vars[type == 'loose.out.indicator.sum', .(value = 0, sense = 'G', cid = paste('loose.out.indicator.sum.indicator.lb', lid))],
-                fill = TRUE)
-
-    }    
-  }
-
-  if (!is.null(marginal) && length(dmarginal)) 
-  {
-    ## match against nodes and store query.id as rid
-    ## this will be the constraint id that will allow us
-    ## to sum the appropriate nodes to constrain to the residual
-    ov = dmarginal[, c('cn', 'weight')] %*% gg$nodes$gr %>% gr2dt
-    ov[, rid := query.id]
-
-    constraints = rbind(
-      constraints,
-      rbind(
-        ## match up vars and marginal by snode.id and populate coefficients
-        merge(vars[type == 'node', !"rid"], ov, by = 'snode.id')[, .(value = 1, id , cid = paste('mresidual', rid))],
-        ## the residual is the difference between the sum and marginal cn
-        vars[type == 'mresidual' & rid %in% ov$rid, .(value = -1, id, cid = paste('mresidual', rid))],        
-        fill = TRUE),
-      fill = TRUE
-    )
-
-    b = rbind(b,
-              vars[type == 'mresidual' & rid %in% ov$rid, .(value = cn, sense = 'E', cid = paste('mresidual', rid))],
-              fill = TRUE)
-  }
-
-  ########
-  ## MAKE MATRICES
-  ########
-
-  ## now Rcplex time
-  ## remove any rows with b = NA
-
-  b = b[!is.na(value), ]
-  constraints = constraints[cid %in% b$cid, ]
-
-  ## convert constraints to integers
-  ucid = unique(b$cid)
-  b[, cid.char := cid]
-  b[, cid := cid %>% factor(ucid) %>% as.integer]
-  constraints[, cid.char := cid]
-  constraints[, cid := cid %>% factor(ucid) %>% as.integer]
-  setkey(b, cid)
-
-  ## create constraint matrix, Qmat, and cobj, lb, ub from vars and constraints  lambda = 10
-  Amat = sparseMatrix(constraints$cid, constraints$id, x = constraints$value, dims = c(length(ucid), nrow(vars)))
-  vars[is.na(weight), weight := 0]
-
-  if (any(ix <- is.infinite(vars$weight)))
-  {
-    warning('nodes with infinite weight, setting to 0, please check inputs')
-    vars[ix, weight := 0]
-  }
-  Qmat = vars[, weight * (type %in% c('nresidual', 'eresidual', 'mresidual'))] %>% as.numeric %>% Diagonal(x = .) %>% as('CsparseMatrix')
-
-  ## set lambda to 0 at terminal or other non NA nodes
-  vars[is.na(lambda), lambda := 0]
-
- 
-  ## set cvec by multiplying global lambda by local lambda for non-terminal loose end
-  ## vars (or their indicators if L0 is TRUE)
-  if (L0)
-    {
-      if (loose.collapse)
-        {
-          cvec = lambda*(vars[, lambda*(type %in% c('loose.in.indicator.sum.indicator', 'loose.out.indicator.sum.indicator', 'loose.in.indicator', 'loose.out.indicator') & !terminal)] %>% as.numeric)
-        }
-      else
-        {
-          cvec = lambda*(vars[, lambda*(type %in% c('loose.in.indicator', 'loose.out.indicator') & !terminal)] %>% as.numeric)
-        }
-    }
-  else
-    cvec = lambda*(vars[, lambda*(type %in% c('loose.in', 'loose.out') & !terminal)] %>% as.numeric)
-
-  ## implement reward if provided
-  if (length(ix <- which(vars$reward!=0)))
-  {
-    if (verbose)
-      message('Applying reward')
-    cvec[ix] = -vars$reward[ix]
-  }
-
-  lb = vars$lb
-  ub = vars$ub
-  bvec = b[.(1:nrow(Amat)), value]
-  sense = b[.(1:nrow(Amat)), sense]
-
-  control = list(trace = ifelse(verbose>=2, 1, 0), tilim = tilim, epgap = epgap, round = 1)
-  sol = Rcplex::Rcplex(cvec = cvec, Amat = Amat, bvec = bvec, Qmat = Qmat, lb = lb, ub = ub, sense = sense, vtype = vars$vtype, objsense = 'min', control = control)
-  vars$cvec = cvec
-  vars$x = sol$x
-
-  ## for debugging
-  ppc = function(x) (x %>% merge(vars, by = 'id') %>% merge(b, by = 'cid.char'))[, paste(paste(round(value.x, 1), '*', paste(type, gid, sep=  '_'), '(', signif(x, 2), ')', collapse = ' + '), ifelse(sense[1] == 'E', '=', ifelse(sense[1] == 'G', '>=', '<=')), round(value.y[1],2)), by = cid.char]
-  
-  ppv = function(x) {tmp = x %>% merge(constraints, by = 'id'); constraints[cid %in% tmp$cid, ] %>% ppc}
-
-  .check = function(x) data.table(obs = sign(as.numeric(round(Amat %*% x - bvec))),
-                                  sense)
-  chk = .check(sol$x)
-
-  if (any(is.na(sol$x)))
-    stop('Rcplex did not converge or failed to find a solution, please run with verbose = 2 to get more detailed output')
-
-  if (chk[sense == 'E', any(obs != 0, na.rm = TRUE)] |
-      chk[sense == 'G', any(obs < 0, na.rm = TRUE)] |
-      chk[sense == 'L', any(obs > 0, na.rm = TRUE)])
-    stop('Constraint violation likely due to M parameter being too large for problem causing CPLEX numerical instability, consider lowering M parameter')
-
-   ##.obj = function(x) 0.5 * rbind(x) %*% Qmat %*% cbind(x) + cvec %*% x
-
- 
-  ## update graph  
-  nmark = vars[type == 'node', .(nid = abs(snode.id), cn = round(x))]
-  emark = vars[type == 'edge', .(eid = abs(sedge.id), cn = round(x))]
-
-  loosei = vars[type == 'loose.in' & snode.id>0, .(cn = round(x)), keyby = snode.id]
-  looseo = vars[type == 'loose.out' & snode.id>0, .(cn = round(x)), keyby = snode.id]
-
-  nodes = gg$nodes[loosei$snode.id] ## need to do this to use nodes active binding settings
-  nodes$loose.left = loosei$cn>0
-
-  nodes = gg$nodes[looseo$snode.id] ## need to do this to use nodes active binding settings
-  nodes$loose.right = looseo$cn>0
-
-  gg$nodes$mark(loose.cn.left = 0, loose.cn.right = 0)
-  gg$nodes[loosei$snode.id]$mark(loose.cn.left = loosei$cn)
-  gg$nodes[looseo$snode.id]$mark(loose.cn.right = looseo$cn)
-
-  ## cache old cn values
-  gg$nodes$mark(cn.old = gg$nodes$dt$cn)
-  gg$edges$mark(cn.old = gg$edges$dt$cn)
-  gg$nodes$mark(cn = NULL) ## reset to avoid weird type casting issue
-  gg$edges$mark(cn = NULL) ## reset to avoid weird type casting issue
-  gg$nodes[nmark$nid]$mark(cn = nmark$cn)
-  gg$edges[emark$eid]$mark(cn = emark$cn)
-  gg$set(y.field = 'cn')
-
-  gg$set(obj = sol$obj)
-
-##  fix loose ends
-  nodes = gg$nodes 
-  nodes$loose.left = nodes$dt$loose.cn.left>0
-  nodes$loose.right = nodes$dt$loose.cn.right>0
-
-  return(gg)
-}
-
-
-#' @name nodestats
-#' @description nodestats
-#'
-#' Computes copy number (CN) "stats" using on graph gg using (binned) copy number data in GRanges data to prep a
-#' graph for balance function (see above).
-#'
-#' The function outputs a graph whose nodes are populated with $cn and $weight fields.  The cn field is computed as
-#' the centroid (measured by function FUN) and the weight is computed as number of bins / 2 * (sample variance) across
-#' the bins overlapping that node.
-#'
-#' If loess = TRUE is specified, then a mean vs variance curve is computed across all nodes and the sample variance in each node
-#' is replaced with the mapping assigned to the centroid
-#' 
-#' The node to bin mapping can be additionally restricted by the columns in "by", which must exist in both the node metadata
-#' of gg and data
-#' 
-#' @param gg gGraph to compute node stats across
-#' @param data GRanges of cn data to compute node stats against
-#' @param cn.field numeric field corresponding to cn data which is used to aggregate (default is first column in the data)
-#' @param loess flag whether to use LOESS to compute variance as mean to variance 
-#' @param FUN function to compute centroids with (mean)
-#' @param by by field to match data against node data, the column specified in this field must be present in both the gGraph (NULL)
-#' @return gGraph with fields $cn
-#' @export
-#' @author Marcin Imielinski
-nodestats = function(gg,
-                   data,
-                   cn.field = names(values(data))[[1]],
-                   FUN = mean,
-                   loess = FALSE, 
-                   by = NULL)
-{
-  gg = gg$copy
-  data$cn = values(data)[cn.field]
-  ov = gr.findoverlaps(gg$nodes$gr, data[, "cn"], scol = 'cn', by = by)  %>% gr2dt
-  dt = ov[, .(mean = FUN(cn, na.rm = TRUE), var = var(cn, na.rm = TRUE), nbins = .N), keyby = query.id][.(1:length(gg$nodes)), ]
-  dt$weight = dt$nbins/(2*dt$var)
-  dt[is.infinite(weight), weight := NA]
-  gg$nodes$mark(cn = dt$mean, weight = dt$weight)
-  return(gg)
-}
-
-
-
-#' @name transplant
-#' @title transplant donor subggraph into recipient
-#' @description
-#'
-#' Here we transplant a subgraph (eg representing an event)
-#' into a recipient graph.  The transplant operation will
-#' (1) fix the donor ALT junction copy numbers
-#' (2) leave the subgraph node copy numbers unconstrained
-#' (3) minimize the loose ends of the resulting balanced graph
-#'
-#' Donor and recipient graph should both have $cn field defined on
-#' nodes and edges. Donor can also just be junctions, in which case
-#' the junction shadow will be taken as the footprint, or a
-#' footprint can be manually provided.  The footprint
-#' will then provide a region where the copy number constraints
-#' are relaxed while loose ends strictly enforced. 
-#' 
-#' Currently transplant is only well defined for haploid / unphased graphs ie those
-#' where there is at most a single interval representing a given
-#' reference genomic regions. 
-#' 
-#' @param gg "haploid" gGraph with field $cn
-#' @param donor donor (haploid) gGraph or Junction object with field $cn 
-#' @param footprint if gGraph not provided then footprint can be directly provided 
-#' @param lambda loose end penalty to apply to balance operation (1000) to nodes within the donor region
-#' @param L0 flag whether to use L0 penalty (FALSE)
-#' @return balanced gGraph with donor junctions incorporated at their donor cn
-#' @author Marcin Imielinski 
-#' @export
-transplant = function(gg, donor, footprint = NULL, lambda = 1000, L0 = FALSE)
-{
-  if (!('cn' %in% names(gg$nodes$dt) & 'cn' %in% names(gg$edges$dt)))
-    stop('cn field must be defined on the nodes and edges recipient graph')
-
-  tmp = (gg$nodes$gr %>% gr.sum)
-  is.haploid = sum(tmp$score>1)==0
-
-  if (!is.haploid)
-    stop('transplant is only defined on haploid graphs, please check your graph for overlapping nodes')
-
-  gg = gg$copy
-
-  if (inherits(donor, 'gGraph'))
-  {
-    footprint = donor$footprint
-    junctions = donor$junctions[type == 'ALT']
-    gg$disjoin(donor$gr)
-  } else
-  {
-    junctions = donor
-    if (is.null(footprint)) ## in this case gg is a junction
-    {
-      footprint = junctions$shadow
-    }
-    gg$disjoin(grbind(junctions$grl %>% unlist, footprint))
-  }
-  
-  if (length(junctions)>0 && !('cn' %in% names(junctions$dt)))
-    stop('cn field must be defined on the edges of the donor graph')
-
-  combined.junctions = gg$junctions[type == 'ALT']
-  if (length(junctions)) ## mark as donor and combine with gg $junctions
-    {
-      junctions$set(donor = TRUE)
-      combined.junctions = c(combined.junctions, junctions[, c('cn', 'donor')])
-    }
-  
-  ## mark edges of the donor subgraph as donor = TRUE
-  gg$edges$mark(donor = FALSE)
-
-  ## make new gGraph combining junctions from recipient and donor
-  ggn = suppressWarnings(gG(breaks = gg$nodes$gr,
-                            junctions = combined.junctions))
-
-
-  ## fix cn on the edges of the donor subgraph
-  efix = ggn$edges[type == 'ALT']$dt$edge.id
-
-  fp.nodes = ggn$nodes %&% footprint
-
-  ## relax loose ends everywhere but in the subgraph
-  this.lambda = lambda
-  ggn$nodes$mark(lambda = 1)
-  fp.nodes$mark(lambda = this.lambda)
-
-  ## balance combined gGraph
-  ggn = balance(ggn, efix = efix, nrelax = fp.nodes$dt$node.id, L0 = L0)
-  
-  return(ggn)
-}
-
-#' @name bcheck
-#' @title bcheck
-#' @description
-#'
-#' Checks if genome graph with node and edge cn's annotated and loose cn's is balanced.
-#' 
-#' @param gg gGraph with node and edge fields 'cn', node field loose.cn.left and loose.cn.right
-bcheck = function(gg)
-{
-  lmerge = merge(gg$nodes$dt, gg$nodes$eleft$dt, by.x = 'node.id', by.y = 'n2', all.x = TRUE)
-  rmerge = merge(gg$nodes$dt, gg$nodes$eright$dt, by.x = 'node.id', by.y = 'n1', all.x = TRUE)
-
-  if (!('loose.cn.left' %in% names(lmerge)))
-    {
-      lmerge[, loose.cn.left := 0]
-    }
-
-  if (!('loose.cn.right' %in% names(rmerge)))
-    {
-      rmerge[, loose.cn.right := 0]
-    }
-
-  out = merge(lmerge[, .(left = cn.x[1] - sum(cn.y, na.rm = TRUE) - loose.cn.left[1]), keyby = node.id],
-              rmerge[, .(right = cn.x[1] - sum(cn.y, na.rm = TRUE) - loose.cn.right[1]), keyby = node.id])[.(gg$nodes$dt$node.id), ]
-  out[, balanced := left == 0 & right == 0]
-
-  return(out)
-}
-
-#' @name loosefix
-#' @title loosefix
-#' @description
-#'
-#' Updates loose.left, loose.right, loose.cn.left, loose.cn.right based on value of cn field
-#' 
-#' @param gg gGraph with node and edge fields 'cn', node field loose.cn.left and loose.cn.right
-#' @author Marcin Imielinski
-loosefix = function(gg)
-{
-  lmerge = merge(gg$nodes$dt, gg$nodes$eleft$dt, by.x = 'node.id', by.y = 'n2', all.x = TRUE)
-  rmerge = merge(gg$nodes$dt, gg$nodes$eright$dt, by.x = 'node.id', by.y = 'n1', all.x = TRUE)
-
-  ## mark any cn NA as 0
-  gg$nodes[is.na(cn)]$mark(cn = 0)
-  gg$edges[is.na(cn)]$mark(cn = 0)
-
-  out = merge(lmerge[, .(loose.cn.left = cn.x[1] - sum(cn.y, na.rm = TRUE)), keyby = node.id],
-              rmerge[, .(loose.cn.right = cn.x[1] - sum(cn.y, na.rm = TRUE)), keyby = node.id])[.(gg$nodes$dt$node.id), ]
-
-
-  out[is.na(loose.cn.left), loose.cn.left := 0]
-  out[is.na(loose.cn.right), loose.cn.right := 0]
-
-  if (any(ix <- (out$loose.cn.left<0 | out$loose.cn.right<0)))
-    stop(sprintf('some nodes (%s) have a higher incoming or outgoing edge copy number than the number of node copies', paste(ix %>% which, collapse = ',')))
-
-  gg$nodes$mark(loose.cn.left = out$loose.cn.left,
-                loose.cn.right = out$loose.cn.right)
-
-  nodes = gg$nodes;
-  nodes$loose.left = out$loose.cn.left > 0
-  nodes$loose.right = out$loose.cn.right > 0
-
-  return(gg)
-}
-
-
-#' @name peel
-#' @title peel
-#' @description
-#'
-#' Greedily "peels" walks off a genome graph with node and edge field "cn"
-#' by finding successive flows that maximizes some edge metadata field (specified as field)
-#' which if not specified will be a logical field type == 'ALT' specifying whether that
-#' junction is an ALT edge
-#'
-#' @param gg gGraph with field cn
-#' @param field edge metadata field to use to rank walk solutions (default edge field type == 'ALT')s
-#' @param verbose flag = 1 regular verbosity, 2 = dump out Rcplex traces
-#' @param embed.loops logical flag (FALSE) if TRUE will embed all the loops in the output into an arbitrary linear (path) walk
-#' @author Marcin Imielinski
-#' @return collection of gWalks annotated with cn on the original that when added will give the marginal copy profile on inputted nodes and edges
-#' @export
-peel = function(gg, field = NULL, embed.loops = FALSE, verbose = FALSE)
-{
-  gg = refresh(loosefix(gg))
-  gg.og = refresh(gg)
-
-  ## mini function to compute the cn of a walk in a graph by finding the min cn across its associated
-  ## nodes, edges, and loose ends
-  .mincn = function(walks)
-  {
-    source = walks$eval(node = snode.id[1])
-    sink = walks$eval(node = snode.id[.N])
-
-    source.loose = walks$eval(node = ifelse(strand[1] == '+', loose.cn.left[1], loose.cn.right[1]))
-    sink.loose =  walks$eval(node = ifelse(strand[.N] == '+', loose.cn.right[.N], loose.cn.left[.N]))
-
-    edge.min = Inf
-    if (walks$edges %>% length)
-      edge.min = walks$eval(edge = data.table(cn, id = abs(sedge.id))[, .(CN = cn[1]/.N), by = id][, min(floor(CN),  na.rm = TRUE)])
-
-    node.min = walks$eval(node = data.table(cn, id = abs(snode.id))[, .(CN = cn[1]/.N), by = id][, min(floor(CN),  na.rm = TRUE)])
-
-    pmin(
-      ifelse(walks$circular, Inf, ## if circular no loose end capacity constraints
-             ## fold back edge case where we mihgt overestimate our loose end capacity
-      ifelse(source == -sink, floor(source.loose/2), 
-             pmin(source.loose, sink.loose))), # otherwise we are limited by the loose end cn on each walk side
-      edge.min, ## and the internal edge cn
-      node.min ## and the internal node cn
-    )
-  }
-   
-
-  if (is.null(field))
-  {
-    gg$edges$mark(alt = gg$edges$dt$type == 'ALT')
-    field = 'alt'
-  }
-
-  ## .check = function(gw, gg)
-  ## {
-  ##   gg2 = gW(grl = rep(gw$grl, gw$dt$cn), circular = rep(gw$circular, gw$dt$cn))$graph;
-  ##   gg2$nodes$mark(cn = 1);
-  ##   gg2$edges$mark(cn = 1);
-  ##   gg2 = c(gg$copy, gg2)$disjoin()
-  ##   gg2$nodes$mark(CN = gg2$nodes$dt$cn)
-  ##   return(gg2)
-  ## }
-
-  out = gW(graph = gg)    
-
-  ## loop next batch of max flow walks until no more walks
-  while (length(walks <- gg$maxflow(walk = TRUE, path.only = FALSE, multi = TRUE, efield = field, cfield = 'cn', verbose = verbose)))
-  {
-    ## gg2 = .check(out, gg)
-    ## cc =  gr2dt(gg.og$nodes$gr %*% gg2$nodes$gr)[CN != cn, .(node.id, cn, CN)]
-    ## ee = merge(gg.og$junctions, gg2$junctions)$dt[cn != cn.1, .(edge.id, edge.id, cn, cn.1)]
-    ## if (nrow(cc))
-    ##   browser()
-
-    ## first check for any walks that contain
-    ## a node from another walk (edge cases involving reverse complements)
-    walks = walks[rev(order(walks$dt$wid))]
-    dups = dunlist(walks$snode.id)[, .(count = length(unique(listid)), listid = unique(listid)),
-                              by = .(id = abs(V1))][count>1, ]
-
-    if (walks$edges %>% length)
-    {
-      dups = rbind(dups, dunlist(walks$sedge.id)[, .(count = length(unique(listid)), listid = unique(listid)),
-                              by = .(id = paste0('e', abs(V1)))][count>1, ])
-    }
-    
-    if (nrow(dups))
-    {
-      keep = !(1:length(walks) %in% dups[duplicated(id), listid %>% unique])
-      walks = walks[keep]
-    }
-
-
-    ## cn of walk is the min cn of edges, correcting for edges that get hit multiple times
-    walks$set(cn = .mincn(walks))
-
-    if (verbose)
-      message('Peeling off ', length(walks), ' walks with max width ',
-              max(walks$dt$wid), ' and max copy ', max(walks$dt$cn, na.rm = TRUE))
-
-    ## add batch to output
-    out = c(walks, out)
-    
-    ## peel off graph
-    ## annotate walk with min cn
-    while (length(walks <- walks[cn>0]))
-    {
-      ## figure out how much to decrement
-      ## aggregate in case both strands of node is in a walk
-#      gg.cache = gGnome::refresh(gg)
-
-      ndec = dunlist(walks$snode.id)[, wcn := walks$dt$cn[listid]][, .(dec = sum(wcn)), keyby = .(nid = abs(V1))]
-      ndec$cn = gg$nodes[ndec$nid]$dt$cn - ndec$dec
-      gg$nodes[ndec$nid]$mark(cn = ndec$cn)
-
-      if (walks$edges %>% length)
-      {
-        edec = dunlist(walks$sedge.id)[, wcn := walks$dt$cn[listid]][!is.na(V1), .(dec = sum(wcn)), keyby = .(eid = abs(V1))]
-        edec$cn = gg$edges[edec$eid]$dt$cn - edec$dec
-        gg$edges[edec$eid]$mark(cn = edec$cn)
-      }
-            
-      ## to recompute walk cn have to take into account edges that get hit more than once
-      gg = loosefix(gg)
-      walks$set(cn = .mincn(walks))
-#      bc = bcheck(gg)[balanced==FALSE, ]      
-
-      ## if (gg$nodes$dt[, any(loose.cn.left<0 | loose.cn.right<0)])
-      ##   browser()
-
-      ## if (nrow(bc))
-      ##   browser()
-    }
-
-    if (verbose)
-    {
-      ploidy = gg$nodes$dt[, sum(cn*width, na.rm = TRUE)/sum((1+0*cn)*width, na.rm = TRUE)]
-        message('... remaining ploidy ', round(ploidy,2), ' across ', sum(gg$nodes$dt$cn>0, na.rm = TRUE), ' nodes and ', sum(gg$edges$dt$cn>0), ' edges with nonzero CN remaining in graph with total ', length(out), ' walks in collection')
-      }
-  }
-
-  ## recast walks around original (input) graph
-  walks = gW(snode.id = out$snode.id, circular = out$circular, meta = out$dt, graph = gg.og)
-  walks = walks[rev(order(wid))]
-
-  if (embed.loops)
-    walks = embedloops(walks[circular == TRUE], walks[circular == FALSE], verbose = verbose>1)
-
-  return(walks)
-}
-
-
-#' @name embedloops
-#' @description embedloops
-#'
-#' Attempts to embed / tarnsplant a set of circular walks (loops)
-#' into a set of recipients, both defined on the same gGraph, and 
-#' both optionally having the metadata $cn.  Returns a gWalk with as
-#' many of the loops embedded into the recipients as possible.
-#'
-#' A loop needs to intersect some node in order to be successfully embedded.
-#' (note: that node may be in another loop, if that loop gets embedded)
-#' (As a result though, we don't guarantee that every loop will be enbedded)
-#'
-#' Noter: loops with cn>1 will be embedded in tandem.  
-#'
-#' @param loops circular gWalk object
-#' @param recipient set of gWalks defined on the same object
-#' @return gWalk with as many loops embedded into recipients as possible
-#' @export
-#' @author Marcin Imielinski
-embedloops = function(loops, recipients, verbose = FALSE)
-{
-  if (length(loops)==0)
-    return(recipients)
-
-  if (length(recipients)==0)
-    return(loops)
-
-  if (!all(loops$circular))
-    stop('Loops must be circular walks')
-
-  ## if cn not set then set to 1
-  if (is.null(loops$dt$cn))
-    loops$set(cn = 1)
-
-  ## if cn not set then set to 1
-  if (is.null(recipients$dt$cn))
-    recipients$set(cn = 1)
-
-  ## subroutine to embed
-  .embed = function(donor, recipients)
-  {
-    entry = dunlist(recipients$snode.id)[abs(V1) %in% abs(donor$snode.id[[1]]), ][1,]
-    
-    if (is.na(entry$V1))
-      return(list(embedded = FALSE, recipients = recipients))
-    
-    if (entry$V1 %in% -donor$snode.id[[1]])
-      donor = donor[-1] ## reverse complement
-    
-    ## pivot donor around candidate node
-    snid = donor$snode.id[[1]]
-    ix = match(entry$V1, snid)
-    snidp = snid[c(ix:length(snid))]
-    snidp = c(snidp, setdiff(snid, snidp))
-    
-    this.recipient = recipients[entry$listid %>% as.integer]
-    snidr = this.recipient$snode.id[[1]]
-
-    ix = match(entry$V1, snidr)
-
-    ## insert (several copies) of pivoted donor cycle just upstream of the entry point
-    snid.new = c(snidr[seq_len(ix-1)], rep(snidp, donor$dt$cn), snidr[ix:length(snidr)])
-
-    ## make new walk
-    gw.new = gW(snode.id = list(snid.new), graph = this.recipient$graph, circular = this.recipient$circular, meta = this.recipient$meta)
-
-    ## if recipient walk has more than one copy then we will need to preserve additional
-    ## "original" copies of this.recipient walk
-    if (this.recipient$dt$cn>1)
-    {
-      this.recipient$set(cn = this.recipient$dt$cn - 1)
-      gw.new$set(cn = 1)
-      gw.new = c(gw.new, this.recipient)
-    }
-   
-    recipients = c(recipients[setdiff(1:length(recipients), entry$listid)], gw.new)
-
-    recipients$set(numalt = recipients$eval(edge = sum(type == 'ALT')))
-    recipients = recipients[rev(order(numalt))]
-                   
-    if (verbose)
-    {
-      message('Embedded loop ', paste(snidp, collapse = '->'), ' into recipient ', paste(snidr, collapse = '->'), ' giving ', paste(snid.new, collapse = '->'))
-    }
-
-    return(list(embedded = TRUE, recipients = recipients))
-  }
-  
-  embedded = done = rep(FALSE, length(loops))
-
-  while (!all(done))
-  {
-    old.recipients = copy(recipients)
-    for (i in which(!done))
-    {
-      res = .embed(loops[i], recipients)
-      embedded[i] = done[i] = res$embedded
-      recipients = res$recipients
-    }
-
-    ## we give up if a round of embedding does not change recipients
-    if (identical(old.recipients$snode.id, recipients$snode.id))
-      done = rep(TRUE, length(loops))
-  }
-
-  ## return (modified) recipients and any leftover loops
-  return(c(recipients, loops[which(!embedded)]))
-}
-
-#' @name binstats
-#' @title binstats
-#' @description
-#'
-#' Given GRanges of binned eg read depth data with field $cn, crosses 
-#' nodes in graph and annotates graph nodes with
-#' 'cn' and 'weight'.  Done upstream of balance. 
-#'
-#' If "by" field(s) specified and these fields exist in both
-#' bins and graph nodes, then will use these in the overlaps query
-#'
-#' If field, purity, and ploidy provided then will
-#' also transform read depth data in bin column "field"
-#' using purity and ploidy to generate
-#' @param gg gGraph
-#' @param bins GRanges with field $cn or field field
-#' @param by optional character vector specifying metadata field(s) shared by gg$nodes and bins to which bin / node overlaps will be limited to
-#' @param field character vector field of bins to convert to (NULL)
-#' @param purity purity parameter either specified together with field or embedded in gg$meta, must be specified if field is not NULL
-#' @param ploidy ploidy parameter either specified together with field or embedded in gg$meta, must be specified if field is not NULL
-#' @param min.bins minimum number of bins to use for intra segment variance computation (3)
-#' @param loess logical flag whether to smooth / fit variance using loess (FALSE)
-#' @param min.var minimal allowable per segment bin variance, which will ignore segments with very low variance due to all 0 or other reasons (0.1)
-#' @return gGraph whose nodes are annotated with $cn and $weight field
-#' @export
-#' @author Marcin Imielinski
-binstats = function(gg, bins, by = NULL, field = NULL, purity = gg$meta$purity, ploidy = gg$meta$ploidy, loess = TRUE, min.bins = 3, verbose = TRUE, min.var = 0.1)
-{
-  gg = gg$copy
-
-  if (!is.null(field) & !is.null(purity) & !is.null(ploidy) && is.numeric(purity) && is.numeric(ploidy))
-  {
-    if (verbose)
-      message('Converting ', field, ' to cn using purity ', purity, ' and ploidy ', ploidy)
-
-    bins$cn = rel2abs(bins, field = field, purity = purity, ploidy = ploidy)
-  }
-
-  if (is.null(bins$cn))
-    stop('bins must have field cn or a field, purity, and ploidy must be specified where field is a column in bins')
-
-  if (verbose)
-    message('crossing nodes and bins via gr.findoverlaps')
-  ov = gr.findoverlaps(gg$nodes$gr, bins, by = by, scol = names(values(bins)), return.type = 'data.table')
-  if (verbose)
-    message('aggregating bin stats per node')
-  dt = ov[!is.na(cn), .(mean = mean(cn, na.rm = TRUE), var = var(cn, na.rm = TRUE), nbins = .N), keyby = query.id][.(1:length(gg$nodes)), ]
-  dt[nbins<min.bins, var := NA]
-
-  if (loess)
-  {
-    min.var = pmax(min.var, min(dt$var, na.rm = TRUE)) ## min allowable var
-    loe = dt[!is.na(var) & !is.na(mean), loess(var ~ mean, weights = nbins)] ## loe = tmp[, loess(var ~ mean)] ##                  
-    dt$var.obs = dt$var
-    dt$var = pmax(min.var, predict(loe, dt$mean))
-  }
-
-
-  if (verbose)
-    message('computing weights and returning')
-  dt$weight = dt$nbins/(2*dt$var)
-
-
-  if (any(is.infinite(dt$weight), na.rm = TRUE))
-    warning('variance computation yielded infinite weight, consider setting nbins higher or using loess fit')
-
-  gg$nodes$mark(cn = dt$mean, weight = dt$weight)
-  return(gg)
-}
-
-#' @name phased.binstats
-#' @title phased.binstats
-#' @description
-#'
-#' Given GRanges minor and major allele CN and a balanced but unphased gGraph,
-#' prepares phased gGraph input to balance.
-#' TODO:
-#' If field, purity, and ploidy provided then will
-#' also transform read depth data in bin column "field"
-#' using purity and ploidy to generate
-#' @param gg gGraph
-#' @param bins GRanges with field $cn or field field
-#' @param purity purity parameter either specified together with field or embedded in gg$meta, must be specified if field is not NULL
-#' @param ploidy ploidy parameter either specified together with field or embedded in gg$meta, must be specified if field is not NULL
-#' @param min.bins minimum number of bins to use for intra segment variance computation (3)
-#' @param loess logical flag whether to smooth / fit variance using loess (FALSE)
-#' @param verbose (bool)
-#' @param min.var minimal allowable per segment bin variance, which will ignore segments with very low variance due to all 0 or other reasons (0.1)
-#' @param mc.cores (int) number of cores
-#' @return gGraph whose nodes are annotated with $cn and $weight field
-phased.binstats = function(gg, bins = NULL, purity = gg$meta$purity, ploidy = gg$meta$ploidy, loess = TRUE, min.bins = 3, verbose = TRUE, min.var = 0.1, mc.cores = 8)
-{
-  #' prepare skeleton for phased gGraph (to be populated with CN estimates)
-  if (verbose == TRUE) {
-    message("Preparing phased gGraph...")
-  }
-
-  #' create GRanges corresponding to nodes of major and minor allele graphs
-  n.nodes = length(gg$nodes) ## get number of nodes in the original unphased graph
-  major.nodes.gr = gg$nodes$gr[,c("node.id")]
-  minor.nodes.gr = gg$nodes$gr[,c("node.id")]
-  #' assign unique node.id and store original node id as og.node.id
-  names(values(major.nodes.gr)) = c("og.node.id") ## store original node ID
-  names(values(minor.nodes.gr)) = c("og.node.id")
-  major.nodes.gr$node.id = major.nodes.gr$og.node.id ## keep node.id of major allele
-  minor.nodes.gr$node.id = minor.nodes.gr$og.node.id + n.nodes ## shift node.id of minor allele
-  #' label whether node belongs to major or minor allele
-  major.nodes.gr$allele = "major"
-  minor.nodes.gr$allele = "minor"
-
-  #' create data.tables corresponding to  edges that go straight across
-  major.edges.dt = gg$edges$dt[, .(og.edge.id = edge.id,
-                                   n1.side, n2.side,
-                                   n1, n2,
-                                   connection = "straight")] ## indicate connection type
-  minor.edges.dt = gg$edges$dt[, .(og.edge.id = edge.id,
-                                   n1.side, n2.side,
-                                   n1 = n1 + n.nodes, ## convert n1, n2 node.id to minor allele counterparts
-                                   n2 = n2 + n.nodes,
-                                   connection = "straight")] ## indicate connection type
-  #' get data.table for edges that cross from major to minor allele
-  cross.edges.dt = mclapply(1:nrow(major.edges.dt),
-                            function(ix) {
-                              row = major.edges.dt[ix,]
-                              n1.side = row$n1.side
-                              n2.side = row$n2.side
-                              new.n1 = row$n1
-                              new.n2 = row$n2 + n.nodes ## covert n2 to minor allele node index
-                              new.row = data.table(
-                                og.edge.id = row$og.edge.id,
-                                n1.side = n1.side,
-                                n2.side = n2.side,
-                                n1 = c(row$n1, row$n1 + n.nodes),
-                                n2 = c(row$n2 + n.nodes, row$n2),
-                                connection = "cross" ## indicate connection type
-                              )
-                              return(new.row)
-                            },
-                            mc.cores = mc.cores) %>% rbindlist()
-
-  #' create new gGraph
-  phased.nodes = c(major.nodes.gr, minor.nodes.gr)
-  phased.edges = list(major.edges.dt, minor.edges.dt, cross.edges.dt) %>% rbindlist()
-  phased.gg = gG(nodes = phased.nodes, edges = phased.edges)
-
-  #' update edge colors for plotting
-  phased.gg$edges[connection == "cross" & type == "REF"]$mark(col = "light blue")
-  phased.gg$edges[connection == "cross" & type == "ALT"]$mark(col = "pink")
-  phased.gg$edges[connection == "straight" & type == "REF"]$mark(col = "blue")
-  phased.gg$edges[connection == "straight" & type == "ALT"]$mark(col = "red")
-
-  #' update node colors for plotting
-  phased.gg$nodes[allele == "major"]$mark(col = "red")
-  phased.gg$nodes[allele == "minor"]$mark(col = "blue")
-
-  #' check that bins has required fields for minor.cn and major.cn
-  if (is.null(bins$major.cn) | is.null(bins$minor.cn)) {
-    stop("bins must have fields major.cn and minor.cn")
-  }
-
-  #' overlap major/minor alleles with bins separately
-  if (verbose) {
-    message("crossing nodes and bins via gr.findoverlaps")
-  }
-  #' prepare bins for finding overlaps by adding allele and cn columns
-  major.bins = granges(bins[,"major.cn"], use.mcols = TRUE)
-  minor.bins = granges(bins[,"minor.cn"], use.mcols = TRUE)
-  names(values(major.bins)) = c("cn") ## change metadat column name to cn
-  names(values(minor.bins)) = c("cn")
-  major.bins$allele = "major"
-  minor.bins$allele = "minor"
-  ov = gr.findoverlaps(phased.nodes, ## concatenated GRanges for phased gGraph
-                       c(major.bins, minor.bins), ## concatenate major and minor bins
-                       by = c("allele"), ## only find overlaps if alleles field is matching
-                       qcol = c("node.id"),
-                       scol = c("allele", "cn"),
-                       return.type = "data.table")
-
-  #' compute bin stats per node
-  if (verbose) {
-    message("aggregating bin stats per node")
-  }
-  dt = ov[!is.na(cn),
-          .(mean = mean(cn, na.rm = TRUE),
-            var = var(cn, na.rm = TRUE),
-            nbins = .N),
-          keyby = node.id] %>%
-    merge(ov[which(!duplicated(node.id)), .(node.id, allele)],
-          by = "node.id",
-          all.y = TRUE) %>% ## right join (since na.rm was true, might be missing some nodes)
-    .[order(node.id)] ## sort by node id
-
-  #' set variance to NA if number of bins is less than specificied minimum
-  dt[nbins < min.bins, var := NA]
-
-  #' compute weights (nbins / variance)
-  ## for now adding a jitter
-  dt[, ":="(weight = nbins / (2 * var + 1e-2))]
-
-  #' add cn (dt$mean) and weight to phased gGraph
-  phased.gg$nodes$mark(cn = dt$mean, weight = dt$weight)
-
-  if (any(is.infinite(dt$weight), na.rm = TRUE)) {
-    warning('variance computation yielded infinite weight, consider setting min.bins higher or using loess fit')
-  }
-
-  return(phased.gg)
-}
-
-
-
-#' @name fitcn
-#' @title fitcn
-#' @author Julie Behr, Xiaotong Yao
-#'
-#' @param gw input gWalks
-#' @param cn.field character column names of each graph's CN data
-#' 
-#' @export
-fitcn = function (gw, cn.field = "cn", trim = TRUE, weight = NULL, obs.mat = NULL, verbose = TRUE, 
-                  min.alt = TRUE, edgeonly = FALSE, evolve = FALSE, n.sol = 2, return.gw = TRUE,
-                  sep = "_")
-{
-    ## gw = self$copy
-    gw = gw$copy
-    gg = gw$graph$copy
-    stopifnot(all(cn.field %in% colnames(gg$nodes$dt)) & all(cn.field %in% colnames(gg$edges$dt)))
-    ## if (is.null(gw$graph$nodes$dt$cn) | is.null(gw$graph$edges$dt$cn)) {
-    ##     stop("cn field is missing from node and edge metadata")
-    ## }
-    gg$nodes$mark(cn = rowSums(as.matrix(gg$nodes$dt[, cn.field, with = FALSE])))
-    rcn = sapply(cn.field, function(colnm) gg$nodes$eval(sum(cn)))
-    lcn = sapply(cn.field, function(colnm) gg$nodes$eval(sum(cn), right = FALSE))
-    rcn = gg$nodes$eval(sum(cn))
-    lcn = gg$nodes$eval(sum(cn), right = FALSE)
-    jbal.left = all(
-    (lcn == rowSums(as.matrix(gg$nodes$dt[, cn.field, with = FALSE])) & !gg$nodes$dt$loose.left) | 
-    (lcn <= rowSums(as.matrix(gg$nodes$dt[, cn.field, with = FALSE])) & gg$nodes$dt$loose.left),
-    na.rm = TRUE)
-    jbal.right = all(
-    (rcn == rowSums(as.matrix(gg$nodes$dt[, cn.field, with = FALSE])) & !gg$nodes$dt$loose.right) |
-    (rcn <= rowSums(as.matrix(gg$nodes$dt[, cn.field, with = FALSE])) & gg$nodes$dt$loose.right),
-    na.rm = TRUE)
-    if (!jbal.left | !jbal.right) 
-        warning("graph does not appear to be junction balanced, please check inputs")
-    ## helper function to get the problem right
-    constrain.evolution = function(K, gw, A, b, sense){
-        h = K[gw$edges[type == "ALT"]$dt[!duplicated(edge.id), 
-                                         edge.id], ]
-        A = rbind(A, cbind(sparseMatrix(1, 1, x = 0, dims = dim(h)), 
-                           h))
-        b = c(b, rep(1, nrow(h)))
-        sense = c(sense, rep("L", nrow(h)))
-        return(list(A = A, b = b, sense = sense))
-    }
-    constrain.observations = function(obs.mat, A, b, cvec, sense, 
-                                      vtype) {
-        if (!(ncol(obs.mat) * 2) == ncol(A)) 
-            stop("input obs.mat contains the wrong number of columns; should match length of gw")
-        p = nrow(obs.mat)
-        w = ncol(obs.mat)
-        Zero = sparseMatrix(1, 1, x = 0, dims = c(2 * w * p, 
-                                                  2 * w * p))
-        A0 = Zero[rep(1, nrow(A)), 1:(2 * p)]
-        Ap = cbind(Zero[rep(1, p), 1:w], sign(obs.mat), diag(rep(-1, 
-                                                                 p)), Zero[rep(1, p), 1:p])
-        Mpub = cbind(Zero[rep(1, p), 1:(2 * w)], diag(rep(1, 
-                                                          p)), diag(rep(-1e+07, p)))
-        Mplb = cbind(Zero[rep(1, p), 1:(2 * w)], diag(rep(1, 
-                                                          p)), diag(rep(-0.1, p)))
-        Amp = rbind(cbind(A, A0), Ap, Mpub, Mplb)
-        b = c(b, rep(0, 3 * p))
-        cvec = c(cvec, rep(0, p), -1 * rowMax(obs.mat))
-        sense = c(sense, rep("E", p), rep("L", p), rep("G", p))
-        vtype = c(vtype, rep("I", p), rep("B", p))
-        return(list(A = Amp, b = b, c = cvec, sense = sense, 
-                    vtype = vtype))
-    }
-    generate.Ke = function(gw) {
-        dt = gw$edgesdt[, c("walk.id", "sedge.id")][, `:=`(edge.id, abs(sedge.id))]
-        dt$listid = factor(dt$walk.id, 1:length(gw))
-        dt$edge.id = factor(dt$edge.id, gg$edgesdt$edge.id)
-        cdt = dcast(dt[!is.na(sedge.id), ], listid ~ edge.id, 
-                    fun.aggregate = length, value.var = "edge.id", drop = FALSE)
-        mat = cdt[, -1]
-        rownames(mat) = cdt$listid
-        return(t(mat))
-    }
-    generate.Kn = function(gw) {
-        dt = gw$nodesdt[, c("walk.id", "snode.id")][, `:=`(node.id, 
-                                                           abs(snode.id))]
-        dt$listid = factor(dt$walk.id, 1:length(gw))
-        dt$node.id = factor(dt$node.id, gg$nodes$dt$node.id)
-        cdt = dcast(dt[!is.na(snode.id), ], listid ~ node.id, 
-                    fun.aggregate = length, value.var = "node.id", drop = FALSE)
-        mat = cdt[, -1]
-        rownames(mat) = cdt$listid
-        return(t(mat))
-    }
-    generate.Amat = function(K, nblock = 1) {
-        M = 1e+07
-        K = as(K, "sparseMatrix")
-        w = ncol(K)
-        Zero = sparseMatrix(1, 1, x = 0, dims = c(2 * w * nblock, 2 * w * nblock))
-        Amub = cbind(
-            do.call(`cbind`, lapply(seq_len(nblock), function(i) diag(rep(1, w)))), diag(rep(-M, w))
-        )
-        Amlb = cbind(
-            do.call(`cbind`, lapply(seq_len(nblock), function(i) diag(rep(1, w)))), diag(rep(-0.1, w))
-        )
-        A = rbind(
-            ## cbind(K, Zero[rep(1, nrow(K)), (w + 1:w)]),
-            cbind(Reduce(`diagc`, lapply(seq_len(nblock), function(i) K)), Zero[rep(1, nrow(K) * nblock), (w + 1:w)]),
-            Amub,
-            Amlb
-        )
-        return(A)
-    }
-    generate.bvec = function(e, K) {
-        w = ncol(K)
-        bvec = c(c(e), rep(0, 2 * w))
-        return(bvec)
-    }
-    generate.cvec = function(K, weight, min.alt, gw, nblock = 1) {
-        if (!is.null(weight) | min.alt) 
-            weight = prep.weight(K, weight, min.alt, gw)
-        w = ncol(K)
-        if (is.null(weight)) 
-            weight = rep(1, w)
-        cvec = c(rep(0, w * nblock), weight)
-        return(cvec)
-    }
-    prep.weight = function(K, weight, min.alt, gw) {
-        if (!is.null(weight)) {
-            if (length(weight) == 1 & is.character(weight) & 
-                weight %in% colnames(gw$dt)) 
-                weight = gw$dt[, weight, with = F]
-            if (!(is.numeric(weight))) {
-                stop("weight must either be numeric vector of same length as gw or the name of a single numeric annotation in gw")
-            }
-        }
-        if (min.alt) {
-            if (!is.null(weight)) {
-                warning("modifying input weight to satisfy min.alt=TRUE")
-            }
-            else weight = rep(1, ncol(K))
-            numalt = gw$eval(edge = sum(type == "ALT"))
-            weight[is.na(numalt) | numalt == 0] = 0
-        }
-        return(weight)
-    }
-    generate.vtype = function(K, nblock = 1) {
-        w = ncol(K)
-        vtype = c(rep("I", w * nblock), rep("B", w))
-        return(vtype)
-    }
-    generate.sense = function(K, nblock = 1) {
-        w = ncol(K)
-        r = nrow(K)
-        sense = c(rep("E", r * nblock), rep("L", w), rep("G", w))
-        return(sense)
-    }
-    diagc = function(mat1, mat2){
-        out = matrix(0, nrow = nrow(mat1) + nrow(mat2), ncol = ncol(mat1) + ncol(mat2))
-        el1 = which(as.matrix(mat1)!=0, arr.ind = TRUE)
-        el2 = el2.ori = which(as.matrix(mat2)!=0, arr.ind = TRUE)
-        el2[, "row"] = el2.ori[, "row"] + nrow(mat1)
-        el2[, "col"] = el2.ori[, "col"] + ncol(mat1)
-        out[rbind(el1, el2)] = c(mat1[el1], mat2[el2.ori])
-        return(out)
-    }
-    ## start building the problem
-    if (any(!cn.field %in% colnames(gw$graph$edges$dt))) {
-        stop("cn field must be populated in the input graph node and edges metadata")
-        ## already stopped, what is this for??
-        ## if (trim) {
-        ##     e = rep(1, length(unique(abs(unlist(gw$sedge.id)))))
-        ##     e2 = rep(1, length(unique(abs(unlist(gw$snode.id)))))
-        ## }
-        ## else {
-        ##     e = rep(1, length(gw$graph$edges))
-        ##     e2 = rep(1, length(gw$graph$nodes))
-        ## }
-    }
-    else {
-        if (trim) {
-            e = as.matrix(
-                gw$graph$edges[sedge.id %in% abs(unlist(gw$sedge.id))]$dt[, cn.field, with = FALSE])
-            e2 = as.matrix(
-                gw$graph$nodes[snode.id %in% abs(unlist(gw$snode.id))]$dt[, cn.field, with = FALSE])
-        }
-        else {
-            e = as.matrix(gw$graph$edges$dt[, cn.field, with = FALSE])
-            e2 = as.matrix(gw$graph$nodes$dt[, cn.field, with = FALSE])
-        }
-    }
-    if (edgeonly) {
-        K = generate.Ke(gw)
-    }
-    else {
-        K = rbind(generate.Ke(gw), generate.Kn(gw))
-        e = rbind(e, e2)
-    }
-    if (nrow(K) != nrow(e)) {
-        stop("Mismatch between size of A matrix and length of b vector. Some edges in gw$graph are not covered by gw. If this was intended, try trim=TRUE")
-     }
-    K.unit = K
-    K = Reduce(`diagc`, lapply(seq_len(ncol(e)), function(i) K))
-    ## a bit of cheating, doing the old procedure column by column as vectors
-    ## tmp = lapply(seq_len(ncol(e)), function(i){
-    ##     this.e = e[, i, drop = TRUE]
-    ##     A = generate.Amat(unname(K))
-    ##     b = generate.bvec(this.e, K)        
-    ##     sense = generate.sense(K)
-    ##     return(list(A = A, b = b, sense = sense, vtype = vtype))
-    ## })
-    A = generate.Amat(unname(K.unit), ncol(e))
-    b = generate.bvec(e, K.unit)
-    sense = generate.sense(K.unit, ncol(e))
-    vtype = generate.vtype(K.unit, ncol(e))
-    ## restructure the objective function
-    c = generate.cvec(K.unit, weight, min.alt, gw, ncol(e))
-    ## row bind them into final constraints
-    ## browser()
-    if (evolve) {
-        ll = constrain.evolution(K, gw, A, b, sense)
-        A = ll$A
-        b = ll$b
-        sense = ll$sense
-    }
-    if (!is.null(obs.mat)) {
-        ll = constrain.observations(obs.mat, A, b, c, sense, 
-                                    vtype)
-        A = ll$A
-        b = ll$b
-        c = ll$c
-        sense = ll$sense
-        vtype = ll$vtype
-    }
-    ## browser()
-    ## load customized lb and ub if present ## TODO make ub lb specific to samples
-    if (any(grepl("lb", colnames(gw$dt)))){
-        if (identical(grep("lb", colnames(gw$dt), value = TRUE), "lb")){
-            lb = c(rep(gw$dt$lb, ncol(e)), rep(0, ncol(K.unit)))
-        } else {
-            lb.cols = gsub("cn", "lb", cn.field)
-            if (!all(lb.cols %in% colnames(gw$dt))){
-                lb = 0
-            }
-            lb = do.call("c", c(lapply(lb.cols, function(x) gw$dt[[x]]), rep(0, len(gw))))
-        }
-    } else {
-        lb = 0
-    }
-    if (any(grepl("ub", colnames(gw$dt)))){
-        if (identical(grep("ub", colnames(gw$dt), value = TRUE), "ub")){
-            ub = c(rep(gw$dt$ub, ncol(e)), rep(Inf, ncol(K.unit)))
-        } else {
-            ub.cols = gsub("cn", "ub", cn.field)
-            if (!all(ub.cols %in% colnames(gw$dt))){
-                ub = Inf
-            }
-            ub = do.call("c", c(lapply(ub.cols, function(x) gw$dt[[x]]), rep(Inf, len(gw))))
-        }
-    } else {
-        ub = Inf
-    }
-    ## if (len(lb) != len(ub)){
-    ##     lb = rep(lb, length.out = pmax(len(lb), len(ub)))
-    ##     ub = rep(ub, length.out = pmax(len(lb), len(ub)))
-    ## }
-    ## TODO: implement lb and ub of walk CNs
-    sol = Rcplex::Rcplex(
-        cvec = c,
-        Amat = A,
-        bvec = b,
-        sense = sense, 
-        Qmat = NULL,
-        lb = lb,
-        ub = ub,
-        n = n.sol,
-        objsense = "min", 
-        vtype = vtype,
-        control = list(
-            trace = ifelse(verbose >= 1, 1, 0),
-            tilim = 100,
-            epgap = 1))
-    
-    if (!is.null(sol$xopt)) {
-        sol = list(sol)
-    }
-    if (length(sol) == 0) {
-        stop("No solutions found satisfying given constraints")
-    }
-    ## what is this rerun for???
-    ## to exhaust solutions
-    rerun = T
-    while (rerun) {
-        z = sign(vtype == "B")
-        P = do.call(rbind, lapply(sol, function(x) x$xopt * z))
-        p = rowSums(P) - 1
-        Ahat = rbind(A, P)
-        bhat = c(b, p)
-        sensehat = c(sense, rep("L", length(p)))
-        sol.new = Rcplex::Rcplex(cvec = c, Amat = Ahat, bvec = bhat, 
-            sense = sensehat, Qmat = NULL, lb = lb, ub = ub, 
-            n = n.sol, objsense = "min", vtype = vtype, control = list(trace = ifelse(verbose >= 
-                1, 1, 0), tilim = 100, epgap = 1))
-        if (length(sol.new) == 0) {
-            rerun = F
-        }
-        else {
-            sol = c(sol, sol.new)
-            if (length(sol) >= n.sol) {
-                sol = sol[1:n.sol]
-                rerun = F
-            }
-        }
-    }
-    ## return(sol)
-    ## for (cnm in cn.field){
-    ##     gw$set(eval(paste0("cn.", i)) = xmat[, cnm])
-    ## }
-    ## if (length(sol) > 1) {
-
-    if (return.gw){
-        for (i in seq_along(sol)){
-            this.sol = sol[[i]]
-            this.x = this.sol$xopt
-            this.cnf = rep(c(cn.field, "indicator"), each = len(gw))
-            this.ls = split(this.x, this.cnf)
-            names(this.ls) = paste(names(this.ls), i, sep = sep)
-            do.call(gw$set, this.ls)
-        }
-        return(invisible(gw))
-    } else {
-        return(sol)
-    }
-}
diff --git a/R/gGnome.R b/R/gGnome.R
index bebddfa1..af540797 100644
--- a/R/gGnome.R
+++ b/R/gGnome.R
@@ -6712,22 +6712,21 @@ gG = function(genome = NULL,
 }
 
 
-
 #' @name lengths
 #' @title lengths
 #' @description
 #' A vector of walk lengths associated with this walk
 #'
-#' @param gWalk a \code{gWalk} object
+#' @param x  \code{gWalk} object
+#' @param use.names
 #'
 #' @return the number of nodes in the gWalk
 #' @export
-`lengths.gWalk` = function(gWalk, use.names = FALSE){
-  return(gWalk$lengths)
+`lengths.gWalk` = function(gWalk, use.names = TRUE){
+    return(gWalk$lengths)
 }
 
 
-
 #' @name length
 #' @title length
 #' @description
@@ -7964,20 +7963,20 @@ gWalk = R6::R6Class("gWalk", ## GWALKS
                               vtype = ll$vtype
                           }
 
-                          sol = Rcplex::Rcplex(cvec = c,
-                                               Amat = A,
-                                               bvec = b,
-                                               sense = sense,
-                                               Qmat = NULL,
-                                               lb = 0,
-                                               ub = Inf,
-                                               n = n.sol,
-                                               objsense = "min",
-                                               vtype = vtype,
-                                               control = list(
-                                                   trace = ifelse(verbose>=1, 1, 0),
-                                                   tilim = 100,
-                                                   epgap = 1))
+                          sol = Rcplex2(cvec = c,
+                                        Amat = A,
+                                        bvec = b,
+                                        sense = sense,
+                                        Qmat = NULL,
+                                        lb = 0,
+                                        ub = Inf,
+                                        n = n.sol,
+                                        objsense = "min",
+                                        vtype = vtype,
+                                        control = list(
+                                            trace = ifelse(verbose>=1, 1, 0),
+                                            tilim = 100,
+                                            epgap = 1))
 
                           if (!is.null(sol$xopt)){
                               sol = list(sol)

From 7ef4cbdacf276d200eccebd2ccc032474dab27b0 Mon Sep 17 00:00:00 2001
From: Zi-Ning Choo <zining01@gmail.com>
Date: Mon, 3 May 2021 19:20:55 -0400
Subject: [PATCH 10/35] remove rcplex dependency

---
 R/gGnome.R | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/R/gGnome.R b/R/gGnome.R
index af540797..8dc636ca 100644
--- a/R/gGnome.R
+++ b/R/gGnome.R
@@ -4807,15 +4807,15 @@ gGraph = R6::R6Class("gGraph",
                              ## cap individual signed edge.ids to their capacity
                              ub[1:nrow(ed)] = pmin(ub[1:nrow(ed)], ed[[cfield]])
                            }
-                           sol = Rcplex::Rcplex(Amat = Amat,
-                                          lb = rep(0, ncol(Amat)),
-                                          ub = ub,
-                                          bvec = b$bvec,
-                                          sense = b$sense,
-                                          vtype = 'I', 
-                                          control = list(trace = verbose > 1),
-                                          objsense = ifelse(max, 'max', 'min'),
-                                          cvec = cvec)
+                           sol = Rcplex2(cvec = cvec,
+                                         Amat = Amat,
+                                         bvec = b$bvec,
+                                         lb = rep(0, ncol(Amat)),
+                                         ub = ub,
+                                         sense = b$sense,
+                                         vtype = 'I', 
+                                         control = list(trace = verbose > 1),
+                                         objsense = ifelse(max, 'max', 'min'))
 
                            ## find edges used in the solution
                            opted = ed[round(sol$xopt[1:.N])!=0, ]

From f1beeef7208e9392cbd89a8a137de809c1e11648 Mon Sep 17 00:00:00 2001
From: Zi-Ning Choo <zining01@gmail.com>
Date: Mon, 10 May 2021 10:24:24 -0400
Subject: [PATCH 11/35] add telomeric annotation to loose ends

---
 R/apps.R | 81 ++++++++++++++++----------------------------------------
 1 file changed, 23 insertions(+), 58 deletions(-)

diff --git a/R/apps.R b/R/apps.R
index 526415d2..6289c1fe 100644
--- a/R/apps.R
+++ b/R/apps.R
@@ -399,6 +399,16 @@ balance = function(gg,
     }
 
     if (ism) {
+
+        ## add telomeric annotation
+        qtips = gr.end(si2gr(seqlengths(gg$nodes))) ## location of q arm tips
+        term.in = c(which(start(gg$nodes$gr) == 1), ## beginning of chromosome
+                    -which(gg$nodes$gr %^% qtips)) ## flip side of chromosome end
+        term.out = -term.in ## out is reciprocal of in
+
+        ## annotate loose indicators with this
+        vars[!is.na(snode.id), telomeric := ifelse(snode.id %in% term.in | snode.id %in% term.out, TRUE, FALSE)]
+        
         ## if not phased, must add edge indicators (for just the ALT edges)
         if (!phased) {
             edge.match = match(vars[, sedge.id], gg$sedgesdt$sedge.id)
@@ -593,52 +603,6 @@ balance = function(gg,
       message("adding delta constraints for LP")
     }
 
-    ## ## constrain deltas to be at least zero
-    ## delta.lbs = rbind(
-    ##   vars[type == "ndelta.minus", .(value = 1, id, cid = paste("ndelta.minus.lb", gid))],
-    ##   vars[type == "ndelta.plus", .(value = 1, id, cid = paste("ndelta.plus.lb", gid))],
-    ##   vars[type == "edelta.minus", .(value = 1, id, cid = paste("edelta.minus.lb", gid))],
-    ##   vars[type == "edelta.plus", .(value = 1, id, cid = paste("edelta.plus.lb", gid))],
-    ##   vars[type == "mdelta.minus", .(value = 1, id, cid = paste("mdelta.minus.lb", gid))],
-    ##   vars[type == "mdelta.plus", .(value = 1, id, cid = paste("mdelta.plus.lb", gid))]
-    ## )
-
-    ## delta.lbs.rhs = rbind(
-    ##   vars[type == "ndelta.minus", .(value = 0, sense = "G", cid = paste("ndelta.minus.lb", gid))],
-    ##   vars[type == "ndelta.plus", .(value = 0, sense = "G", cid = paste("ndelta.plus.lb", gid))],
-    ##   vars[type == "edelta.minus", .(value = 0, sense = "G", cid = paste("edelta.minus.lb", gid))],
-    ##   vars[type == "edelta.plus", .(value = 0, sense = "G", cid = paste("edelta.plus.lb", gid))],
-    ##   vars[type == "mdelta.minus", .(value = 0, sense = "G", cid = paste("mdelta.minus.lb", gid))],
-    ##   vars[type == "mdelta.plus", .(value = 0, sense = "G", cid = paste("mdelta.plus.lb", gid))]
-    ## )
-
-
-    ## constraints = rbind(constraints, delta.lbs, fill = TRUE)
-    ## b = rbind(b, delta.lbs.rhs, fill = TRUE)
-
-    ## ## add upper bound to prevent problem from becoming unbounded
-    ## ## constrain deltas to be at least zero
-    ## delta.ubs = rbind(
-    ##   vars[type == "ndelta.minus", .(value = 1, id, cid = paste("ndelta.minus.ub", gid))],
-    ##   vars[type == "ndelta.plus", .(value = 1, id, cid = paste("ndelta.plus.ub", gid))],
-    ##   vars[type == "edelta.minus", .(value = 1, id, cid = paste("edelta.minus.ub", gid))],
-    ##   vars[type == "edelta.plus", .(value = 1, id, cid = paste("edelta.plus.ub", gid))],
-    ##   vars[type == "mdelta.minus", .(value = 1, id, cid = paste("mdelta.minus.ub", gid))],
-    ##   vars[type == "mdelta.plus", .(value = 1, id, cid = paste("mdelta.plus.ub", gid))]
-    ## )
-
-    ## delta.ubs.rhs = rbind(
-    ##   vars[type == "ndelta.minus", .(value = M, sense = "L", cid = paste("ndelta.minus.ub", gid))],
-    ##   vars[type == "ndelta.plus", .(value = M, sense = "L", cid = paste("ndelta.plus.ub", gid))],
-    ##   vars[type == "edelta.minus", .(value = M, sense = "L", cid = paste("edelta.minus.ub", gid))],
-    ##   vars[type == "edelta.plus", .(value = M, sense = "L", cid = paste("edelta.plus.ub", gid))],
-    ##   vars[type == "mdelta.minus", .(value = M, sense = "L", cid = paste("mdelta.minus.ub", gid))],
-    ##   vars[type == "mdelta.plus", .(value = M, sense = "L", cid = paste("mdelta.plus.ub", gid))]
-    ## )
-
-    ## constraints = rbind(constraints, delta.ubs, fill = TRUE)
-    ## b = rbind(b, delta.ubs.rhs, fill = TRUE)
-
     vars[type %like% "delta.plus" | type %like% "delta.minus", ":="(ub = M, lb = 0)]
 
     ## add the residual constraints
@@ -802,9 +766,9 @@ balance = function(gg,
         if (!phased) {
             ## extremity exclusivity (relevant for ALL graphs)
             loose.constraints = rbind(
-                vars[type == "loose.in.indicator" & sign(snode.id) == 1,
+                vars[type == "loose.in.indicator" & sign(snode.id) == 1 & telomeric == FALSE,
                      .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))],
-                vars[type == "loose.out.indicator" & sign(snode.id) == 1,
+                vars[type == "loose.out.indicator" & sign(snode.id) == 1 & telomeric == FALSE,
                      .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))]
             )
 
@@ -825,10 +789,11 @@ balance = function(gg,
             edge.ee.ids = unique(c(vars[type == "edge.indicator", ee.id.n1], vars[type == "edge.indicator", ee.id.n2]))
             edge.ee.ids = edge.ee.ids[!is.na(edge.ee.ids)]
 
+            ## TODO: add these as ub and lb instead of equality constraints
             loose.zeros = rbind(
-                vars[type == "loose.in.indicator" & sign(snode.id) == 1 & ee.id %in% edge.ee.ids,
+                vars[type == "loose.in.indicator" & sign(snode.id) == 1 & ee.id %in% edge.ee.ids & telomeric == FALSE,
                      .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))],
-                vars[type == "loose.out.indicator" & sign(snode.id) == 1 & ee.id %in% edge.ee.ids,
+                vars[type == "loose.out.indicator" & sign(snode.id) == 1 & ee.id %in% edge.ee.ids & telomeric == FALSE,
                      .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))]
             )
 
@@ -841,9 +806,9 @@ balance = function(gg,
         if (phased) {
             ## homologous extremity exclusivity
             loose.constraints = rbind(
-                vars[type == "loose.in.indicator" & sign(snode.id)==1,
+                vars[type == "loose.in.indicator" & sign(snode.id)==1 & telomeric == FALSE,
                      .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id))],
-                vars[type == "loose.out.indicator" & sign(snode.id)==1,
+                vars[type == "loose.out.indicator" & sign(snode.id)==1 & telomeric == FALSE,
                      .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id))]
             )
 
@@ -857,9 +822,9 @@ balance = function(gg,
             constraints = rbind(constraints, loose.constraints, edge.constraints, fill = TRUE)
 
             rhs = unique(rbind(
-                vars[type == "loose.in.indicator" & sign(snode.id)==1,
+                vars[type == "loose.in.indicator" & sign(snode.id)==1 & telomeric == FALSE,
                      .(value = 1, sense = "L", cid = paste("homol.extremity.exclusivity", hee.id))],
-                vars[type == "loose.out.indicator" & sign(snode.id)==1,
+                vars[type == "loose.out.indicator" & sign(snode.id)==1 & telomeric == FALSE,
                      .(value = 1, sense = "L", cid = paste("homol.extremity.exclusivity", hee.id))]
             ), by = "cid")
             
@@ -926,13 +891,13 @@ balance = function(gg,
                      .(value = 1, id, cid = paste("rhee", c4))],
 
                 ## loose indicators
-                vars[type == "loose.in.indicator" & snode.id > 0 & !is.na(s),
+                vars[type == "loose.in.indicator" & snode.id > 0 & !is.na(s) & telomeric == FALSE,
                      .(value = 1, id, cid = paste("rhee", s))],
-                vars[type == "loose.in.indicator" & snode.id > 0 & !is.na(c),
+                vars[type == "loose.in.indicator" & snode.id > 0 & !is.na(c) & telomeric == FALSE,
                      .(value = 1, id, cid = paste("rhee", c))],
-                vars[type == "loose.out.indicator" & snode.id > 0 & !is.na(s),
+                vars[type == "loose.out.indicator" & snode.id > 0 & !is.na(s) & telomeric == FALSE,
                      .(value = 1, id, cid = paste("rhee", s))],
-                vars[type == "loose.out.indicator" & snode.id > 0 & !is.na(c),
+                vars[type == "loose.out.indicator" & snode.id > 0 & !is.na(c) & telomeric == FALSE,
                      .(value = 1, id, cid = paste("rhee", c))]
                 )
 

From c82a4e0a8df4a8b114e82286b9ab25146607af12 Mon Sep 17 00:00:00 2001
From: Zi-Ning Choo <zining01@gmail.com>
Date: Mon, 17 May 2021 10:33:36 -0400
Subject: [PATCH 12/35] add flag to allow cnloh in phased balance

---
 R/apps.R | 101 +++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 64 insertions(+), 37 deletions(-)

diff --git a/R/apps.R b/R/apps.R
index 6289c1fe..b7e78bf8 100644
--- a/R/apps.R
+++ b/R/apps.R
@@ -55,6 +55,7 @@
 #' @param loose.collapse (parameter only relevant if L0 = TRUE) will count all unique (by coordinate) instances of loose ends in the graph as the loose end penalty, rather than each instance alone ... useful for fitting a metagenome graph   (FALSE)
 #' @param phased (logical) indicates whether to run phased/unphased. default = FALSE
 #' @param ism  (logical) additional ISM constraints (FALSE)
+#' @param cnloh (logical) allow CN LOH? only relevant if phasing = TRUE. default FALSE.
 #' @param lp (logical) solve as linear program using abs value (default TRUE)
 #' @param M  (numeric) big M constraint for L0 norm loose end penalty (default 1e3)
 #' @param verbose (integer)scalar specifying whether to do verbose output, value 2 will spit out MIP (1)
@@ -78,6 +79,7 @@ balance = function(gg,
                    M = 1e3,
                    phased = FALSE,
                    ism = FALSE,
+                   cnloh = FALSE,
                    lp = TRUE,
                    verbose = 1,
                    tilim = 10,
@@ -958,47 +960,72 @@ balance = function(gg,
 
         ## REF edge configuration constraint (added by default basically)
         ## only add this if there are no unphased nodes
-        iconstraints.from = unique(
-            vars[type == "edge.indicator" & ref.or.alt == "REF",
-                 .(value = 1, id,
-                   edge.id = abs(sedge.id),
-                   snode.id = from, ## this is actually a misleading name because from is the row in gg$dt
-                   cid = paste("ref.configuration.constraint.from", from))],
-            by = "edge.id"
-        )
+        if (cnloh) {
+            
+            ## if allow CNLOH, the sum of edge indicators corresponding with og edge id is LEQ 2
+            iconstraints = unique(
+                vars[type == "edge.indicator" & ref.or.alt == "REF",
+                     .(value = 1, id, edge.id = abs(sedge.id),
+                       cid = paste("ref.configuration.constraint.cnloh", og.edge.id))],
+                by = "edge.id"
+            )
 
-        iconstraints.to = unique(
-            vars[type == "edge.indicator" & ref.or.alt == "REF",
-                 .(value = 1, id,
-                   edge.id = abs(sedge.id),
-                   snode.id = to,
-                   cid = paste("ref.configuration.constraint.to", to))],
-            by = "edge.id"
-        )
+            rhs = unique(
+                vars[type == "edge.indicator" & ref.or.alt == "REF",
+                     .(value = 2, sense = "L",
+                       cid = paste("ref.configuration.constraint.cnloh", og.edge.id))],
+                by = "cid"
+            )
 
-        iconstraints = rbind(iconstraints.from, iconstraints.to)
+            constraints = rbind(constraints,
+                                iconstraints[, .(value, id, cid)],
+                                fill = TRUE)
+            b = rbind(b, rhs, fill = TRUE)
 
-        ## sum to at most 1 if phased, unconstrained if unphased
-        iconstraints[, ":="(allele = gg$dt$allele[iconstraints$snode.id])]
-        
-        edge.indicator.b = unique(iconstraints[allele %in% c("major", "minor"),
-                                               .(value = 1, sense = "L", cid)],
-                                  by = "cid")
-## rbind(
-##             unique(iconstraints[allele %in% c("major", "minor"),
-##                                 .(value = 1, sense = "L", cid)], by = "cid"),
-##             unique(iconstraints[!(allele %in% c("major", "minor")),
-##                                 .(value = 2, sense = "L", cid)], by = "cid")
-##         )
+        } else {
+            
+            iconstraints.from = unique(
+                vars[type == "edge.indicator" & ref.or.alt == "REF",
+                     .(value = 1, id,
+                       edge.id = abs(sedge.id),
+                       snode.id = from, ## this is actually a misleading name because from is the row in gg$dt
+                       cid = paste("ref.configuration.constraint.from", from))],
+                by = "edge.id"
+            )
 
-        constraints = rbind(
-            constraints,
-            iconstraints[allele %in% c("major", "minor"),
-                         .(value, id, cid)],
-            fill = TRUE)
-        
-        ## add to b
-        b = rbind(b, edge.indicator.b, fill = TRUE)
+            iconstraints.to = unique(
+                vars[type == "edge.indicator" & ref.or.alt == "REF",
+                     .(value = 1, id,
+                       edge.id = abs(sedge.id),
+                       snode.id = to,
+                       cid = paste("ref.configuration.constraint.to", to))],
+                by = "edge.id"
+            )
+
+            iconstraints = rbind(iconstraints.from, iconstraints.to)
+
+            ## sum to at most 1 if phased, unconstrained if unphased
+            iconstraints[, ":="(allele = gg$dt$allele[iconstraints$snode.id])]
+            
+            edge.indicator.b = unique(iconstraints[allele %in% c("major", "minor"),
+                                                   .(value = 1, sense = "L", cid)],
+                                      by = "cid")
+            ## rbind(
+            ##             unique(iconstraints[allele %in% c("major", "minor"),
+            ##                                 .(value = 1, sense = "L", cid)], by = "cid"),
+            ##             unique(iconstraints[!(allele %in% c("major", "minor")),
+            ##                                 .(value = 2, sense = "L", cid)], by = "cid")
+            ##         )
+
+            constraints = rbind(
+                constraints,
+                iconstraints[allele %in% c("major", "minor"),
+                             .(value, id, cid)],
+                fill = TRUE)
+            
+            ## add to b
+            b = rbind(b, edge.indicator.b, fill = TRUE)
+        }
     }
   
   if (L0) ## add "big M" constraints

From ea54ad0f3e1f2f45c443621c67070f236118b812 Mon Sep 17 00:00:00 2001
From: Zi-Ning Choo <zining01@gmail.com>
Date: Mon, 24 May 2021 15:22:31 -0400
Subject: [PATCH 13/35] initial commit

---
 R/apps.R | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/R/apps.R b/R/apps.R
index 7c078269..05118202 100644
--- a/R/apps.R
+++ b/R/apps.R
@@ -2917,3 +2917,16 @@ fitcn = function (gw, cn.field = "cn", trim = TRUE, weight = NULL, obs.mat = NUL
         return(sol)
     }
 }
+
+#' @name parental.gg
+#' @title parental.gg
+#'
+#' @description
+#'
+#' Converts an input unphased gGraph to a potential parental haplotype graph by randomly assigning ALT edges to a parental haplotype
+#'
+#' @param gg (gGraph) input gGraph
+#' @param hfield (character) field of gGraph edge metadata corresponding with haplotype
+#' @param haplotype.frac (numeric) between 0-1, fraction assigned to one haplotype, default 0.5
+#' @param verbose (logical) default FALSE
+parental.gg = 

From 0dcc42a07fc11ea2b7d092b4484673c623eea642 Mon Sep 17 00:00:00 2001
From: Zi-Ning Choo <zining01@gmail.com>
Date: Mon, 24 May 2021 17:27:19 -0400
Subject: [PATCH 14/35] add working code for parental graph sim, add force flag
 to ALT edges in balance

---
 R/apps.R | 135 ++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 115 insertions(+), 20 deletions(-)

diff --git a/R/apps.R b/R/apps.R
index 05118202..de369114 100644
--- a/R/apps.R
+++ b/R/apps.R
@@ -55,6 +55,7 @@
 #' @param loose.collapse (parameter only relevant if L0 = TRUE) will count all unique (by coordinate) instances of loose ends in the graph as the loose end penalty, rather than each instance alone ... useful for fitting a metagenome graph   (FALSE)
 #' @param phased (logical) indicates whether to run phased/unphased. default = FALSE
 #' @param ism  (logical) additional ISM constraints (FALSE)
+#' @param force.alt (logical) default true only applicable for phasing
 #' @param cnloh (logical) allow CN LOH? only relevant if phasing = TRUE. default FALSE.
 #' @param lp (logical) solve as linear program using abs value (default TRUE)
 #' @param M  (numeric) big M constraint for L0 norm loose end penalty (default 1e3)
@@ -79,6 +80,7 @@ balance = function(gg,
                    M = 1e3,
                    phased = FALSE,
                    ism = FALSE,
+                   force.alt = TRUE,
                    cnloh = FALSE,
                    lp = TRUE,
                    verbose = 1,
@@ -937,26 +939,28 @@ balance = function(gg,
 
         ## force nonzero CN for ALT edges (because these have nonzero CN in original JaBbA output)
         ## can become infeasible ...
-        iconstraints = unique(
-            vars[type == "edge.indicator" & ref.or.alt == "ALT",
-                 .(value = 1, id, og.edge.id,
-                   edge.id = abs(sedge.id),
-                   cid = paste("edge.indicator.sum.lb", og.edge.id))],
-            by = "edge.id"
-        )
+        if (force.alt) {
+            iconstraints = unique(
+                vars[type == "edge.indicator" & ref.or.alt == "ALT",
+                     .(value = 1, id, og.edge.id,
+                       edge.id = abs(sedge.id),
+                       cid = paste("edge.indicator.sum.lb", og.edge.id))],
+                by = "edge.id"
+            )
 
-        constraints = rbind(
-            constraints,
-            iconstraints[, .(value, id, cid)],
-            fill = TRUE)
+            constraints = rbind(
+                constraints,
+                iconstraints[, .(value, id, cid)],
+                fill = TRUE)
 
-        edge.indicator.b = unique(
-            vars[type == "edge.indicator" & ref.or.alt == "ALT",
-                 .(value = 1, sense = "G", cid = paste("edge.indicator.sum.lb", og.edge.id))],
-            by = "cid"
-        )
+            edge.indicator.b = unique(
+                vars[type == "edge.indicator" & ref.or.alt == "ALT",
+                     .(value = 1, sense = "G", cid = paste("edge.indicator.sum.lb", og.edge.id))],
+                by = "cid"
+            )
 
-        b = rbind(b, edge.indicator.b, fill = TRUE)
+            b = rbind(b, edge.indicator.b, fill = TRUE)
+        }
 
         ## REF edge configuration constraint (added by default basically)
         ## only add this if there are no unphased nodes
@@ -2925,8 +2929,99 @@ fitcn = function (gw, cn.field = "cn", trim = TRUE, weight = NULL, obs.mat = NUL
 #'
 #' Converts an input unphased gGraph to a potential parental haplotype graph by randomly assigning ALT edges to a parental haplotype
 #'
-#' @param gg (gGraph) input gGraph
-#' @param hfield (character) field of gGraph edge metadata corresponding with haplotype
+#' @param gg (gGraph) input gGraph. if desired can present haplotype field on edge metadata
 #' @param haplotype.frac (numeric) between 0-1, fraction assigned to one haplotype, default 0.5
+#' @param fix (logical) fix marginal in balance? default TRUE
 #' @param verbose (logical) default FALSE
-parental.gg = 
+#' @param epgap (numeric) default 1e-4
+#' @param tilim (numeric) default 60
+#' @param ... additional inputs to balance (e.g. epgap and whatnot)
+#'
+#' @return phased, balanced gGraph with og.node.id and allele annotation on nodes and og.edge.id annotation on edges
+parental.gg = function(gg,
+                       haplotype.frac = 0.5,
+                       verbose = FALSE,
+                       fix = 1,
+                       epgap = 1e-4,
+                       tilim = 60,
+                       ...) {
+
+    gg = gg$copy
+    
+    if (!("haplotype" %in% colnames(gg$edges$dt))) {
+
+        if (verbose) {
+            message("Assigning haplotypes with fraction ", haplotype.frac)
+        }
+
+        ## get number of ref and alt edges
+        n.alt = gg$edges$dt[type == "ALT", .N]
+        n.h1 = round(haplotype.frac * n.alt)
+        n.h2 = n.alt - n.h1
+        ht = sample(c(rep("h1", n.h1), rep("h2", n.h2)), size = n.alt, replace = FALSE)
+
+        ## mark haplotypes
+        gg$edges[type == "ALT"]$mark(haplotype = ht)
+    } else {
+        if (verbose) {
+            message("using pre-assigned haplotypes")
+        }
+    }
+
+    n.og.nodes = nrow(gg$nodes$dt)
+    new.nodes.dt = rbind(
+        gg$nodes$dt[, .(og.node.id = node.id, haplotype = "h1", seqnames, start, end)],
+        gg$nodes$dt[, .(og.node.id = node.id, haplotype = "h2", seqnames, start, end, cn = 1, weight = 100)],
+        fill = TRUE
+    )
+
+    new.nodes.dt[, node.id := 1:.N]
+    new.nodes.dt[, allele := "unphased"]
+
+    new.edges.dt = rbind(
+        gg$edges$dt[type == "REF", .(og.edge.id = edge.id, n1, n1.side, n2, n2.side, type)],
+        gg$edges$dt[type == "REF", .(og.edge.id = edge.id,
+                                     n1 = n1 + n.og.nodes, n1.side,
+                                     n2 = n2 + n.og.nodes, n2.side, type)],
+        gg$edges$dt[type == "ALT" & haplotype == "h1",
+                    .(og.edge.id = edge.id,
+                      n1, n1.side,
+                      cn, weight = 1000,
+                      n2, n2.side, type)],
+        gg$edges$dt[type == "ALT" & haplotype == "h2",
+                    .(og.edge.id = edge.id,
+                      n1 = n1 + n.og.nodes,
+                      n1.side,
+                      n2 = n2 + n.og.nodes,
+                      n2.side,
+                      cn, weight = 1000,
+                      type)],
+        fill = TRUE
+    )
+
+    new.edges.dt[, connection := "straight"]
+
+    haplotype.gg = gG(nodes = dt2gr(new.nodes.dt), edges = new.edges.dt)
+
+    ## grab marginals...
+    marginal.gr = gg$nodes$gr[, "cn"]
+    marginal.gr$fix = fix
+
+    bal.gg = balance(haplotype.gg, marginal = marginal.gr, phased = TRUE, lp = TRUE, tilim = tilim, epgap = epgap, verbose = verbose, lambda = 10, ism = TRUE, force.alt = FALSE)
+
+    ## fix allele annotations
+    bal.nodes.dt = bal.gg$nodes$dt
+    bal.nodes.dt[, which.major := .SD$haplotype[which.max(.SD$cn)], by = og.node.id]
+    bal.nodes.dt[, allele := ifelse(haplotype == which.major, "major", "minor")]
+    bal.nodes.dt[, col := ifelse(allele == "major", alpha("red", 0.5), alpha("blue", 0.5))]
+
+    bal.gg$nodes$mark(allele = bal.nodes.dt$allele, col = bal.nodes.dt$col)
+    return(bal.gg)
+}
+    
+        
+
+    
+
+    
+        

From 640642800a800f194e87a0e49c829742599ad278 Mon Sep 17 00:00:00 2001
From: Zi-Ning Choo <zining01@gmail.com>
Date: Tue, 25 May 2021 10:18:08 -0400
Subject: [PATCH 15/35] fix balance code for edge marginals

---
 R/apps.R | 123 ++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 99 insertions(+), 24 deletions(-)

diff --git a/R/apps.R b/R/apps.R
index de369114..b568b893 100644
--- a/R/apps.R
+++ b/R/apps.R
@@ -339,13 +339,16 @@ balance = function(gg,
             cartesian = TRUE,
             all.x = TRUE)$dt
         ## match this back with edge id and add this to vars
-        vars[type == "edge", emarginal.id := junction.map[abs(sedge.id), seen.by.emarginal]]
+        ## vars[type == "edge", emarginal.id := junction.map[abs(sedge.id), seen.by.emarginal]]
+        vars[type == "edge", emarginal.id := junction.map[abs(sedge.id), subject.id]]
         ## add weight and target total CN
-        emtch = match(emarginal.id, junction.map$seen.by.emarginal)
-        emarginal = unique(
-            vars[type == "edge",][, type := "emresidual"][, cn := junction.map$cn[emtch]][, weight := junction.map$weight[emtch]][, fix := junction.map$fix[emtch]], ## lol change to merge
-            by = "emarginal.id")
-        vars = rbind(vars, emarginal, emresidual, fill = TRUE)
+        emarginal = merge(unique(
+            vars[type == "edge" & !is.na(emarginal.id),][, type := "emresidual"][, .(emarginal.id, sedge.id, lb = -M, ub = M, gid, type, vtype = "C", from, to)],
+            by = "emarginal.id"),
+            junction.map[, .(subject.id, weight, cn, fix)],
+            by.x = "emarginal.id",
+            by.y = "subject.id")
+        vars = rbind(vars, emarginal, fill = TRUE)
     }
 
     if (lp) {
@@ -380,8 +383,8 @@ balance = function(gg,
         ## add deltas for emresiduals if emarginals are supplied
         if (!is.null(emarginal)) {
             emdeltas = rbind(
-                vars[type == "emresidual", .(emarginal.id, weight, type = "emdelta.plus")][, gid := emarginal.id],
-                vars[type == "emresidual", .(emarginal.id, weight, type = "emdelta.minus")][, gid := emarginal.id]
+                vars[type == "emresidual", .(emarginal.id, weight, vtype, type = "emdelta.plus")][, gid := emarginal.id],
+                vars[type == "emresidual", .(emarginal.id, weight, vtype, type = "emdelta.minus")][, gid := emarginal.id]
             )
             vars = rbind(vars, emdeltas, fill = TRUE)
         }
@@ -2922,8 +2925,8 @@ fitcn = function (gw, cn.field = "cn", trim = TRUE, weight = NULL, obs.mat = NUL
     }
 }
 
-#' @name parental.gg
-#' @title parental.gg
+#' @name parental
+#' @title parental
 #'
 #' @description
 #'
@@ -2933,18 +2936,22 @@ fitcn = function (gw, cn.field = "cn", trim = TRUE, weight = NULL, obs.mat = NUL
 #' @param haplotype.frac (numeric) between 0-1, fraction assigned to one haplotype, default 0.5
 #' @param fix (logical) fix marginal in balance? default TRUE
 #' @param verbose (logical) default FALSE
+#' @param lambda (numeric) default 10
+#' @param eweight (numeric) edge weight default 1e3
 #' @param epgap (numeric) default 1e-4
 #' @param tilim (numeric) default 60
 #' @param ... additional inputs to balance (e.g. epgap and whatnot)
 #'
 #' @return phased, balanced gGraph with og.node.id and allele annotation on nodes and og.edge.id annotation on edges
-parental.gg = function(gg,
-                       haplotype.frac = 0.5,
-                       verbose = FALSE,
-                       fix = 1,
-                       epgap = 1e-4,
-                       tilim = 60,
-                       ...) {
+parental = function(gg,
+                    haplotype.frac = 0.5,
+                    fix = 1,
+                    verbose = FALSE,
+                    lambda = 10,
+                    eweight = 1e3,
+                    epgap = 1e-4,
+                    tilim = 60,
+                    ...) {
 
     gg = gg$copy
     
@@ -2971,7 +2978,7 @@ parental.gg = function(gg,
     n.og.nodes = nrow(gg$nodes$dt)
     new.nodes.dt = rbind(
         gg$nodes$dt[, .(og.node.id = node.id, haplotype = "h1", seqnames, start, end)],
-        gg$nodes$dt[, .(og.node.id = node.id, haplotype = "h2", seqnames, start, end, cn = 1, weight = 100)],
+        gg$nodes$dt[, .(og.node.id = node.id, haplotype = "h2", seqnames, start, end, cn = 1, weight = 1)],
         fill = TRUE
     )
 
@@ -2983,18 +2990,18 @@ parental.gg = function(gg,
         gg$edges$dt[type == "REF", .(og.edge.id = edge.id,
                                      n1 = n1 + n.og.nodes, n1.side,
                                      n2 = n2 + n.og.nodes, n2.side, type)],
-        gg$edges$dt[type == "ALT" & haplotype == "h1",
+        gg$edges$dt[type == "ALT", #& haplotype == "h1",
                     .(og.edge.id = edge.id,
                       n1, n1.side,
-                      cn, weight = 1000,
+                      ## cn, weight = eweight,
                       n2, n2.side, type)],
-        gg$edges$dt[type == "ALT" & haplotype == "h2",
+        gg$edges$dt[type == "ALT", #& haplotype == "h2",
                     .(og.edge.id = edge.id,
                       n1 = n1 + n.og.nodes,
                       n1.side,
                       n2 = n2 + n.og.nodes,
                       n2.side,
-                      cn, weight = 1000,
+                      ## cn, weight = eweight,
                       type)],
         fill = TRUE
     )
@@ -3007,9 +3014,25 @@ parental.gg = function(gg,
     marginal.gr = gg$nodes$gr[, "cn"]
     marginal.gr$fix = fix
 
-    bal.gg = balance(haplotype.gg, marginal = marginal.gr, phased = TRUE, lp = TRUE, tilim = tilim, epgap = epgap, verbose = verbose, lambda = 10, ism = TRUE, force.alt = FALSE)
+    if (verbose) {
+        message("Starting balance")
+    }
+    bal.gg = balance(haplotype.gg,
+                     marginal = marginal.gr,
+                     emarginal = this.complex$junctions[type == "ALT"],
+                     phased = TRUE,
+                     lp = TRUE,
+                     tilim = tilim,
+                     epgap = epgap,
+                     verbose = verbose,
+                     lambda = lambda,
+                     ism = TRUE,
+                     force.alt = FALSE)
 
     ## fix allele annotations
+    if (verbose) {
+        message("Formatting output graph and relabeling alleles")
+    }
     bal.nodes.dt = bal.gg$nodes$dt
     bal.nodes.dt[, which.major := .SD$haplotype[which.max(.SD$cn)], by = og.node.id]
     bal.nodes.dt[, allele := ifelse(haplotype == which.major, "major", "minor")]
@@ -3020,7 +3043,59 @@ parental.gg = function(gg,
 }
     
         
-
+#' @name simulate.hets
+#' @title simulate.hets
+#'
+#' @description
+#'
+#' takes purity/ploidy 
+#' 
+#' @param gg (gGraph) phased balanced gGraph, such as from output of parental
+#' @param bins (GRanges) locations of heterozygous sites with metadata columns allele and count
+#' @param purity (numeric) default 1
+#' @param ploidy (numeric) default 2
+#' @param depth (numeric) (mean number of reads per site) default 50
+#' @param theta (numeric) NB parameter, positive, infinite gives Poisson, default 1. recommend on the same order of magnitude as depth.
+#'
+#' @return GRanges with metadata fields count and allele representing simulated read counts given supplied graph and parameters
+simulate.hets = function(gg,
+                         bins,
+                         purity = 1,
+                         ploidy = 2,
+                         depth = 50,
+                         theta = 50) {
+
+    if (!all(c("cn", "allele", "og.node.id") %in% colnames(gg$nodes$dt))) {
+        stop("gg nodes missing metadata 'cn' and 'allele'")
+    }
+    if (!all(c("count", "allele") %in% names(values(bins)))) {
+        stop("bins missing fields 'count' and 'allele'")
+    }
+    require(MASS)
+
+    unique.bins = unique(bins[, c()])
+    unique.bins$id = 1:length(unique.bins)
+
+    ## create data.table with absolute dosage at all snp sites
+    new.bins = c(unique.bins %$% gg$nodes[allele == "major"]$gr[, c("cn", "og.node.id", "allele")],
+                 unique.bins %$% gg$nodes[allele == "minor"]$gr[, c("cn", "og.node.id", "allele")]) %>%
+        as.data.table
+
+    ## calculate slope and intercept from purity, ploidy, depth
+    denom = 2 * (1 - purity) + purity * ploidy
+    beta = depth * purity / denom
+    gamma = depth * (1 - purity) / denom
+
+    ## inverse rel2abs transformation
+    new.bins[, mu := beta * cn + gamma]
+    new.bins[, count := rnegbin(mu, theta = theta)]
+
+    ## readjust so that major is always bigger than minor
+    new.bins[, which.major := .SD$allele[which.max(.SD$count)], by = id]
+    new.bins[, allele := ifelse(allele == which.major, "major", "minor")]
+    
+    return(dt2gr(new.bins[, .(seqnames, start, end, count, allele)]))
+}
     
 
     

From b72b967f1f3d69c1154bba7e8d1a3cca30824da3 Mon Sep 17 00:00:00 2001
From: Zi-Ning Choo <zining01@gmail.com>
Date: Tue, 25 May 2021 11:16:01 -0400
Subject: [PATCH 16/35] cleanup parental, expose fix.emarginal in simulate.hets

---
 R/apps.R | 58 +++++++++++++++++++++++++++++++-------------------------
 1 file changed, 32 insertions(+), 26 deletions(-)

diff --git a/R/apps.R b/R/apps.R
index b568b893..7dc4d817 100644
--- a/R/apps.R
+++ b/R/apps.R
@@ -2933,8 +2933,9 @@ fitcn = function (gw, cn.field = "cn", trim = TRUE, weight = NULL, obs.mat = NUL
 #' Converts an input unphased gGraph to a potential parental haplotype graph by randomly assigning ALT edges to a parental haplotype
 #'
 #' @param gg (gGraph) input gGraph. if desired can present haplotype field on edge metadata
-#' @param haplotype.frac (numeric) between 0-1, fraction assigned to one haplotype, default 0.5
 #' @param fix (logical) fix marginal in balance? default TRUE
+#' @param fix.emarginal (logical) fix edge marginal? default FALSE
+#' @param force.alt (logical) force incorporation of all junctions? default TRUE
 #' @param verbose (logical) default FALSE
 #' @param lambda (numeric) default 10
 #' @param eweight (numeric) edge weight default 1e3
@@ -2946,6 +2947,8 @@ fitcn = function (gw, cn.field = "cn", trim = TRUE, weight = NULL, obs.mat = NUL
 parental = function(gg,
                     haplotype.frac = 0.5,
                     fix = 1,
+                    fix.emarginal = 0,
+                    force.alt = TRUE,
                     verbose = FALSE,
                     lambda = 10,
                     eweight = 1e3,
@@ -2955,25 +2958,25 @@ parental = function(gg,
 
     gg = gg$copy
     
-    if (!("haplotype" %in% colnames(gg$edges$dt))) {
-
-        if (verbose) {
-            message("Assigning haplotypes with fraction ", haplotype.frac)
-        }
-
-        ## get number of ref and alt edges
-        n.alt = gg$edges$dt[type == "ALT", .N]
-        n.h1 = round(haplotype.frac * n.alt)
-        n.h2 = n.alt - n.h1
-        ht = sample(c(rep("h1", n.h1), rep("h2", n.h2)), size = n.alt, replace = FALSE)
-
-        ## mark haplotypes
-        gg$edges[type == "ALT"]$mark(haplotype = ht)
-    } else {
-        if (verbose) {
-            message("using pre-assigned haplotypes")
-        }
-    }
+    ## if (!("haplotype" %in% colnames(gg$edges$dt))) {
+
+    ##     if (verbose) {
+    ##         message("Assigning haplotypes with fraction ", haplotype.frac)
+    ##     }
+
+    ##     ## get number of ref and alt edges
+    ##     n.alt = gg$edges$dt[type == "ALT", .N]
+    ##     n.h1 = round(haplotype.frac * n.alt)
+    ##     n.h2 = n.alt - n.h1
+    ##     ht = sample(c(rep("h1", n.h1), rep("h2", n.h2)), size = n.alt, replace = FALSE)
+
+    ##     ## mark haplotypes
+    ##     gg$edges[type == "ALT"]$mark(haplotype = ht)
+    ## } else {
+    ##     if (verbose) {
+    ##         message("using pre-assigned haplotypes")
+    ##     }
+    ## }
 
     n.og.nodes = nrow(gg$nodes$dt)
     new.nodes.dt = rbind(
@@ -2990,18 +2993,16 @@ parental = function(gg,
         gg$edges$dt[type == "REF", .(og.edge.id = edge.id,
                                      n1 = n1 + n.og.nodes, n1.side,
                                      n2 = n2 + n.og.nodes, n2.side, type)],
-        gg$edges$dt[type == "ALT", #& haplotype == "h1",
+        gg$edges$dt[type == "ALT",
                     .(og.edge.id = edge.id,
                       n1, n1.side,
-                      ## cn, weight = eweight,
                       n2, n2.side, type)],
-        gg$edges$dt[type == "ALT", #& haplotype == "h2",
+        gg$edges$dt[type == "ALT",
                     .(og.edge.id = edge.id,
                       n1 = n1 + n.og.nodes,
                       n1.side,
                       n2 = n2 + n.og.nodes,
                       n2.side,
-                      ## cn, weight = eweight,
                       type)],
         fill = TRUE
     )
@@ -3014,12 +3015,17 @@ parental = function(gg,
     marginal.gr = gg$nodes$gr[, "cn"]
     marginal.gr$fix = fix
 
+    ## grab edge marginals
+    emarginals = this.complex$junctions[type == "ALT"]
+    emarginals$set(fix = fix.emarginal)
+    emarginals$set(weight = eweight)
+
     if (verbose) {
         message("Starting balance")
     }
     bal.gg = balance(haplotype.gg,
                      marginal = marginal.gr,
-                     emarginal = this.complex$junctions[type == "ALT"],
+                     emarginal = emarginals,
                      phased = TRUE,
                      lp = TRUE,
                      tilim = tilim,
@@ -3027,7 +3033,7 @@ parental = function(gg,
                      verbose = verbose,
                      lambda = lambda,
                      ism = TRUE,
-                     force.alt = FALSE)
+                     force.alt = force.alt)
 
     ## fix allele annotations
     if (verbose) {

From d0ec5a3eb3a5903c1c98aca42093242cee87a557 Mon Sep 17 00:00:00 2001
From: Zi-Ning Choo <zining01@gmail.com>
Date: Tue, 15 Jun 2021 11:36:48 -0400
Subject: [PATCH 17/35] update comments

---
 R/apps.R | 82 +++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 58 insertions(+), 24 deletions(-)

diff --git a/R/apps.R b/R/apps.R
index dd21b315..ed4f700c 100644
--- a/R/apps.R
+++ b/R/apps.R
@@ -55,7 +55,7 @@
 #' @param loose.collapse (parameter only relevant if L0 = TRUE) will count all unique (by coordinate) instances of loose ends in the graph as the loose end penalty, rather than each instance alone ... useful for fitting a metagenome graph   (FALSE)
 #' @param phased (logical) indicates whether to run phased/unphased. default = FALSE
 #' @param ism  (logical) additional ISM constraints (FALSE)
-#' @param force.alt (logical) default true only applicable for phasing
+#' @param force.alt (logical) force incorporation of ALT edges, only applicable for phasing (default TRUE)
 #' @param cnloh (logical) allow CN LOH? only relevant if phasing = TRUE. default FALSE.
 #' @param lp (logical) solve as linear program using abs value (default TRUE)
 #' @param M  (numeric) big M constraint for L0 norm loose end penalty (default 1e3)
@@ -823,30 +823,56 @@ balance = function(gg,
 
         if (phased) {
             ## homologous extremity exclusivity
-            loose.constraints = rbind(
-                vars[type == "loose.in.indicator" & sign(snode.id)==1 & telomeric == FALSE,
-                     .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id))],
-                vars[type == "loose.out.indicator" & sign(snode.id)==1 & telomeric == FALSE,
-                     .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id))]
-            )
-
-            edge.constraints = rbind(
-                vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id)==1,
-                     .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id.n1))],
-                vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id)==1,
-                     .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id.n2))]
-            )
-
-            constraints = rbind(constraints, loose.constraints, edge.constraints, fill = TRUE)
-
-            rhs = unique(rbind(
-                vars[type == "loose.in.indicator" & sign(snode.id)==1 & telomeric == FALSE,
-                     .(value = 1, sense = "L", cid = paste("homol.extremity.exclusivity", hee.id))],
-                vars[type == "loose.out.indicator" & sign(snode.id)==1 & telomeric == FALSE,
-                     .(value = 1, sense = "L", cid = paste("homol.extremity.exclusivity", hee.id))]
-            ), by = "cid")
+            ## loose.constraints = rbind(
+            ##     vars[type == "loose.in.indicator" & sign(snode.id)==1 & telomeric == FALSE,
+            ##          .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id))],
+            ##     vars[type == "loose.out.indicator" & sign(snode.id)==1 & telomeric == FALSE,
+            ##          .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id))]
+            ## )
+
+            ## edge.constraints = rbind(
+            ##     vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id)==1,
+            ##          .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id.n1))],
+            ##     vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id)==1,
+            ##          .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id.n2))]
+            ## )
+
+            ## constraints = rbind(constraints, loose.constraints, edge.constraints, fill = TRUE)
+
+            ## rhs = unique(rbind(
+            ##     vars[type == "loose.in.indicator" & sign(snode.id)==1 & telomeric == FALSE,
+            ##          .(value = 1, sense = "L", cid = paste("homol.extremity.exclusivity", hee.id))],
+            ##     vars[type == "loose.out.indicator" & sign(snode.id)==1 & telomeric == FALSE,
+            ##          .(value = 1, sense = "L", cid = paste("homol.extremity.exclusivity", hee.id))]
+            ## ), by = "cid")
             
-            b = rbind(b, rhs, fill = TRUE)
+            ## b = rbind(b, rhs, fill = TRUE)
+
+            ## if (verbose) {
+            ##     message("Number of homologous extremity exclusivity constraints: ",
+            ##             nrow(rhs))
+            ## }
+
+            ## ## grab node ids associated with ALT edges on the left
+            ## left.og.node.ids = c(gg$edges$dt[n1.side == "left" & type == "ALT", n1],
+            ##                      gg$edges$dt[n2.side == "left" & type == "ALT", n2])
+            ## right.og.node.ids = c(gg$edges$dt[n1.side == "right" & type == "ALT", n1],
+            ##                        gg$edges$dt[n2.side == "right" & type == "ALT", n2])
+
+            ## ## fix loose ends for these nodes to zero
+            ## vars[type == "loose.in.indicator" & (snode.id %in% left.og.node.ids),
+            ##      ":="(lb = 0, ub = 0)]
+            ## vars[type == "loose.out.indicator" & (snode.id %in% right.og.node.ids),
+            ##      ":="(lb = 0, ub = 0)]
+            ## vars[type == "loose.in" & (snode.id %in% left.og.node.ids),
+            ##      ":="(lb = 0, ub = 0)]
+            ## vars[type == "loose.out" & (snode.id %in% right.og.node.ids),
+            ##      ":="(lb = 0, ub = 0)]
+
+            ## if (verbose) {
+            ##     message("Number of homologous loose ends: ",
+            ##             length(left.og.node.ids) + length(right.og.node.ids))
+            ## }
         
             ## reciprocal homologous extremity exclusivity
             ## implement configuration indicators (OR constraint)
@@ -919,8 +945,16 @@ balance = function(gg,
                      .(value = 1, id, cid = paste("rhee", c))]
                 )
 
+            ## filter constraints to only include things with >= 4 entries (e.g. must have an ALT edge)
+            rhomol.constraints[, n.entries := .N, by = cid]
+            rhomol.constraints = rhomol.constraints[n.entries > 3, .(value, id, cid)]
+
             rhs = unique(rhomol.constraints[, .(value = 2, sense = "L", cid)], by = "cid")
 
+            if (verbose) {
+                message("Number of reciprocal homologous constraints: ", nrow(rhs))
+            }
+
             constraints = rbind(constraints, rhomol.constraints, fill = TRUE)
             b = rbind(b, rhs, fill = TRUE)
 

From 8851640725b97d49f14f68aceb1bd43881f8977e Mon Sep 17 00:00:00 2001
From: Zi-Ning Choo <zining01@gmail.com>
Date: Thu, 24 Jun 2021 13:31:57 -0400
Subject: [PATCH 18/35] add basic code for cnloh to phased.binstats

---
 R/apps.R | 506 ++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 329 insertions(+), 177 deletions(-)

diff --git a/R/apps.R b/R/apps.R
index ed4f700c..e143c116 100644
--- a/R/apps.R
+++ b/R/apps.R
@@ -55,6 +55,7 @@
 #' @param loose.collapse (parameter only relevant if L0 = TRUE) will count all unique (by coordinate) instances of loose ends in the graph as the loose end penalty, rather than each instance alone ... useful for fitting a metagenome graph   (FALSE)
 #' @param phased (logical) indicates whether to run phased/unphased. default = FALSE
 #' @param ism  (logical) additional ISM constraints (FALSE)
+#' @param force.major (logical) force major allele CN to be >= minor allele CN (default FALSE)
 #' @param force.alt (logical) force incorporation of ALT edges, only applicable for phasing (default TRUE)
 #' @param cnloh (logical) allow CN LOH? only relevant if phasing = TRUE. default FALSE.
 #' @param lp (logical) solve as linear program using abs value (default TRUE)
@@ -82,6 +83,7 @@ balance = function(gg,
                    M = 1e3,
                    phased = FALSE,
                    ism = FALSE,
+                   force.major = FALSE,
                    force.alt = TRUE,
                    cnloh = FALSE,
                    lp = TRUE,
@@ -353,7 +355,7 @@ balance = function(gg,
         ## vars[type == "edge", emarginal.id := junction.map[abs(sedge.id), seen.by.emarginal]]
         vars[type == "edge", emarginal.id := junction.map[abs(sedge.id), subject.id]]
         ## add weight and target total CN
-        emarginal = merge(unique(
+        emarginal = merge.data.table(unique(
             vars[type == "edge" & !is.na(emarginal.id),][, type := "emresidual"][, .(emarginal.id, sedge.id, lb = -M, ub = M, gid, type, vtype = "C", from, to)],
             by = "emarginal.id"),
             junction.map[, .(subject.id, weight, cn, fix)],
@@ -516,11 +518,11 @@ balance = function(gg,
             vars[type == "loose.out.indicator" & snode.id > 0, ":="(c = cross.sedges$sedge.id[match(n1.full, cross.sedges$n1.full)])]
 
             ## merge this info into vars
-            vars = merge(vars,
-                         alt.sedges[, .(sedge.id, s1, s2, s3, s4, c1, c2, c3, c4)],
-                         by = "sedge.id",
-                         all.x = TRUE,
-                         all.y = FALSE)
+            vars = merge.data.table(vars,
+                                    alt.sedges[, .(sedge.id, s1, s2, s3, s4, c1, c2, c3, c4)],
+                                    by = "sedge.id",
+                                    all.x = TRUE,
+                                    all.y = FALSE)
 
         }
     }
@@ -781,98 +783,100 @@ balance = function(gg,
         }
 
         ## fix loose ends at zero if there's a junction there (only valid if not phasing)
-        if (!phased) {
+        #' zchoo Tuesday, Jun 15, 2021 11:53:15 AM
+        #' this constraint appears to be valid even if running phasing.
+        ## if (!phased) {
             ## extremity exclusivity (relevant for ALL graphs)
-            loose.constraints = rbind(
-                vars[type == "loose.in.indicator" & sign(snode.id) == 1 & telomeric == FALSE,
-                     .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))],
-                vars[type == "loose.out.indicator" & sign(snode.id) == 1 & telomeric == FALSE,
-                     .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))]
-            )
+        loose.constraints = rbind(
+            vars[type == "loose.in.indicator" & sign(snode.id) == 1 & telomeric == FALSE,
+                 .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))],
+            vars[type == "loose.out.indicator" & sign(snode.id) == 1 & telomeric == FALSE,
+                 .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))]
+        )
 
-            edge.constraints = rbind(
-                vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id) == 1,
-                     .(value = 1, id, cid = paste("extremity.exclusivity", ee.id.n1))],
-                vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id) == 1,
-                     .(value = 1, id, cid = paste("extremity.exclusivity", ee.id.n2))]
-            )
+        edge.constraints = rbind(
+            vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id) == 1,
+                 .(value = 1, id, cid = paste("extremity.exclusivity", ee.id.n1))],
+            vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id) == 1,
+                 .(value = 1, id, cid = paste("extremity.exclusivity", ee.id.n2))]
+        )
 
-            constraints = rbind(constraints, loose.constraints, edge.constraints, fill = TRUE)
+        constraints = rbind(constraints, loose.constraints, edge.constraints, fill = TRUE)
 
-            loose.b = unique(loose.constraints[, .(cid, value = 1, sense = "L")], by = "cid")
-            edge.b = unique(edge.constraints[, .(cid, value = 1, sense = "L")], by = "cid")
+        loose.b = unique(loose.constraints[, .(cid, value = 1, sense = "L")], by = "cid")
+        edge.b = unique(edge.constraints[, .(cid, value = 1, sense = "L")], by = "cid")
 
-            b = rbind(b, edge.b, loose.b, fill = TRUE)
+        b = rbind(b, edge.b, loose.b, fill = TRUE)
 
-            edge.ee.ids = unique(c(vars[type == "edge.indicator", ee.id.n1], vars[type == "edge.indicator", ee.id.n2]))
-            edge.ee.ids = edge.ee.ids[!is.na(edge.ee.ids)]
+        edge.ee.ids = unique(c(vars[type == "edge.indicator", ee.id.n1], vars[type == "edge.indicator", ee.id.n2]))
+        edge.ee.ids = edge.ee.ids[!is.na(edge.ee.ids)]
 
-            ## TODO: add these as ub and lb instead of equality constraints
-            loose.zeros = rbind(
-                vars[type == "loose.in.indicator" & sign(snode.id) == 1 & ee.id %in% edge.ee.ids & telomeric == FALSE,
-                     .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))],
-                vars[type == "loose.out.indicator" & sign(snode.id) == 1 & ee.id %in% edge.ee.ids & telomeric == FALSE,
-                     .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))]
-            )
+        loose.zeros = rbind(
+            vars[type == "loose.in.indicator" & sign(snode.id) == 1 & ee.id %in% edge.ee.ids & telomeric == FALSE,
+                 .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))],
+            vars[type == "loose.out.indicator" & sign(snode.id) == 1 & ee.id %in% edge.ee.ids & telomeric == FALSE,
+                 .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))]
+        )
 
-            loose.zeros.rhs = unique(loose.zeros[, .(cid, value = 0, sense = "E")], by = "cid")
+        loose.zeros.rhs = unique(loose.zeros[, .(cid, value = 0, sense = "E")], by = "cid")
 
-            constraints = rbind(constraints, loose.zeros, fill = TRUE)
-            b = rbind(b, loose.zeros.rhs, fill = TRUE)
-        }
+        constraints = rbind(constraints, loose.zeros, fill = TRUE)
+        b = rbind(b, loose.zeros.rhs, fill = TRUE)
+        ## }
 
         if (phased) {
             ## homologous extremity exclusivity
-            ## loose.constraints = rbind(
-            ##     vars[type == "loose.in.indicator" & sign(snode.id)==1 & telomeric == FALSE,
-            ##          .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id))],
-            ##     vars[type == "loose.out.indicator" & sign(snode.id)==1 & telomeric == FALSE,
-            ##          .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id))]
-            ## )
-
-            ## edge.constraints = rbind(
-            ##     vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id)==1,
-            ##          .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id.n1))],
-            ##     vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id)==1,
-            ##          .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id.n2))]
-            ## )
-
-            ## constraints = rbind(constraints, loose.constraints, edge.constraints, fill = TRUE)
-
-            ## rhs = unique(rbind(
-            ##     vars[type == "loose.in.indicator" & sign(snode.id)==1 & telomeric == FALSE,
-            ##          .(value = 1, sense = "L", cid = paste("homol.extremity.exclusivity", hee.id))],
-            ##     vars[type == "loose.out.indicator" & sign(snode.id)==1 & telomeric == FALSE,
-            ##          .(value = 1, sense = "L", cid = paste("homol.extremity.exclusivity", hee.id))]
-            ## ), by = "cid")
+            ## this is actually redundant with previous constraints
+            loose.constraints = rbind(
+                vars[type == "loose.in.indicator" & sign(snode.id)==1 & telomeric == FALSE,
+                     .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id))],
+                vars[type == "loose.out.indicator" & sign(snode.id)==1 & telomeric == FALSE,
+                     .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id))]
+            )
+
+            edge.constraints = rbind(
+                vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id)==1,
+                     .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id.n1))],
+                vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id)==1,
+                     .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id.n2))]
+            )
+
+            constraints = rbind(constraints, loose.constraints, edge.constraints, fill = TRUE)
+
+            rhs = unique(rbind(
+                vars[type == "loose.in.indicator" & sign(snode.id)==1 & telomeric == FALSE,
+                     .(value = 1, sense = "L", cid = paste("homol.extremity.exclusivity", hee.id))],
+                vars[type == "loose.out.indicator" & sign(snode.id)==1 & telomeric == FALSE,
+                     .(value = 1, sense = "L", cid = paste("homol.extremity.exclusivity", hee.id))]
+            ), by = "cid")
             
-            ## b = rbind(b, rhs, fill = TRUE)
-
-            ## if (verbose) {
-            ##     message("Number of homologous extremity exclusivity constraints: ",
-            ##             nrow(rhs))
-            ## }
-
-            ## ## grab node ids associated with ALT edges on the left
-            ## left.og.node.ids = c(gg$edges$dt[n1.side == "left" & type == "ALT", n1],
-            ##                      gg$edges$dt[n2.side == "left" & type == "ALT", n2])
-            ## right.og.node.ids = c(gg$edges$dt[n1.side == "right" & type == "ALT", n1],
-            ##                        gg$edges$dt[n2.side == "right" & type == "ALT", n2])
-
-            ## ## fix loose ends for these nodes to zero
-            ## vars[type == "loose.in.indicator" & (snode.id %in% left.og.node.ids),
-            ##      ":="(lb = 0, ub = 0)]
-            ## vars[type == "loose.out.indicator" & (snode.id %in% right.og.node.ids),
-            ##      ":="(lb = 0, ub = 0)]
-            ## vars[type == "loose.in" & (snode.id %in% left.og.node.ids),
-            ##      ":="(lb = 0, ub = 0)]
-            ## vars[type == "loose.out" & (snode.id %in% right.og.node.ids),
-            ##      ":="(lb = 0, ub = 0)]
-
-            ## if (verbose) {
-            ##     message("Number of homologous loose ends: ",
-            ##             length(left.og.node.ids) + length(right.og.node.ids))
-            ## }
+            b = rbind(b, rhs, fill = TRUE)
+
+            if (verbose) {
+                message("Number of homologous extremity exclusivity constraints: ",
+                        nrow(rhs))
+            }
+
+            ## grab node ids associated with ALT edges on the left
+            left.og.node.ids = c(gg$edges$dt[n1.side == "left" & type == "ALT", n1],
+                                 gg$edges$dt[n2.side == "left" & type == "ALT", n2])
+            right.og.node.ids = c(gg$edges$dt[n1.side == "right" & type == "ALT", n1],
+                                   gg$edges$dt[n2.side == "right" & type == "ALT", n2])
+
+            ## fix loose ends for these nodes to zero
+            vars[type == "loose.in.indicator" & (snode.id %in% left.og.node.ids),
+                 ":="(lb = 0, ub = 0)]
+            vars[type == "loose.out.indicator" & (snode.id %in% right.og.node.ids),
+                 ":="(lb = 0, ub = 0)]
+            vars[type == "loose.in" & (snode.id %in% left.og.node.ids),
+                 ":="(lb = 0, ub = 0)]
+            vars[type == "loose.out" & (snode.id %in% right.og.node.ids),
+                 ":="(lb = 0, ub = 0)]
+
+            if (verbose) {
+                message("Number of homologous loose ends: ",
+                        length(left.og.node.ids) + length(right.og.node.ids))
+            }
         
             ## reciprocal homologous extremity exclusivity
             ## implement configuration indicators (OR constraint)
@@ -985,8 +989,28 @@ balance = function(gg,
 
         b = rbind(b, edge.indicator.b, fill = TRUE)
 
+        ## force major allele to have higher CN than minor allele
+        ## may not work for phased blocks
+        if (force.major) {
+
+            iconstraints = rbind(
+                vars[type == "node" & allele == "major" & snode.id > 0,
+                     .(value = 1, id, cid = paste("force.major", og.node.id))],
+                vars[type == "node" & allele == "minor" & snode.id > 0,
+                     .(value = -1, id, cid = paste("force.major", og.node.id))])
+
+            rhs = unique(vars[type == "node" & snode.id > 0 & allele == "major",
+                              .(value = 0, sense = "G", cid = paste("force.major", og.node.id))],
+                         by = "cid")
+
+            constraints = rbind(constraints, iconstraints, fill = TRUE)
+            b = rbind(b, rhs, fill = TRUE)
+            
+        }
+                
+
         ## force nonzero CN for ALT edges (because these have nonzero CN in original JaBbA output)
-        ## can become infeasible ...
+        ## can become infeasible if original graph is not compatible with ISM
         if (force.alt) {
             iconstraints = unique(
                 vars[type == "edge.indicator" & ref.or.alt == "ALT",
@@ -1015,69 +1039,82 @@ balance = function(gg,
         if (cnloh) {
             
             ## if allow CNLOH, the sum of edge indicators corresponding with og edge id is LEQ 2
-            iconstraints = unique(
-                vars[type == "edge.indicator" & ref.or.alt == "REF",
-                     .(value = 1, id, edge.id = abs(sedge.id),
-                       cid = paste("ref.configuration.constraint.cnloh", og.edge.id))],
-                by = "edge.id"
-            )
+            ## this is only allowed in constant CN regions and if breakpoint is not shared with any ALT edges
+
+            if (!is.null(gg$edges$dt$cnloh)) {
+                cnloh.og.edges = gg$edges$dt[cnloh == TRUE, og.edge.id] %>% unique
+                if (verbose) {
+                    message("Number of allowed CNLOH sites: ", length(cnloh.og.edges))
+                }
+            } else {
+                warning("CNLOH not specified on edges. Allowing everywhere!")
+                cnloh.og.edges = gg$edges$dt$og.edge.id %>% unique
+            }
+        } else {
 
-            rhs = unique(
-                vars[type == "edge.indicator" & ref.or.alt == "REF",
-                     .(value = 2, sense = "L",
-                       cid = paste("ref.configuration.constraint.cnloh", og.edge.id))],
-                by = "cid"
-            )
+            cnloh.og.edges = c()
+            
+        }
 
-            constraints = rbind(constraints,
-                                iconstraints[, .(value, id, cid)],
-                                fill = TRUE)
-            b = rbind(b, rhs, fill = TRUE)
+        ## add CNLOH constraints for applicable edges
+        
+        iconstraints = unique(
+            vars[type == "edge.indicator" & ref.or.alt == "REF" & og.edge.id %in% cnloh.og.edges,
+                 .(value = 1, id, edge.id = abs(sedge.id),
+                   cid = paste("ref.configuration.constraint.cnloh", og.edge.id))],
+            by = "edge.id"
+        )
 
-        } else {
-            
-            iconstraints.from = unique(
-                vars[type == "edge.indicator" & ref.or.alt == "REF",
-                     .(value = 1, id,
-                       edge.id = abs(sedge.id),
-                       snode.id = from, ## this is actually a misleading name because from is the row in gg$dt
-                       cid = paste("ref.configuration.constraint.from", from))],
-                by = "edge.id"
-            )
+        rhs = unique(
+            vars[type == "edge.indicator" & ref.or.alt == "REF" & og.edge.id %in% cnloh.og.edges,
+                 .(value = 2, sense = "L",
+                   cid = paste("ref.configuration.constraint.cnloh", og.edge.id))],
+            by = "cid"
+        )
 
-            iconstraints.to = unique(
-                vars[type == "edge.indicator" & ref.or.alt == "REF",
-                     .(value = 1, id,
-                       edge.id = abs(sedge.id),
-                       snode.id = to,
-                       cid = paste("ref.configuration.constraint.to", to))],
-                by = "edge.id"
-            )
+        constraints = rbind(constraints,
+                            iconstraints[, .(value, id, cid)],
+                            fill = TRUE)
+        b = rbind(b, rhs, fill = TRUE)
 
-            iconstraints = rbind(iconstraints.from, iconstraints.to)
+        ## add ISM constraints for other edges
+        
+        iconstraints.from = unique(
+            vars[type == "edge.indicator" & ref.or.alt == "REF" & !(og.edge.id %in% cnloh.og.edges),
+                 .(value = 1, id,
+                   edge.id = abs(sedge.id),
+                   snode.id = from, ## this is actually a misleading name because from is the row in gg$dt
+                   cid = paste("ref.configuration.constraint.from", from))],
+            by = "edge.id"
+        )
 
-            ## sum to at most 1 if phased, unconstrained if unphased
-            iconstraints[, ":="(allele = gg$dt$allele[iconstraints$snode.id])]
-            
-            edge.indicator.b = unique(iconstraints[allele %in% c("major", "minor"),
-                                                   .(value = 1, sense = "L", cid)],
-                                      by = "cid")
-            ## rbind(
-            ##             unique(iconstraints[allele %in% c("major", "minor"),
-            ##                                 .(value = 1, sense = "L", cid)], by = "cid"),
-            ##             unique(iconstraints[!(allele %in% c("major", "minor")),
-            ##                                 .(value = 2, sense = "L", cid)], by = "cid")
-            ##         )
+        iconstraints.to = unique(
+            vars[type == "edge.indicator" & ref.or.alt == "REF" & !(og.edge.id %in% cnloh.og.edges),
+                 .(value = 1, id,
+                   edge.id = abs(sedge.id),
+                   snode.id = to,
+                   cid = paste("ref.configuration.constraint.to", to))],
+            by = "edge.id"
+        )
+
+        iconstraints = rbind(iconstraints.from, iconstraints.to)
+
+        ## sum to at most 1 if phased, unconstrained if unphased
+        iconstraints[, ":="(allele = gg$dt$allele[iconstraints$snode.id])]
+        
+        edge.indicator.b = unique(iconstraints[allele %in% c("major", "minor"),
+                                               .(value = 1, sense = "L", cid)],
+                                  by = "cid")
+
+        constraints = rbind(
+            constraints,
+            iconstraints[allele %in% c("major", "minor"),
+                         .(value, id, cid)],
+            fill = TRUE)
+        
+        ## add to b
+        b = rbind(b, edge.indicator.b, fill = TRUE)
 
-            constraints = rbind(
-                constraints,
-                iconstraints[allele %in% c("major", "minor"),
-                             .(value, id, cid)],
-                fill = TRUE)
-            
-            ## add to b
-            b = rbind(b, edge.indicator.b, fill = TRUE)
-        }
     }
   
   if (L0) ## add "big M" constraints
@@ -1229,7 +1266,7 @@ balance = function(gg,
             constraints,
             rbind(
                 ## match up vars and marginal by snode.id and populate coefficients
-                merge(vars[type == 'node', !"rid"], ov, by = 'snode.id')[, .(value = 1, id , cid = paste('mresidual', rid))],
+                merge.data.table(vars[type == 'node', !"rid"], ov, by = 'snode.id')[, .(value = 1, id , cid = paste('mresidual', rid))],
                 ## the residual is the difference between the sum and marginal cn
                 vars[type == 'mresidual' & rid %in% ov$rid, .(value = -1, id, cid = paste('mresidual', rid))],        
                 fill = TRUE),
@@ -1363,9 +1400,9 @@ balance = function(gg,
   vars$x = sol$x
 
   ## for debugging
-  ppc = function(x) (x %>% merge(vars, by = 'id') %>% merge(b, by = 'cid.char'))[, paste(paste(round(value.x, 1), '*', paste(type, gid, sep=  '_'), '(', signif(x, 2), ')', collapse = ' + '), ifelse(sense[1] == 'E', '=', ifelse(sense[1] == 'G', '>=', '<=')), round(value.y[1],2)), by = cid.char]
+  ppc = function(x) (x %>% merge.data.table(vars, by = 'id') %>% merge.data.table(b, by = 'cid.char'))[, paste(paste(round(value.x, 1), '*', paste(type, gid, sep=  '_'), '(', signif(x, 2), ')', collapse = ' + '), ifelse(sense[1] == 'E', '=', ifelse(sense[1] == 'G', '>=', '<=')), round(value.y[1],2)), by = cid.char]
   
-  ppv = function(x) {tmp = x %>% merge(constraints, by = 'id'); constraints[cid %in% tmp$cid, ] %>% ppc}
+  ppv = function(x) {tmp = x %>% merge.data.table(constraints, by = 'id'); constraints[cid %in% tmp$cid, ] %>% ppc}
 
   .check = function(x) data.table(obs = sign(as.numeric(round(Amat %*% x - bvec))),
                                   sense)
@@ -2327,7 +2364,8 @@ phased.postprocess = function(gg, phase.blocks = NULL, mc.cores = 8, verbose = 1
                                           og.node.id, marginal.cn, allele,
                                           var, nbins, weight, index, col,
                                           cn.old, cn, fix, ywid,
-                                          old.node.id = node.id)]) %>% gr.sort
+                                          old.node.id = node.id)],
+                         seqinfo = seqinfo(gg$nodes$gr)) %>% gr.sort
 
 
     ## reset edge endpoints
@@ -2359,7 +2397,7 @@ phased.postprocess = function(gg, phase.blocks = NULL, mc.cores = 8, verbose = 1
     ##                            M = 1e3, ism = FALSE, verbose = verbose, epgap = 1e-4,
     ##                            marginal = NULL)
 
-    new.nodes.gr = inferLoose(new.nodes.gr, new.edges.dt)
+    new.nodes.gr = gGnome:::inferLoose(new.nodes.gr, new.edges.dt)
 
     postprocessed.gg = gG(nodes = new.nodes.gr, edges = new.edges.dt)
     postprocessed.gg$set(y.field = "cn")
@@ -2379,8 +2417,10 @@ phased.postprocess = function(gg, phase.blocks = NULL, mc.cores = 8, verbose = 1
 #' @param purity (numeric)
 #' @param ploidy (numeric)
 #' @param count.field (str) field containing allele read counts (default count)
-#' @param allele.field (str) field for containing allele label for read counts (default allele)
+#' @param allele.field (str) field for containing major/minor allele label (default allele)
+#' @param haplotype.field (str) field containing h1/h2 label (default haplotype)
 #' @param phase.blocks (GRanges) GRanges containing phase blocks (e.g. from HAPCUT). default NULL.
+#' @param breaks (GRanges) extra breakpoints to introduce. default NULL.
 #' @param edge.phase.dt (data.table) with columns n1.major, n2.major, n1.minor, n2.minor and edge.id providing major/minor allele counts
 #' @param vbase.count.thres (int) number of variant base counts required to phase edges (default 5)
 #' @param vbase.prop.thres (float) proportion of allele excess required to phase edges (default 0.9)
@@ -2391,8 +2431,11 @@ phased.postprocess = function(gg, phase.blocks = NULL, mc.cores = 8, verbose = 1
 #' @return gGraph whose nodes are annotated with $cn.major, $cn.minor, $haplotype, and $weight fields
 #' @export
 phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL,
-                           count.field = "count", allele.field = "allele",
+                           count.field = "count",
+                           allele.field = "allele",
+                           haplotype.field = "haplotype",
                            phase.blocks = NULL,
+                           breaks = NULL,
                            edge.phase.dt = NULL,
                            vbase.count.thres = 5, vbase.prop.thres = 0.9,
                            min.bins = 3, min.var = 1e-3,
@@ -2427,12 +2470,17 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL,
             stop("phase.blocks must be GRanges")
         }
     }
+    if (!is.null(breaks)) {
+        if (!inherits(breaks, 'GRanges')) {
+            stop("breaks must be GRanges")
+        }
+    }
     if (!is.null(edge.phase.dt)) {
         if (!is.data.table(edge.phase.dt)) {
             warning("edge.phase.dt must be data.table. ignoring this input.")
             edge.phase.dt = NULL
         }
-        if (!all(c("edge.id", "n1.major", "n2.major", "n1.minor", "n2.minor") %in% colnames(edge.phase.dt))) {
+        if (!all(c("edge.id", "n1.h1", "n2.h1", "n1.h2", "n2.h2") %in% colnames(edge.phase.dt))) {
             warning("edge.phase.dt does not have the required columns")
             edge.phase.dt = NULL
         }
@@ -2456,6 +2504,45 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL,
         return(cn)
     }
 
+    if (verbose) {
+        message("Checking for extra breakpoints")
+
+        if (!is.null(breaks) && length(breaks) > 0) {
+            ## get just the starting point and remove strand
+            br = gr.stripstrand(gr.start(breaks))
+
+            if (verbose) {
+                message("Number of new breaks: ", length(br))
+                message("Creating new gGraph with incorporated breaks...")
+            }
+
+            ## save old node IDs
+            old.nodes = gg$nodes$gr[, "node.id"]
+            old.nodes$unsplit.id = old.nodes$node.id
+
+            ## create new gGraph with additional breakpoints
+            new.nodes = gr.breaks(bps = br, query = gg$nodes$gr)
+            gg = gG(breaks = new.nodes, junctions = gg$junctions[type == "ALT"])
+
+            ## create node map to find which nodes were split up
+            node.map = as.data.table(gg$nodes$gr[, "node.id"] %$% old.nodes[, "unsplit.id"])
+
+            ## identify which REF edges are internal to an OG node
+            edge.map = gg$edges$dt
+            edge.map[, n1.unsplit := node.map$unsplit.id[match(n1, unsplit.nodes$node.id)]]
+            edge.map[, n2.unsplit := node.map$unsplit.id[match(n2, unsplit.nodes$node.id)]]
+
+            internal.edges = edge.map[type == "REF" & n1.unsplit == n2.unsplit, edge.id]
+
+            ## mark CNLOH in gg
+            gg$edges[internal.edges]$mark(cnloh = TRUE)
+
+            if (verbose) {
+                message("Number of internal edges marked in parent graph: ", length(internal.edges))
+            }
+        }
+    }
+
     if (verbose) {
         message("Preparing phased gGraph nodes")
     }
@@ -2475,13 +2562,57 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL,
             message("Adding phase block information")
         }
         pblocks = phase.blocks[, c()]
-        pblocks$pblock = 1:length(pblocks) %>% as.character
-        og.nodes = og.nodes %$% pblocks[, "pblock"]
+
+        ## find index of the right and left phase blocks
+        left.pblock = gr.match(gr.start(og.nodes), pblocks)
+        right.pblock = gr.match(gr.end(og.nodes), pblocks)
+
+        ## annotate GRanges with L and R phase block indices
+        og.nodes$left.pblock = left.pblock
+        og.nodes$right.pblock = right.pblock
     }
 
-    phased.gg.nodes = c(og.nodes, og.nodes)
-    phased.gg.nodes$allele = c(rep("major", length(og.nodes)),
-                               rep("minor", length(og.nodes)))
+    if (haplotype.field %in% names(values(bins))) {
+        if (verbose) {
+            message("Adding haplotype information")
+        }
+
+        ## check that the entries in this field are valid
+        if (!all(values(bins)[[haplotype.field]] %in% c("h1", "h2"))) {
+            stop("values in haplotype.field must be either h1 or h2")
+        }
+
+        ## identify phase block of the left side of the node
+        pblock.map = gr.match(pblocks, bins[values(bins)[[allele.field]] == "major"])
+        pblock.major.haplotype = values(bins)[[haplotype.field]][pblock.map]
+        node.left.major.haplotype = pblock.major.haplotype[og.nodes$left.pblock]
+        node.right.major.haplotype = pblock.major.haplotype[og.nodes$right.pblock]
+
+        major.nodes = copy(og.nodes)
+        major.nodes$left.haplotype = node.left.major.haplotype
+        major.nodes$right.haplotype = node.right.major.haplotype
+        minor.nodes = copy(og.nodes)
+        minor.nodes$left.haplotype = ifelse(node.left.major.haplotype == "h1",
+                                            "h2",
+                                            "h1")
+        minor.nodes$right.haplotype = ifelse(node.right.major.haplotype == "h1",
+                                             "h2",
+                                             "h1")
+        
+        ## major.map = gr.match(og.nodes, bins[values(bins)[[allele.field]] == "major"])
+        ## major.nodes = copy(og.nodes)
+        ## major.nodes$haplotype = values(bins)[[haplotype.field]][major.map]
+        ## minor.nodes = copy(og.nodes)
+        ## minor.nodes$haplotype = ifelse(major.nodes$haplotype == "h1", "h2", "h1")
+
+        major.nodes$allele = "major"
+        minor.nodes$allele = "minor"
+        phased.gg.nodes = c(major.nodes, minor.nodes)
+    } else {
+        phased.gg.nodes = c(og.nodes, og.nodes)
+        phased.gg.nodes$allele = c(rep("major", length(og.nodes)),
+                                   rep("minor", length(og.nodes)))
+    }
 
     if (verbose) {
         message("Computing allele CN")
@@ -2549,50 +2680,71 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL,
 
     ## add phase block information to edges (for linked reads)
     if (!is.null(phase.blocks)) {
-        phased.gg.edges[, ":="(n1.pblock = phased.gg.nodes$pblock[n1],
-                           n2.pblock = phased.gg.nodes$pblock[n2])]
 
+        ## add haplotype information of edge endpoints
+        phased.gg.edges[, ":="(n1.haplotype = ifelse(n1.side == "left",
+                                                     phased.gg.nodes$left.haplotype[n1],
+                                                     phased.gg.nodes$right.haplotype[n1]),
+                               n2.haplotype = ifelse(n2.side == "left",
+                                                     phased.gg.nodes$left.haplotype[n2],
+                                                     phased.gg.nodes$right.haplotype[n2]))]
+
+        ## add phase block infromation of edge end points
+        phased.gg.edges[, ":="(n1.pblock = ifelse(n1.side == "left",
+                                                  phased.gg.nodes$left.pblock[n1],
+                                                  phased.gg.nodes$right.pblock[n1]),
+                               n2.pblock = ifelse(n2.side == "left",
+                                                  phased.gg.nodes$left.pblock[n2],
+                                                  phased.gg.nodes$right.pblock[n2]))]
+        
         ## fix cross REF edges to zero within phase blocks
-        phased.gg.edges[(n1.pblock == n2.pblock) & type == "REF" & connection == "cross",
-                    ":="(cn = 0, fix = 1)]
+        phased.gg.edges[(n1.pblock == n2.pblock) & type == "REF" & (n1.haplotype != n2.haplotype),
+                    ":="(ub = 0, lb = 0)]
+        
         if (verbose) {
             message("Number of REF cross edges within phased blocks: ",
-                    nrow(phased.gg.edges[(n1.pblock == n2.pblock) &
-                                         type == "REF" &
-                                         connection == "cross"]))
+                    phased.gg.edges[(n1.pblock == n2.pblock) &
+                                    type == "REF" &
+                                    (n1.haplotype != n2.haplotype), .N])
         }
     }
 
+    ## mark CNLOH edges
+    if (!is.null(gg$edges$dt$cnloh)) {
+        og.cnloh.edges = gg$edges$dt[cnloh == TRUE, edge.id]
+        phased.gg.edges[og.edge.id %in% og.cnloh.edges, cnloh := TRUE]
+    }
+
     ## identify phased edges (for linked reads)
     if (!is.null(edge.phase.dt)) {
 
         ## compute totals
-        ephase = edge.phase.dt[, .(edge.id, n1.major, n2.major, n1.minor, n2.minor,
-                                   n1.total = n1.major + n1.minor,
-                                   n2.total = n2.major + n2.minor)][
+        ephase = edge.phase.dt[, .(edge.id, n1.h1, n2.h1, n1.h2, n2.h2,
+                                   n1.total = n1.h1 + n1.h2,
+                                   n2.total = n2.h1 + n2.h2)][
                                        (n1.total > vbase.count.thres) | (n2.total > vbase.count.thres)]
 
         ## count fraction of reads corresponding with each allele
-        ephase[, n1.major.frac := n1.major / n1.total]
-        ephase[, n2.major.frac := n2.major / n2.total]
-        ephase[, n1.minor.frac := n1.minor / n1.total]
-        ephase[, n2.minor.frac := n2.minor / n1.total]
+        ephase[, n1.h1.frac := n1.h1 / n1.total]
+        ephase[, n2.h1.frac := n2.h1 / n2.total]
+        ephase[, n1.h2.frac := n1.h2 / n1.total]
+        ephase[, n2.h2.frac := n2.h2 / n1.total]
 
         ## set phase if passing proportion threshold (vbase.prop.thres)
-        ephase[n1.major.frac > vbase.prop.thres, n1.phase := "major"]
-        ephase[n1.minor.frac > vbase.prop.thres, n1.phase := "minor"]
-        ephase[n2.major.frac > vbase.prop.thres, n2.phase := "major"]
-        ephase[n2.minor.frac > vbase.prop.thres, n2.phase := "minor"]
+        ephase[n1.h1.frac > vbase.prop.thres, n1.phase := "h1"]
+        ephase[n1.h2.frac > vbase.prop.thres, n1.phase := "h2"]
+        ephase[n2.h1.frac > vbase.prop.thres, n2.phase := "h1"]
+        ephase[n2.h2.frac > vbase.prop.thres, n2.phase := "h2"]
 
         ## add phase information to edges data frame
         phased.gg.edges[, n1.phase := ephase$n1.phase[match(og.edge.id, ephase$edge.id)]]
         phased.gg.edges[, n2.phase := ephase$n2.phase[match(og.edge.id, ephase$edge.id)]]
 
         ## fix things to zero
-        phased.gg.edges[n1.phase == "major" & n1.allele == "minor", ":="(fix = 1, cn = 0)]
-        phased.gg.edges[n2.phase == "major" & n2.allele == "minor", ":="(fix = 1, cn = 0)]
-        phased.gg.edges[n1.phase == "minor" & n1.allele == "major", ":="(fix = 1, cn = 0)]
-        phased.gg.edges[n2.phase == "minor" & n2.allele == "major", ":="(fix = 1, cn = 0)]
+        phased.gg.edges[n1.phase == "h1" & n1.haplotype == "h2", ":="(ub = 0, lb = 0)]
+        phased.gg.edges[n2.phase == "h1" & n2.haplotype == "h2", ":="(ub = 0, lb = 0)]
+        phased.gg.edges[n1.phase == "h2" & n1.haplotype == "h1", ":="(ub = 0, lb = 0)]
+        phased.gg.edges[n2.phase == "h2" & n2.haplotype == "h1", ":="(ub = 0, lb = 0)]
 
         if (verbose) {
             message("Number of ALT edges with n1 side fixed: ", sum(!is.na(phased.gg.edges$n1.phase)))

From 8fa33e516662e20bf1c511aa1b9d4bc2c2fc206f Mon Sep 17 00:00:00 2001
From: Zi-Ning Choo <zining01@gmail.com>
Date: Sat, 26 Jun 2021 10:26:15 -0400
Subject: [PATCH 19/35] fix bug in phased.binstats

---
 R/apps.R | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/R/apps.R b/R/apps.R
index e143c116..5584acd4 100644
--- a/R/apps.R
+++ b/R/apps.R
@@ -1047,8 +1047,8 @@ balance = function(gg,
                     message("Number of allowed CNLOH sites: ", length(cnloh.og.edges))
                 }
             } else {
-                warning("CNLOH not specified on edges. Allowing everywhere!")
-                cnloh.og.edges = gg$edges$dt$og.edge.id %>% unique
+                warning("CNLOH not specified on edges. Disallowing!")
+                cnloh.og.edges = c()
             }
         } else {
 
@@ -2427,6 +2427,7 @@ phased.postprocess = function(gg, phase.blocks = NULL, mc.cores = 8, verbose = 1
 #' @param min.bins (numeric) minimum number of bins for intra segment variance (default 3)
 #' @param min.var (numeric) min allowable variance (default 0.1)
 #' @param verbose (bool) default TRUE for debugging
+#' @param min.width (numeric) min allowable width for cnloh-adjacent node. default 1 Mbp
 #' @param mc.cores (int) number of cores
 #' @return gGraph whose nodes are annotated with $cn.major, $cn.minor, $haplotype, and $weight fields
 #' @export
@@ -2439,7 +2440,7 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL,
                            edge.phase.dt = NULL,
                            vbase.count.thres = 5, vbase.prop.thres = 0.9,
                            min.bins = 3, min.var = 1e-3,
-                           verbose = TRUE, mc.cores = 8)
+                           verbose = TRUE, min.width = 1e6, mc.cores = 8)
 {
     if (verbose) {
         message("Checking inputs")
@@ -2527,12 +2528,18 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL,
             ## create node map to find which nodes were split up
             node.map = as.data.table(gg$nodes$gr[, "node.id"] %$% old.nodes[, "unsplit.id"])
 
+            ## create a list of short nodes (CNLOH not allowed here!)
+            short.nodes = as.data.table(gg$nodes$gr)[width < min.width, node.id]
+
             ## identify which REF edges are internal to an OG node
             edge.map = gg$edges$dt
-            edge.map[, n1.unsplit := node.map$unsplit.id[match(n1, unsplit.nodes$node.id)]]
-            edge.map[, n2.unsplit := node.map$unsplit.id[match(n2, unsplit.nodes$node.id)]]
+            edge.map[, n1.unsplit := node.map$unsplit.id[match(n1, node.map$node.id)]]
+            edge.map[, n2.unsplit := node.map$unsplit.id[match(n2, node.map$node.id)]]
 
-            internal.edges = edge.map[type == "REF" & n1.unsplit == n2.unsplit, edge.id]
+            internal.edges = edge.map[type == "REF" &
+                                      n1.unsplit == n2.unsplit &
+                                      !(n1 %in% short.nodes) &
+                                      !(n2 %in% short.nodes), edge.id]
 
             ## mark CNLOH in gg
             gg$edges[internal.edges]$mark(cnloh = TRUE)

From be135594c811604cae9232733feb1b899c0bb3f2 Mon Sep 17 00:00:00 2001
From: Zi-Ning Choo <zining01@gmail.com>
Date: Thu, 1 Jul 2021 11:32:39 -0400
Subject: [PATCH 20/35] update phased.binstats and balance to penalize CNLOH
 and reduce false positives

---
 R/apps.R | 96 +++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 71 insertions(+), 25 deletions(-)

diff --git a/R/apps.R b/R/apps.R
index 5584acd4..d68aa64e 100644
--- a/R/apps.R
+++ b/R/apps.R
@@ -1012,8 +1012,13 @@ balance = function(gg,
         ## force nonzero CN for ALT edges (because these have nonzero CN in original JaBbA output)
         ## can become infeasible if original graph is not compatible with ISM
         if (force.alt) {
+
+            if (ism) {
+                warning("Forcing ALT edges while running ISM can make some problems infeasible!")
+            }
+            
             iconstraints = unique(
-                vars[type == "edge.indicator" & ref.or.alt == "ALT",
+                vars[type == "edge.indicator" & ref.or.alt == "ALT" & cnloh != TRUE,
                      .(value = 1, id, og.edge.id,
                        edge.id = abs(sedge.id),
                        cid = paste("edge.indicator.sum.lb", og.edge.id))],
@@ -1026,7 +1031,7 @@ balance = function(gg,
                 fill = TRUE)
 
             edge.indicator.b = unique(
-                vars[type == "edge.indicator" & ref.or.alt == "ALT",
+                vars[type == "edge.indicator" & ref.or.alt == "ALT" & cnloh != TRUE,
                      .(value = 1, sense = "G", cid = paste("edge.indicator.sum.lb", og.edge.id))],
                 by = "cid"
             )
@@ -1041,46 +1046,58 @@ balance = function(gg,
             ## if allow CNLOH, the sum of edge indicators corresponding with og edge id is LEQ 2
             ## this is only allowed in constant CN regions and if breakpoint is not shared with any ALT edges
 
+            ## penalize CNLOH edges
+
             if (!is.null(gg$edges$dt$cnloh)) {
                 cnloh.og.edges = gg$edges$dt[cnloh == TRUE, og.edge.id] %>% unique
                 if (verbose) {
                     message("Number of allowed CNLOH sites: ", length(cnloh.og.edges))
                 }
+
+                ## add CNLOH annotation to variables
+                ## browser()
+                vars[, cnloh := FALSE]
+                vars[(type == "edge.indicator" | type == "edge" | type == "eresidual") &
+                     ref.or.alt == "ALT" & og.edge.id %in% cnloh.og.edges,
+                     ":="(cnloh = TRUE)]
+                
             } else {
                 warning("CNLOH not specified on edges. Disallowing!")
                 cnloh.og.edges = c()
+                vars[, cnloh := FALSE]
             }
         } else {
 
             cnloh.og.edges = c()
+            vars[, cnloh := FALSE]
             
         }
 
         ## add CNLOH constraints for applicable edges
         
-        iconstraints = unique(
-            vars[type == "edge.indicator" & ref.or.alt == "REF" & og.edge.id %in% cnloh.og.edges,
-                 .(value = 1, id, edge.id = abs(sedge.id),
-                   cid = paste("ref.configuration.constraint.cnloh", og.edge.id))],
-            by = "edge.id"
-        )
-
-        rhs = unique(
-            vars[type == "edge.indicator" & ref.or.alt == "REF" & og.edge.id %in% cnloh.og.edges,
-                 .(value = 2, sense = "L",
-                   cid = paste("ref.configuration.constraint.cnloh", og.edge.id))],
-            by = "cid"
-        )
-
-        constraints = rbind(constraints,
-                            iconstraints[, .(value, id, cid)],
-                            fill = TRUE)
-        b = rbind(b, rhs, fill = TRUE)
-
-        ## add ISM constraints for other edges
+        ## iconstraints = unique(
+        ##     vars[type == "edge.indicator" & ref.or.alt == "REF" & og.edge.id %in% cnloh.og.edges,
+        ##          .(value = 1, id, edge.id = abs(sedge.id),
+        ##            cid = paste("ref.configuration.constraint.cnloh", og.edge.id))],
+        ##     by = "edge.id"
+        ## )
+
+        ## rhs = unique(
+        ##     vars[type == "edge.indicator" & ref.or.alt == "REF" & og.edge.id %in% cnloh.og.edges,
+        ##          .(value = 2, sense = "L",
+        ##            cid = paste("ref.configuration.constraint.cnloh", og.edge.id))],
+        ##     by = "cid"
+        ## )
+
+        ## constraints = rbind(constraints,
+        ##                     iconstraints[, .(value, id, cid)],
+        ##                     fill = TRUE)
+        ## b = rbind(b, rhs, fill = TRUE)
+
+        ## add ISM constraints for ALL REF edges (as CNLOH is now marked as ALT)
         
         iconstraints.from = unique(
-            vars[type == "edge.indicator" & ref.or.alt == "REF" & !(og.edge.id %in% cnloh.og.edges),
+            vars[type == "edge.indicator" & ref.or.alt == "REF", ##& !(og.edge.id %in% cnloh.og.edges),
                  .(value = 1, id,
                    edge.id = abs(sedge.id),
                    snode.id = from, ## this is actually a misleading name because from is the row in gg$dt
@@ -1089,7 +1106,7 @@ balance = function(gg,
         )
 
         iconstraints.to = unique(
-            vars[type == "edge.indicator" & ref.or.alt == "REF" & !(og.edge.id %in% cnloh.og.edges),
+            vars[type == "edge.indicator" & ref.or.alt == "REF", ##& !(og.edge.id %in% cnloh.og.edges),
                  .(value = 1, id,
                    edge.id = abs(sedge.id),
                    snode.id = to,
@@ -1377,6 +1394,16 @@ balance = function(gg,
         Qmat = NULL ## no Q if solving LP
     }
 
+    ## browser()
+    if (cnloh) {
+
+        if ("cnloh" %in% colnames(vars)) {
+            indices = which(vars$type == "edge.indicator" & !is.na(vars$cnloh) & vars$cnloh == TRUE)
+            cvec[indices] = lambda
+        }
+    }
+  
+
   lb = vars$lb
   ub = vars$ub
 
@@ -2542,11 +2569,14 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL,
                                       !(n2 %in% short.nodes), edge.id]
 
             ## mark CNLOH in gg
+            gg$edges$mark(cnloh = FALSE)
             gg$edges[internal.edges]$mark(cnloh = TRUE)
 
             if (verbose) {
                 message("Number of internal edges marked in parent graph: ", length(internal.edges))
             }
+        } else {
+            gg$edges$mark(cnloh = FALSE)
         }
     }
 
@@ -2719,9 +2749,25 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL,
     ## mark CNLOH edges
     if (!is.null(gg$edges$dt$cnloh)) {
         og.cnloh.edges = gg$edges$dt[cnloh == TRUE, edge.id]
-        phased.gg.edges[og.edge.id %in% og.cnloh.edges, cnloh := TRUE]
+
+        ## identify CNLOH cross edges and add extra edges that are ALT
+        phased.cnloh.edges = phased.gg.edges[og.edge.id %in% og.cnloh.edges & (n1.allele != n2.allele),]
+        phased.cnloh.edges[, ":="(cnloh = TRUE, type = "ALT", class = "CNLOH")]
+
+        ## phased.gg.edges[og.edge.id %in% og.cnloh.edges, cnloh := TRUE]
+        phased.gg.edges = rbind(phased.gg.edges, phased.cnloh.edges, fill = TRUE)
+
+        if (verbose) {
+            message("Number of CNLOH ALT edges added: ", phased.cnloh.edges[, .N])
+        }
     }
 
+    ## OLD: CNLOH was REF
+    ## if (!is.null(gg$edges$dt$cnloh)) {
+    ##     og.cnloh.edges = gg$edges$dt[cnloh == TRUE, edge.id]
+    ##     phased.gg.edges[og.edge.id %in% og.cnloh.edges, cnloh := TRUE]
+    ## }
+
     ## identify phased edges (for linked reads)
     if (!is.null(edge.phase.dt)) {
 

From 7c356adfab6d12fe1131e45f05ce6d548c7e6dd1 Mon Sep 17 00:00:00 2001
From: Zi-Ning Choo <zining01@gmail.com>
Date: Mon, 5 Jul 2021 15:16:23 -0400
Subject: [PATCH 21/35] update phased.postprocess to disjoin gGraph before
 compressing allelic balance nodes

---
 R/apps.R | 129 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 118 insertions(+), 11 deletions(-)

diff --git a/R/apps.R b/R/apps.R
index d68aa64e..715f0ad6 100644
--- a/R/apps.R
+++ b/R/apps.R
@@ -2323,11 +2323,116 @@ phased.postprocess = function(gg, phase.blocks = NULL, mc.cores = 8, verbose = 1
     }
     gg = gg$copy
 
+    ## identify unphased nodes to disjoin against
+    ## browser()
+
+    if (verbose) {
+        message("Disjoining input graph against unphased node GRanges")
+    }
+    
+    seed.dt = merge.data.table(gg$nodes$dt[allele == "major", .(seqnames, start, end, width,
+                                                                major.cn = cn, og.node.id,
+                                                                major.loose.left = loose.left,
+                                                                major.loose.right = loose.right,
+                                                                major.node.id = node.id)],
+                               gg$nodes$dt[allele == "minor", .(og.node.id, minor.cn = cn,
+                                                                minor.loose.left = loose.left,
+                                                                minor.loose.right = loose.right,
+                                                                minor.node.id = node.id)],
+                               by = "og.node.id")
+
+    ## check whether there is an allelic CN change on either the left or right
+    ## sort nodes by start and end. nodes should be sorted, but just in case
+    seed.dt = seed.dt %>% split(seed.dt$seqnames) %>% lapply(function(dt) {dt[order(start),]}) %>% rbindlist
+
+    ## check for major or minor allele CN change on the left/right
+    seed.dt[, major.prev := data.table::shift(major.cn, n = 1, type = "lag")]
+    seed.dt[, major.next := data.table::shift(major.cn, n = 1, type = "lead")]
+    seed.dt[, minor.prev := data.table::shift(minor.cn, n = 1, type = "lag")]
+    seed.dt[, minor.next := data.table::shift(minor.cn, n = 1, type = "lead")]
+
+    seed.dt[, major.left.cn := major.cn != major.prev]
+    seed.dt[, major.right.cn := major.cn != major.next]
+    seed.dt[, minor.left.cn := minor.cn != minor.prev]
+    seed.dt[, minor.right.cn := minor.cn != minor.next]
+
+    ## drop nodes with CN imbalance
+    seed.dt = seed.dt[major.cn == minor.cn,]
+
+    ## identify whether the major or minor node is joined to an ALT edge with non-zero CN
+    nonzero.alt.left = c(gg$edges$dt[type == "ALT" & cn > 0 & n1.side == "left", n1],
+                         gg$edges$dt[type == "ALT" & cn > 0 & n2.side == "left", n2])
+    nonzero.alt.right = c(gg$edges$dt[type == "ALT" & cn > 0 & n1.side == "right", n1],
+                          gg$edges$dt[type == "ALT" & cn > 0 & n2.side == "right", n2])
+
+    seed.dt[, ":="(major.alt.left = major.node.id %in% nonzero.alt.left,
+                   major.alt.right = major.node.id %in% nonzero.alt.right,
+                   minor.alt.left = minor.node.id %in% nonzero.alt.left,
+                   minor.alt.right = minor.node.id %in% nonzero.alt.right)]
+
+    ## label telomeric
+    sl = seqlengths(gg$nodes$gr)
+    seed.dt[, left.telomeric := start == 1]
+    seed.dt[, right.telomeric := end == sl[as.character(seqnames)]]
+
+    ## check whether the left and right sides are EITHER loose
+    seed.dt[, left.alt := (major.alt.left == TRUE | minor.alt.left == TRUE |
+                           major.loose.left == TRUE | minor.loose.left == TRUE) &
+            (major.left.cn == TRUE | minor.left.cn == TRUE) & (left.telomeric == FALSE)]
+
+    seed.dt[, right.alt := (major.alt.right == TRUE | minor.alt.right == TRUE |
+                           major.loose.right == TRUE | minor.loose.right == TRUE) &
+            (major.right.cn == TRUE | minor.right.cn == TRUE) & (right.telomeric == FALSE)]
+
+    ## shift the end points
+    seed.dt[left.alt == TRUE & width > 1, start := start + 1]
+    seed.dt[right.alt == TRUE & width > 1, end := end - 1]
+
+    ## create GRanges
+    seed.gr = dt2gr(seed.dt[, .(seqnames, start, end)])
+
+    ## disjoin gGraph against this GRanges
+    gg = gg$disjoin(seed.gr, collapse = FALSE)
+
+    ## any new edges introduced have to be straight
+    gg$edges[is.na(connection)]$mark(connection = "straight")
+
+    ## fill in other metadata
+    n1 = gg$edges$dt[, n1]
+    n2 = gg$edges$dt[, n2]
+
+    ## borrow CN from surrounding nodes
+    n1.na = gg$edges$dt[is.na(cn), n1]
+    gg$edges[is.na(cn)]$mark(cn = gg$nodes$dt$cn[match(n1.na, gg$nodes$dt$node.id)])
+
+    ## fix loose end CNs
+    gg = gGnome:::loosefix(gg)
+
+    ## label n1/n2 allele and chromosome
+    gg$edges$mark(n1.allele = gg$nodes$dt$allele[match(n1, gg$nodes$dt$node.id)])
+    gg$edges$mark(n2.allele = gg$nodes$dt$allele[match(n2, gg$nodes$dt$node.id)])
+    gg$edges$mark(n1.chr = gg$nodes$dt$seqnames[match(n1, gg$nodes$dt$node.id)])
+    gg$edges$mark(n2.chr = gg$nodes$dt$seqnames[match(n2, gg$nodes$dt$node.id)])
+
+    ## reset og.node.ids and og.edge.ids
+    node.id.key = gg$nodes$dt[, .(seqnames, start, end, rg = paste0(seqnames, ":", start, "-", end), node.id)]
+    node.id.key[, rg := as.integer(factor(rg))]
+    gg$nodes$mark(og.node.id = node.id.key[, rg])
+
+    ## reset og edge ids
+    edge.id.key = gg$edges$dt[, .(n1, n1.side, n2, n2.side, type)]
+    edge.id.key[, ":="(n1.og = node.id.key$rg[match(n1, node.id.key$node.id)],
+                       n2.og = node.id.key$rg[match(n2, node.id.key$node.id)])]
+    edge.id.key[, rg := paste(n1.og, n2.og, n1.side, n2.side, type)]
+    edge.id.key[, rg := as.integer(factor(rg))]
+    gg$edges$mark(og.edge.id = edge.id.key[, rg])
+
+
     ## identify nodes without CN imbalance
     if (verbose) {
         message("Identifying nodes without CN imbalance")
     }
-    og.node.balance = gg$nodes$dt[, .(og.node.id, allele, cn)] %>%
+    og.node.balance = gg$nodes$dt[width > 1, .(og.node.id, allele, cn)] %>% ## filter by width to keep 1bp stubs
         dcast.data.table(og.node.id ~ allele, value.var = "cn")
 
     og.node.balance[, cn.imbalance := (major != minor)]
@@ -2387,12 +2492,14 @@ phased.postprocess = function(gg, phase.blocks = NULL, mc.cores = 8, verbose = 1
     new.nodes.dt[allele == "unphased", col := alpha("gray", 0.5)]
 
     ## get nodes as GRanges
+    new.nodes.dt = new.nodes.dt %>% split(new.nodes.dt$seqnames) %>%
+        lapply(function(dt) {dt[order(start),]}) %>% rbindlist
     new.nodes.gr = dt2gr(new.nodes.dt[, .(seqnames, start, end,
                                           og.node.id, marginal.cn, allele,
                                           var, nbins, weight, index, col,
                                           cn.old, cn, fix, ywid,
                                           old.node.id = node.id)],
-                         seqinfo = seqinfo(gg$nodes$gr)) %>% gr.sort
+                         seqinfo = seqinfo(gg$nodes$gr))
 
 
     ## reset edge endpoints
@@ -2405,24 +2512,24 @@ phased.postprocess = function(gg, phase.blocks = NULL, mc.cores = 8, verbose = 1
     new.edges.dt[, n1 := match(n1, new.nodes.gr$old.node.id)]
     new.edges.dt[, n2 := match(n2, new.nodes.gr$old.node.id)]
 
-    new.edges.dt = new.edges.dt[cn > 0,]
+    ## label REF edges as straight or cross based on og.node.id
+    ## browser()
+    ## new.edges.dt[type == "REF" & cn > 0, length(unique(connection)), by = og.edge.id] %>% summary
+    new.edges.dt[type == "REF", orientation := .SD$connection[which(.SD$cn > 0)][1], by = og.edge.id]
+
+    ## only keep REF edges in the correct orientation (regardless of CN)
+    ## only keep ALT edges with CN > 0
+    new.edges.dt = new.edges.dt[(type == "REF" & connection == orientation) |
+                                (type == "ALT" & cn > 0),]
 
     ## remove edge CN and fix
     if ("fix" %in% colnames(new.edges.dt)) {
         new.edges.dt$fix = NULL
     }
 
-    ## if ("cn" %in% colnames(new.edges.dt)) {
-    ##     new.edges.dt$cn = NULL
-    ## }
-    
-
     if (verbose) {
         message("Creating new gGraph")
     }
-    ## postprocessed.gg = balance(gG(nodes = new.nodes.gr, edges = new.edges.dt),
-    ##                            M = 1e3, ism = FALSE, verbose = verbose, epgap = 1e-4,
-    ##                            marginal = NULL)
 
     new.nodes.gr = gGnome:::inferLoose(new.nodes.gr, new.edges.dt)
 

From be1d2f328badaafaad93d9366bad2a499617edc6 Mon Sep 17 00:00:00 2001
From: Zi-Ning Choo <zining01@gmail.com>
Date: Wed, 14 Jul 2021 14:11:29 -0400
Subject: [PATCH 22/35] cnloh detection update

---
 R/apps.R | 112 +++++++++++++++++++++++++++++++++----------------------
 1 file changed, 67 insertions(+), 45 deletions(-)

diff --git a/R/apps.R b/R/apps.R
index 715f0ad6..e51f38ba 100644
--- a/R/apps.R
+++ b/R/apps.R
@@ -1049,16 +1049,16 @@ balance = function(gg,
             ## penalize CNLOH edges
 
             if (!is.null(gg$edges$dt$cnloh)) {
-                cnloh.og.edges = gg$edges$dt[cnloh == TRUE, og.edge.id] %>% unique
+                cnloh.edges = gg$edges$dt[cnloh == TRUE & type == "ALT", edge.id] %>% unique
                 if (verbose) {
-                    message("Number of allowed CNLOH sites: ", length(cnloh.og.edges))
+                    message("Number of marked CNLOH edges: ", length(cnloh.edges))
                 }
 
                 ## add CNLOH annotation to variables
                 ## browser()
                 vars[, cnloh := FALSE]
                 vars[(type == "edge.indicator" | type == "edge" | type == "eresidual") &
-                     ref.or.alt == "ALT" & og.edge.id %in% cnloh.og.edges,
+                     ref.or.alt == "ALT" & (abs(sedge.id) %in% cnloh.edges),
                      ":="(cnloh = TRUE)]
                 
             } else {
@@ -1400,6 +1400,8 @@ balance = function(gg,
         if ("cnloh" %in% colnames(vars)) {
             indices = which(vars$type == "edge.indicator" & !is.na(vars$cnloh) & vars$cnloh == TRUE)
             cvec[indices] = lambda
+
+            message("Number of penalized CNLOH edges: ", length(indices))
         }
     }
   
@@ -2378,11 +2380,13 @@ phased.postprocess = function(gg, phase.blocks = NULL, mc.cores = 8, verbose = 1
     ## check whether the left and right sides are EITHER loose
     seed.dt[, left.alt := (major.alt.left == TRUE | minor.alt.left == TRUE |
                            major.loose.left == TRUE | minor.loose.left == TRUE) &
-            (major.left.cn == TRUE | minor.left.cn == TRUE) & (left.telomeric == FALSE)]
+                  (left.telomeric == FALSE)]
+            ## (major.left.cn == TRUE | minor.left.cn == TRUE) & (left.telomeric == FALSE)]
 
     seed.dt[, right.alt := (major.alt.right == TRUE | minor.alt.right == TRUE |
                            major.loose.right == TRUE | minor.loose.right == TRUE) &
-            (major.right.cn == TRUE | minor.right.cn == TRUE) & (right.telomeric == FALSE)]
+                  (right.telomeric == FALSE)]
+            ## (major.right.cn == TRUE | minor.right.cn == TRUE) & (right.telomeric == FALSE)]
 
     ## shift the end points
     seed.dt[left.alt == TRUE & width > 1, start := start + 1]
@@ -2641,50 +2645,64 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL,
 
     if (verbose) {
         message("Checking for extra breakpoints")
+    }
 
-        if (!is.null(breaks) && length(breaks) > 0) {
-            ## get just the starting point and remove strand
-            br = gr.stripstrand(gr.start(breaks))
+    if (!is.null(breaks) && length(breaks) > 0) {
+        ## get just the starting point and remove strand
+        br = gr.stripstrand(gr.start(breaks))
 
-            if (verbose) {
-                message("Number of new breaks: ", length(br))
-                message("Creating new gGraph with incorporated breaks...")
-            }
+        if (verbose) {
+            message("Number of new breaks: ", length(br))
+            message("Creating new gGraph with incorporated breaks...")
+        }
 
-            ## save old node IDs
-            old.nodes = gg$nodes$gr[, "node.id"]
-            old.nodes$unsplit.id = old.nodes$node.id
+        ## save old node IDs
+        old.nodes = gg$nodes$gr[, "node.id"]
+        old.nodes$unsplit.id = old.nodes$node.id
 
-            ## create new gGraph with additional breakpoints
-            new.nodes = gr.breaks(bps = br, query = gg$nodes$gr)
-            gg = gG(breaks = new.nodes, junctions = gg$junctions[type == "ALT"])
+        ## create new gGraph with additional breakpoints
+        new.nodes = gr.breaks(bps = br, query = gg$nodes$gr)
+        gg = gG(breaks = new.nodes, junctions = gg$junctions[type == "ALT"])
 
-            ## create node map to find which nodes were split up
-            node.map = as.data.table(gg$nodes$gr[, "node.id"] %$% old.nodes[, "unsplit.id"])
+        ## create node map to find which nodes were split up
+        node.map = as.data.table(gg$nodes$gr[, "node.id"] %$% old.nodes[, "unsplit.id"])
 
-            ## create a list of short nodes (CNLOH not allowed here!)
-            short.nodes = as.data.table(gg$nodes$gr)[width < min.width, node.id]
+        ## create a list of short nodes (CNLOH not allowed here!)
+        short.nodes = as.data.table(gg$nodes$gr)[width < min.width, node.id]
 
-            ## identify which REF edges are internal to an OG node
-            edge.map = gg$edges$dt
-            edge.map[, n1.unsplit := node.map$unsplit.id[match(n1, node.map$node.id)]]
-            edge.map[, n2.unsplit := node.map$unsplit.id[match(n2, node.map$node.id)]]
+        ## identify which REF edges are internal to an OG node
+        edge.map = gg$edges$dt
+        edge.map[, n1.unsplit := node.map$unsplit.id[match(n1, node.map$node.id)]]
+        edge.map[, n2.unsplit := node.map$unsplit.id[match(n2, node.map$node.id)]]
 
-            internal.edges = edge.map[type == "REF" &
-                                      n1.unsplit == n2.unsplit &
-                                      !(n1 %in% short.nodes) &
-                                      !(n2 %in% short.nodes), edge.id]
+        internal.edges = edge.map[type == "REF" &
+                                  n1.unsplit == n2.unsplit &
+                                  !(n1 %in% short.nodes) &
+                                  !(n2 %in% short.nodes), edge.id]
 
-            ## mark CNLOH in gg
-            gg$edges$mark(cnloh = FALSE)
-            gg$edges[internal.edges]$mark(cnloh = TRUE)
+        ## mark CNLOH in gg
+        gg$edges$mark(cnloh = FALSE)
+        gg$edges[internal.edges]$mark(cnloh = TRUE)
 
-            if (verbose) {
-                message("Number of internal edges marked in parent graph: ", length(internal.edges))
-            }
-        } else {
-            gg$edges$mark(cnloh = FALSE)
+        if (verbose) {
+            message("Number of internal edges marked in parent graph: ", length(internal.edges))
         }
+    } else {
+        gg$edges$mark(cnloh = FALSE)
+    }
+
+    if (verbose) {
+        message("Marking pseudo-CNLOH")
+    }
+    ## mark pseudo-CNLOH?
+    pseudo.cnloh = gg$edges$dt[, .(edge.id, type)]
+    pseudo.cnloh[, span := gg$junctions$span]
+
+    pseudo.cnloh.edges = pseudo.cnloh[span < 1e6 & type == "ALT", edge.id]
+    gg$edges[pseudo.cnloh.edges]$mark(cnloh = TRUE)
+
+    if (verbose) {
+        message("Number of pseudo-CNLOH edges marked:", length(pseudo.cnloh.edges))
     }
 
     if (verbose) {
@@ -2855,7 +2873,7 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL,
 
     ## mark CNLOH edges
     if (!is.null(gg$edges$dt$cnloh)) {
-        og.cnloh.edges = gg$edges$dt[cnloh == TRUE, edge.id]
+        og.cnloh.edges = gg$edges$dt[cnloh == TRUE & type == "REF", edge.id]
 
         ## identify CNLOH cross edges and add extra edges that are ALT
         phased.cnloh.edges = phased.gg.edges[og.edge.id %in% og.cnloh.edges & (n1.allele != n2.allele),]
@@ -2867,13 +2885,17 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL,
         if (verbose) {
             message("Number of CNLOH ALT edges added: ", phased.cnloh.edges[, .N])
         }
-    }
 
-    ## OLD: CNLOH was REF
-    ## if (!is.null(gg$edges$dt$cnloh)) {
-    ##     og.cnloh.edges = gg$edges$dt[cnloh == TRUE, edge.id]
-    ##     phased.gg.edges[og.edge.id %in% og.cnloh.edges, cnloh := TRUE]
-    ## }
+        ## mark pseudo-cnloh edges in child graph
+        og.pseudo.cnloh.edges = gg$edges$dt[cnloh == TRUE & type == "ALT", edge.id]
+        phased.gg.edges[og.edge.id %in% og.pseudo.cnloh.edges & (n1.allele != n2.allele), cnloh := TRUE]
+
+        if (verbose) {
+            message("Number of pseudo-CNLOH ALT edges marked: ",
+                    phased.gg.edges[og.edge.id %in% og.pseudo.cnloh.edges & (n1.allele != n2.allele), .N])
+        }
+
+    }
 
     ## identify phased edges (for linked reads)
     if (!is.null(edge.phase.dt)) {

From d47a83d99421ed64742e514f52b8fe91835a45e5 Mon Sep 17 00:00:00 2001
From: Zi-Ning Choo <zining01@gmail.com>
Date: Tue, 20 Jul 2021 10:14:08 -0400
Subject: [PATCH 23/35] fix no ALT edges CNLOH bug

---
 R/apps.R | 51 +++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 43 insertions(+), 8 deletions(-)

diff --git a/R/apps.R b/R/apps.R
index f98dde42..b1db52f9 100644
--- a/R/apps.R
+++ b/R/apps.R
@@ -2694,6 +2694,7 @@ phased.postprocess = function(gg, phase.blocks = NULL, mc.cores = 8, verbose = 1
 #' @param vbase.prop.thres (float) proportion of allele excess required to phase edges (default 0.9)
 #' @param min.bins (numeric) minimum number of bins for intra segment variance (default 3)
 #' @param min.var (numeric) min allowable variance (default 0.1)
+#' @param max.span (numeric) max span before penalizing CNLOH
 #' @param verbose (bool) default TRUE for debugging
 #' @param min.width (numeric) min allowable width for cnloh-adjacent node. default 1 Mbp
 #' @param mc.cores (int) number of cores
@@ -2708,6 +2709,7 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL,
                            edge.phase.dt = NULL,
                            vbase.count.thres = 5, vbase.prop.thres = 0.9,
                            min.bins = 3, min.var = 1e-3,
+                           max.span = 1e6,
                            verbose = TRUE, min.width = 1e6, mc.cores = 8)
 {
     if (verbose) {
@@ -2824,17 +2826,50 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL,
     if (verbose) {
         message("Marking pseudo-CNLOH")
     }
-    ## mark pseudo-CNLOH?
-    pseudo.cnloh = gg$edges$dt[, .(edge.id, type)]
-    pseudo.cnloh[, span := gg$junctions$span]
 
-    pseudo.cnloh.edges = pseudo.cnloh[span < 1e6 & type == "ALT", edge.id]
-    gg$edges[pseudo.cnloh.edges]$mark(cnloh = TRUE)
+    ## pull intra-chromosomal ALT edges, because these are cnloh candidates
+    pseudo.cnloh.junctions = gg$junctions[class %in% c("DEL-like", "INV-like", "DUP-like")]
+    pseudo.cnloh.junctions.dt = pseudo.cnloh.junctions$dt
+
+    if (nrow(pseudo.cnloh.junctions.dt)) {
+
+        ## grab span - limit to below max.span
+        pseudo.cnloh.junctions.dt[, span := pseudo.cnloh.junctions$span]
+
+        ## compute node overlap with shadow
+        node.overlap = pseudo.cnloh.junctions$shadow %N% gg$nodes$gr
+        pseudo.cnloh.junctions.dt[, node.overlap.count := node.overlap]
+
+        ## mark candidates
+        pseudo.cnloh.edges = c(pseudo.cnloh.junctions.dt[span < max.span &
+                                                         class == "DEL-like" &
+                                                         node.overlap.count <= 3, edge.id],
+                               pseudo.cnloh.junctions.dt[span < max.span &
+                                                         class == "INV-like" &
+                                                         node.overlap.count <= 2, edge.id],
+                               pseudo.cnloh.junctions.dt[span < max.span &
+                                                         class == "DUP-like" &
+                                                         node.overlap.count <= 1, edge.id])
+        
+        ## pseudo.cnloh.edges = pseudo.cnloh[span < max.span & type == "ALT", edge.id]
+        gg$edges[pseudo.cnloh.edges]$mark(cnloh = TRUE)
+
+        if (verbose) {
+            message("Number of pseudo-CNLOH edges marked:", length(pseudo.cnloh.edges))
+        }
+
+    } else {
+
+        gg$edges$mark(cnloh = FALSE)
+
+        if (verbose) {
+            message("No pseudo-CNLOH edges detected.")
+        }
 
-    if (verbose) {
-        message("Number of pseudo-CNLOH edges marked:", length(pseudo.cnloh.edges))
     }
 
+    
+
     if (verbose) {
         message("Preparing phased gGraph nodes")
     }
@@ -3016,7 +3051,7 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL,
             message("Number of CNLOH ALT edges added: ", phased.cnloh.edges[, .N])
         }
 
-        ## mark pseudo-cnloh edges in child graph
+        ## mark pseudo-cnloh edges in child graph and make sure these are all zero
         og.pseudo.cnloh.edges = gg$edges$dt[cnloh == TRUE & type == "ALT", edge.id]
         phased.gg.edges[og.edge.id %in% og.pseudo.cnloh.edges & (n1.allele != n2.allele), cnloh := TRUE]
 

From 0122c0f6abb50ededde4da78cb8831e02b89dea7 Mon Sep 17 00:00:00 2001
From: Zi-Ning Choo <zining01@gmail.com>
Date: Mon, 26 Jul 2021 11:03:45 -0400
Subject: [PATCH 24/35] start new postprocessing fx

---
 R/apps.R | 46 ++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 42 insertions(+), 4 deletions(-)

diff --git a/R/apps.R b/R/apps.R
index bccfd51f..cf13c61b 100644
--- a/R/apps.R
+++ b/R/apps.R
@@ -964,8 +964,9 @@ balance = function(gg,
                 )
 
             ## filter constraints to only include things with >= 4 entries (e.g. must have an ALT edge)
-            rhomol.constraints[, n.entries := .N, by = cid]
-            rhomol.constraints = rhomol.constraints[n.entries > 3, .(value, id, cid)]
+            ## rhomol.constraints[, n.entries := .N, by = cid]
+            ## remove this filter! due to some loose end violations!
+            ## rhomol.constraints = rhomol.constraints[n.entries > 3, .(value, id, cid)]
 
             rhs = unique(rhomol.constraints[, .(value = 2, sense = "L", cid)], by = "cid")
 
@@ -2423,6 +2424,40 @@ binstats = function(gg, bins, by = NULL, field = NULL, purity = gg$meta$purity,
   return(gg)
 }
 
+#' @name find_na_ranges
+#' @title find_na_ranges
+#'
+#' @description
+#'
+#' Identify NA ranges in a phased gGraph that actually cannot be phased
+#'
+#' @param gg (gGraph) junction-balanced allelic graph
+#' @param min.bins (numeric) default 1
+#' @param verbose (logical) default FALSE
+#'
+#' @return GRanges representing NA ranges that phased graph can be disjoined against in phased.postprocess
+find_na_ranges = function(gg, min.bins = 1, verbose = FALSE) {
+
+    if (!inherits(gg, "gGraph")) {
+        stop("gg is not gGraph")
+    }
+
+    if (is.null(gg$nodes$dt$nbins)) {
+        stop("gg nodes must have metadata $nbins")
+    }
+    
+    gg.nodes.gr = gg$nodes$gr[, c("nbins", "node.id")]
+    gg.nodes.gr$na.node = is.na(gg.nodes.gr$nbins) | gg.nodes.gr$nbins < min.bins
+
+    ## grab left and right endpoints
+    browser()
+    na.nodes.gr = gg.nodes.gr %Q% (na.node == TRUE)
+    na.nodes.gr = gr.reduce(gg.nodes.gr, by = "na.node")
+
+    
+}
+
+
 #' @name phased.postprocess
 #' @title phased.postprocess
 #' @description
@@ -2430,13 +2465,16 @@ binstats = function(gg, bins, by = NULL, field = NULL, purity = gg$meta$purity,
 #' Postprocess junction-balanced phased graph and creates unphased regions
 #' This identifies regions without allelic CN imbalance
 #'
-#' @param gg junction-balanced phased gGraph. each node must have associated og node.id
+#' @param gg junction-balanced phased gGraph. each node must have metadata og.nodes.id, allele, nbins, cn
+#' @param min.bins (numeric) minimum number of bins to be marked as an NA node. default 1
 #' @param phase.blocks (GRanges) granges of phase blocks from linked reads. default = NULL
 #' @param mc.cores (int) number of cores. default = 8.
 #' @param verbose (bool) verbose > 0 prints stuff. default 1.
 #'
+#' @return balanced gGraph with unphased nodes marked and compressed
+#'
 #' @export
-phased.postprocess = function(gg, phase.blocks = NULL, mc.cores = 8, verbose = 1)
+phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores = 8, verbose = 1)
 {
     ## check that gg nodes and edges have og node
     if (!("og.node.id" %in% colnames(gg$nodes$dt)) | !("og.edge.id" %in% colnames(gg$edges$dt))) {

From ca025417bdcb38b6ee9f82d6f7e7d84c955ce839 Mon Sep 17 00:00:00 2001
From: Zi-Ning Choo <zining01@gmail.com>
Date: Mon, 26 Jul 2021 17:03:57 -0400
Subject: [PATCH 25/35] bug fixes in postprocess

---
 R/apps.R | 144 +++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 118 insertions(+), 26 deletions(-)

diff --git a/R/apps.R b/R/apps.R
index cf13c61b..1c249684 100644
--- a/R/apps.R
+++ b/R/apps.R
@@ -2446,18 +2446,97 @@ find_na_ranges = function(gg, min.bins = 1, verbose = FALSE) {
         stop("gg nodes must have metadata $nbins")
     }
     
-    gg.nodes.gr = gg$nodes$gr[, c("nbins", "node.id")]
+    gg.nodes.gr = gg$nodes$gr[, c("nbins")]
     gg.nodes.gr$na.node = is.na(gg.nodes.gr$nbins) | gg.nodes.gr$nbins < min.bins
 
-    ## grab left and right endpoints
-    browser()
+    ## reduce NA nodes
     na.nodes.gr = gg.nodes.gr %Q% (na.node == TRUE)
-    na.nodes.gr = gr.reduce(gg.nodes.gr, by = "na.node")
 
+    ## if there are magically not any of these, return
+    if (length(na.nodes.gr) == 0) {
+        return(GRanges())
+    }
+    
+    na.nodes.gr = gr.reduce(na.nodes.gr, by = "na.node")
+
+    ## grab left and right endpoints (use OG node id)
+    na.nodes.gr$left.node.id = gr.findoverlaps(gr.start(na.nodes.gr), gg$nodes$gr[, "og.node.id"], first = TRUE, scol = "og.node.id")$og.node.id
+    na.nodes.gr$right.node.id = gr.findoverlaps(gr.end(na.nodes.gr), gg$nodes$gr[, "og.node.id"], first = TRUE, scol = "og.node.id")$og.node.id
+
+    ## check if these ranges are flanked by a rearrangement
+    og.loose.dt = gg$nodes$dt[, .(loose.left = any(loose.left == TRUE, na.rm = TRUE), loose.right = any(loose.right == TRUE, na.rm = TRUE)), by = og.node.id]
+    og.alt.dt = rbind(gg$edges$dt[type == "ALT", .(og.node.id = gg$nodes$dt$og.node.id[n1], side = n1.side)], gg$edges$dt[type == "ALT", .(og.node.id = gg$nodes$dt$og.node.id[n2], side = n2.side)])[, .(alt.left = any(side == "left", na.rm = TRUE), alt.right = any(side == "right", na.rm = TRUE)), by = og.node.id]
+
+    ## annotate loose and ALTs
+    na.nodes.gr$loose.left = na.nodes.gr$left.node.id %in% og.loose.dt[loose.left == TRUE, og.node.id]
+    na.nodes.gr$loose.right = na.nodes.gr$right.node.id %in% og.loose.dt[loose.right == TRUE, og.node.id]
+    na.nodes.gr$alt.left = na.nodes.gr$left.node.id %in% og.alt.dt[alt.left == TRUE, og.node.id]
+    na.nodes.gr$alt.right = na.nodes.gr$right.node.id %in% og.alt.dt[alt.right == TRUE, og.node.id]
+
+    ## next check if ranges are neighboring a rearrangement
+    ## create a data table with left and right neighbor of every og node
+    neighbors.dt = unique(gg$edges$dt[type == "REF", .(left.neighbor = gg$nodes$dt$og.node.id[n1], right.neighbor = gg$nodes$dt$og.node.id[n2])], by = "left.neighbor")
+
+    ## annotate with og.node.id of left and right neighbor
+    na.nodes.gr$left.neighbor = neighbors.dt$left.neighbor[match(na.nodes.gr$left.node.id, neighbors.dt$right.neighbor)]
+    na.nodes.gr$right.neighbor = neighbors.dt$right.neighbor[match(na.nodes.gr$right.node.id, neighbors.dt$left.neighbor)]
+
+    ## check if the neighbors of each node are flanked by a loose end
+    na.nodes.gr$loose.left.neighbor = na.nodes.gr$left.neighbor %in% og.loose.dt[loose.right == TRUE, og.node.id]
+    na.nodes.gr$loose.right.neighbor = na.nodes.gr$right.neighbor  %in% og.loose.dt[loose.left == TRUE, og.node.id]
+
+    ## check if the neighbors of each node are flanked by an ALT edge
+    na.nodes.gr$alt.left.neighbor = na.nodes.gr$left.neighbor %in% og.alt.dt[alt.right == TRUE, og.node.id]
+    na.nodes.gr$alt.right.neighbor = na.nodes.gr$right.neighbor %in% og.alt.dt[alt.left == TRUE, og.node.id]
+
+    ## get marginal copy number of left and right neighbor
+    marginal.cn.dt = gg$nodes$dt[allele == "major", .(og.node.id, marginal.cn, major.cn = cn)]
+    na.nodes.gr$left.neighbor.marginal.cn = marginal.cn.dt$marginal.cn[match(na.nodes.gr$left.neighbor, marginal.cn.dt$og.node.id)]
+    na.nodes.gr$right.neighbor.marginal.cn = marginal.cn.dt$marginal.cn[match(na.nodes.gr$right.neighbor, marginal.cn.dt$og.node.id)]
+    na.nodes.gr$left.neighbor.major.cn = marginal.cn.dt$major.cn[match(na.nodes.gr$left.neighbor, marginal.cn.dt$og.node.id)]
+    na.nodes.gr$right.neighbor.major.cn = marginal.cn.dt$major.cn[match(na.nodes.gr$right.neighbor, marginal.cn.dt$og.node.id)]
+
+    ## get major and marginal copy number of left and right endpoints
+    na.nodes.gr$left.marginal.cn = marginal.cn.dt$marginal.cn[match(na.nodes.gr$left.node.id, marginal.cn.dt$og.node.id)]
+    na.nodes.gr$right.marginal.cn = marginal.cn.dt$marginal.cn[match(na.nodes.gr$right.node.id, marginal.cn.dt$og.node.id)]
+    na.nodes.gr$left.major.cn = marginal.cn.dt$major.cn[match(na.nodes.gr$left.node.id, marginal.cn.dt$og.node.id)]
+    na.nodes.gr$right.major.cn = marginal.cn.dt$major.cn[match(na.nodes.gr$right.node.id, marginal.cn.dt$og.node.id)]
+
+    ## easier manipulation
+    na.nodes.dt = as.data.table(na.nodes.gr)
+
+    ## filter by marginal CN (e.g. if left and right neighbors have to have the same marginal)
+    na.nodes.dt = na.nodes.dt[(left.neighbor.marginal.cn == right.neighbor.marginal.cn) | is.na(left.marginal.cn) | is.na(right.marginal.cn),]
+
+    if (!nrow(na.nodes.dt)) {
+        return(GRanges())
+    }
+
+    ## remove LOH ranges
+    na.nodes.dt = na.nodes.dt[(left.neighbor.marginal.cn != left.neighbor.major.cn) |
+                              (left.marginal.cn != left.major.cn) |
+                              (right.neighbor.marginal.cn != right.neighbor.major.cn) |
+                              (right.marginal.cn != right.major.cn),]
     
+    if (!nrow(na.nodes.dt)) {
+        return(GRanges())
+    }
+
+    ## check if left/right are telomeric
+    na.nodes.dt[, left.telomeric := (start == 1)]
+    na.nodes.dt[, sl := seqlengths(gg$nodes$gr)[match(seqnames, names(seqlengths(gg$nodes$gr)))]]
+    na.nodes.dt[, right.telomeric := (end == sl)]
+
+    ## resize
+    na.nodes.dt[right.telomeric == FALSE & (alt.right.neighbor == TRUE | loose.right.neighbor == TRUE), end := end + 1]
+    na.nodes.dt[left.telomeric == FALSE & (alt.left.neighbor == TRUE | loose.left.neighbor == TRUE), start := start - 1]
+
+    ## filter by major CN (e.g. if left and right neighbors have LOH)
+    return(dt2gr(na.nodes.dt[, .(seqnames, start, end)], seqlengths = seqlengths(gg$nodes$gr)))
 }
 
 
+
 #' @name phased.postprocess
 #' @title phased.postprocess
 #' @description
@@ -2559,10 +2638,17 @@ phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores =
     seed.dt[right.alt == TRUE & width > 1, end := end - 1]
 
     ## create GRanges
-    seed.gr = dt2gr(seed.dt[, .(seqnames, start, end)])
+    seed.gr = dt2gr(seed.dt[, .(seqnames, start, end)], seqlengths = seqlengths(gg$nodes$gr), seqinfo = seqinfo(gg$nodes$gr))
+
+    ## merge with NA ranges
+    if (verbose) {
+        message("Identifying NA ranges")
+        }
+    na.gr = find_na_ranges(gg, min.bins = min.bins)
+    all.seed.gr = gr.reduce(c(seed.gr, na.gr))
 
     ## disjoin gGraph against this GRanges
-    gg = gg$disjoin(seed.gr, collapse = FALSE)
+    gg = gg$disjoin(all.seed.gr, collapse = FALSE)
 
     ## any new edges introduced have to be straight
     gg$edges[is.na(connection)]$mark(connection = "straight")
@@ -2602,13 +2688,24 @@ phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores =
     if (verbose) {
         message("Identifying nodes without CN imbalance")
     }
-    og.node.balance = gg$nodes$dt[width > 1, .(og.node.id, allele, cn)] %>% ## filter by width to keep 1bp stubs
-        dcast.data.table(og.node.id ~ allele, value.var = "cn")
+    ## browser()
+    og.node.balance = dcast.data.table(gg$nodes$dt[, .(og.node.id, allele, cn)], og.node.id ~ allele, value.var = "cn") %>% merge.data.table(gg$nodes$dt[, .(og.node.id, width)], by = "og.node.id", all.x = TRUE)
 
     og.node.balance[, cn.imbalance := (major != minor)]
     og.node.balance[, cn.total := (major + minor)]
 
-    og.node.balance[, phased := ifelse(cn.imbalance == TRUE | cn.total == 0, TRUE, FALSE)]
+    og.node.balance[, phased := ifelse(cn.imbalance == TRUE & width > 1, TRUE, FALSE)]
+
+    if (verbose) {
+        message("Identifying nodes in NA stretches")
+    }
+    ## browser()
+    na.node.ids = as.data.table(gg$nodes$gr[, c("og.node.id", "node.id")] %&% na.gr)[, og.node.id]
+
+    ## mark na nodes as unphased
+    og.node.balance[og.node.id %in% na.node.ids, phased := FALSE]
+
+    ## annotate
 
     if (verbose) {
         message("Number of potentially unphased nodes: ", sum(og.node.balance$phased == FALSE))
@@ -2649,15 +2746,11 @@ phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores =
     new.nodes.dt[node.id %in% unphased.major.nodes, allele := "unphased"]
 
     ## reset CN to total CN
-    new.nodes.dt[node.id %in% unphased.major.nodes,
-                 cn := og.node.balance$cn.total[match(og.node.id, og.node.balance$og.node.id)]]
+    ## browser()
+    new.nodes.dt[node.id %in% unphased.major.nodes, cn := og.node.balance$cn.total[match(og.node.id, og.node.balance$og.node.id)]]
 
     ## fix the CN of all of these nodes
-    new.nodes.dt[, fix := 1]
-
-    ## create new data.table for edges
-    new.edges.dt = gg$edges$dt[!(n1 %in% unphased.minor.nodes) | !(n2 %in% unphased.minor.nodes),]
-
+    ## new.nodes.dt[, fix := 1]
 
     ## reformat nodes
     new.nodes.dt[allele == "unphased", col := alpha("gray", 0.5)]
@@ -2670,17 +2763,19 @@ phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores =
                                           var, nbins, weight, index, col,
                                           cn.old, cn, fix, ywid,
                                           old.node.id = node.id)],
-                         seqinfo = seqinfo(gg$nodes$gr))
-
+                         seqinfo = seqinfo(gg$nodes$gr),
+                         seqlengths = seqlengths(gg$nodes$gr))
 
+    ## create new data.table for edges
+    new.edges.dt = gg$edges$dt
+    
     ## reset edge endpoints
-    dt = gg$nodes$dt[og.node.id %in% unphased.og.nodes, .(og.node.id, allele, node.id)] %>%
-        dcast.data.table(og.node.id ~ allele, value.var = "node.id")
+    ## browser()
+    dt = gg$nodes$dt[, .(og.node.id, allele, node.id)] %>% dcast.data.table(og.node.id ~ allele, value.var = "node.id")
     new.edges.dt[(n1 %in% unphased.minor.nodes), n1 := dt$major[match(n1, dt$minor)]]
     new.edges.dt[(n2 %in% unphased.minor.nodes), n2 := dt$major[match(n2, dt$minor)]]
 
     ## reset all edge endpoints to new node.ids
-
     new.edges.dt[, n1 := match(n1, new.nodes.gr$old.node.id)]
     new.edges.dt[, n2 := match(n2, new.nodes.gr$old.node.id)]
 
@@ -2694,17 +2789,14 @@ phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores =
     new.edges.dt = new.edges.dt[(type == "REF" & connection == orientation) |
                                 (type == "ALT" & cn > 0),]
 
-    ## remove edge CN and fix
-    if ("fix" %in% colnames(new.edges.dt)) {
-        new.edges.dt$fix = NULL
-    }
+    ## deduplicate edges
+    new.edges.dt = new.edges.dt[, .(connection = connection[1], type = type[1], cn = sum(cn, na.rm = TRUE), col = col[1]), by = .(n1, n1.side, n2, n2.side)]
 
     if (verbose) {
         message("Creating new gGraph")
     }
     new.nodes.gr = gGnome:::inferLoose(new.nodes.gr, new.edges.dt)
 
-
     postprocessed.gg = gG(nodes = new.nodes.gr, edges = new.edges.dt)
     postprocessed.gg$set(y.field = "cn")
     return(postprocessed.gg)

From 356b4532939ea8cc98f168b6c34091f288cfe140 Mon Sep 17 00:00:00 2001
From: Zi-Ning Choo <zining01@gmail.com>
Date: Thu, 29 Jul 2021 11:15:38 -0400
Subject: [PATCH 26/35] stash changes to apps.R

---
 R/apps.R | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/R/apps.R b/R/apps.R
index 1c249684..e367fda9 100644
--- a/R/apps.R
+++ b/R/apps.R
@@ -2696,6 +2696,9 @@ phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores =
 
     og.node.balance[, phased := ifelse(cn.imbalance == TRUE & width > 1, TRUE, FALSE)]
 
+    ## mark these specifically as being allele-balanced
+    og.node.balance[, ab := phased == FALSE]
+
     if (verbose) {
         message("Identifying nodes in NA stretches")
     }
@@ -2739,14 +2742,20 @@ phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores =
     unphased.minor.nodes = gg$nodes$dt[og.node.id %in% unphased.og.nodes & allele == "minor", node.id]
     unphased.major.nodes = gg$nodes$dt[og.node.id %in% unphased.og.nodes & allele == "major", node.id]
 
+    ## identify specifically allele-balanced nodes (vs. NA nodes)
+    ab.og.nodes = og.node.balance[ab == TRUE, og.node.id]
+    ab.major.nodes = gg$nodes$dt[og.node.id %in% ab.og.nodes & allele == "major", node.id]
+
     ## create new data.table for nodes
     new.nodes.dt = gg$nodes$dt[!(node.id %in% unphased.minor.nodes),]
 
     ## mark major nodes as unphased
     new.nodes.dt[node.id %in% unphased.major.nodes, allele := "unphased"]
 
+    ## mark allele-balanced nodes specifically
+    new.nodes.dt[node.id %in% ab.major.nodes, ab := TRUE]
+
     ## reset CN to total CN
-    ## browser()
     new.nodes.dt[node.id %in% unphased.major.nodes, cn := og.node.balance$cn.total[match(og.node.id, og.node.balance$og.node.id)]]
 
     ## fix the CN of all of these nodes
@@ -2761,7 +2770,7 @@ phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores =
     new.nodes.gr = dt2gr(new.nodes.dt[, .(seqnames, start, end,
                                           og.node.id, marginal.cn, allele,
                                           var, nbins, weight, index, col,
-                                          cn.old, cn, fix, ywid,
+                                          cn.old, cn, fix, ywid, ab,
                                           old.node.id = node.id)],
                          seqinfo = seqinfo(gg$nodes$gr),
                          seqlengths = seqlengths(gg$nodes$gr))

From ff49d3e92adf3097970fc17838c393cd43753b57 Mon Sep 17 00:00:00 2001
From: Zi-Ning Choo <zining01@gmail.com>
Date: Wed, 4 Aug 2021 14:32:20 -0400
Subject: [PATCH 27/35] bug fix

---
 R/apps.R         | 29 +++++++++++++++++------------
 R/eventCallers.R |  6 +++++-
 2 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/R/apps.R b/R/apps.R
index 59754720..82fbdd32 100644
--- a/R/apps.R
+++ b/R/apps.R
@@ -935,21 +935,21 @@ balance = function(gg,
                        cid = paste("rhee", sedge.id))],
 
                 ## actual ALT edges
-                vars[type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(s1),
+                vars[(!cnloh == TRUE) & type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(s1),
                      .(value = 1, id, cid = paste("rhee", s1))],
-                vars[type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(s2),
+                vars[(!cnloh == TRUE) & type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(s2),
                      .(value = 1, id, cid = paste("rhee", s2))],
-                vars[type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(s3),
+                vars[(!cnloh == TRUE) & type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(s3),
                      .(value = 1, id, cid = paste("rhee", s3))],
-                vars[type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(s4),
+                vars[(!cnloh == TRUE) & type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(s4),
                      .(value = 1, id, cid = paste("rhee", s4))],
-                vars[type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(c1),
+                vars[(!cnloh == TRUE) & type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(c1),
                      .(value = 1, id, cid = paste("rhee", c1))],
-                vars[type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(c2),
+                vars[(!cnloh == TRUE) & type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(c2),
                      .(value = 1, id, cid = paste("rhee", c2))],
-                vars[type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(c3),
+                vars[(!cnloh == TRUE) & type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(c3),
                      .(value = 1, id, cid = paste("rhee", c3))],
-                vars[type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(c4),
+                vars[(!cnloh == TRUE) & type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(c4),
                      .(value = 1, id, cid = paste("rhee", c4))],
 
                 ## loose indicators
@@ -1422,6 +1422,11 @@ balance = function(gg,
             message("Number of penalized CNLOH edges: ", length(indices))
         }
     }
+
+    ## check constraints of CNLOH
+    ## browser()
+    ## vars[type == "edge.indicator" & cnloh == TRUE]
+    ## vars[type == "edge.indicator" & cnloh == TRUE, .N, by = og.edge.id]
   
 
   lb = vars$lb
@@ -1505,16 +1510,16 @@ balance = function(gg,
       message("formatting phased graph...")
     }
     ## edge formatting
-    ref.edge.col = alpha("blue", 0.5)
-    alt.edge.col = alpha("red", 0.5)
-    ref.edge.lwd = 1.0
+    ref.edge.col = alpha("blue", 0.2)
+    alt.edge.col = alpha("red", 0.4)
+    ref.edge.lwd = 0.5
     alt.edge.lwd = 1.0
     edge.col = ifelse(gg$edges$dt$type == "REF", ref.edge.col, alt.edge.col)
     edge.lwd = ifelse(gg$edges$dt$type == "REF", ref.edge.lwd, alt.edge.lwd)
     gg$edges$mark(col = edge.col, lwd = edge.lwd)
 
     ## mark zero cn edges
-    zero.cn.col = alpha("gray", 0.1)
+    zero.cn.col = alpha("gray", 0)
     zero.cn.lwd = 0.5
     zero.cn.edges = which(gg$edges$dt$cn == 0)
     gg$edges[zero.cn.edges]$mark(col = zero.cn.col, lwd = zero.cn.lwd)
diff --git a/R/eventCallers.R b/R/eventCallers.R
index 116be297..06c9fda4 100755
--- a/R/eventCallers.R
+++ b/R/eventCallers.R
@@ -2513,7 +2513,7 @@ amp = function(gg, jcn.thresh = 8, cn.thresh = 2, fbi.cn.thresh = 0.5,  n.jun.hi
     ploidy = gg$nodes$dt[!is.na(cn), sum(cn*as.numeric(width))/sum(as.numeric(width))]
     keep = (gg$nodes$dt$cn/ploidy) > cn.thresh
     gg$clusters(keep)
-    if (!any(!is.na(gg$nodes$dt$cluster)))
+
         return(gg)
 
   tiny = gg$edges$mark(tiny = gg$edges$dt$class %in% c('DEL-like', 'DUP-like') & gg$edges$span <1e4)
@@ -2535,6 +2535,10 @@ amp = function(gg, jcn.thresh = 8, cn.thresh = 2, fbi.cn.thresh = 0.5,  n.jun.hi
                n.jun = length(cl.edges),
                n.jun.high = sum(cl.edges$dt[, sum(cn > 3)]), 
                max.jcn = max(c(0, cl.edges$dt$cn)),
+               max.loose.cn = max(c(0,
+                                    cl.nodes$dt[, loose.cn.left],
+                                    cl.nodes$dt[, loose.cn.right]),
+                                  na.rm = TRUE),
                max.cn = max(cl.nodes$dt$cn),
                footprint = paste(gr.string(cl.nodes$footprint),
                                  collapse = ","))

From e216f0c06140737c070231b0ce0415435fdb846b Mon Sep 17 00:00:00 2001
From: Zi-Ning Choo <zining01@gmail.com>
Date: Thu, 12 Aug 2021 23:40:10 -0400
Subject: [PATCH 28/35] latest with pre-balance collapsed nodes

---
 R/apps.R | 316 ++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 257 insertions(+), 59 deletions(-)

diff --git a/R/apps.R b/R/apps.R
index e209c98f..10b0359e 100644
--- a/R/apps.R
+++ b/R/apps.R
@@ -2438,10 +2438,11 @@ binstats = function(gg, bins, by = NULL, field = NULL, purity = gg$meta$purity,
 #'
 #' @param gg (gGraph) junction-balanced allelic graph
 #' @param min.bins (numeric) default 1
+#' @param min.width (exclude) very short ranges (default 5e3)
 #' @param verbose (logical) default FALSE
 #'
 #' @return GRanges representing NA ranges that phased graph can be disjoined against in phased.postprocess
-find_na_ranges = function(gg, min.bins = 1, verbose = FALSE) {
+find_na_ranges = function(gg, min.bins = 1, min.width = 5e3, verbose = FALSE) {
 
     if (!inherits(gg, "gGraph")) {
         stop("gg is not gGraph")
@@ -2452,10 +2453,11 @@ find_na_ranges = function(gg, min.bins = 1, verbose = FALSE) {
     }
     
     gg.nodes.gr = gg$nodes$gr[, c("nbins")]
-    gg.nodes.gr$na.node = is.na(gg.nodes.gr$nbins) | gg.nodes.gr$nbins < min.bins
+    gg.nodes.gr$na.node = (is.na(gg.nodes.gr$nbins) | gg.nodes.gr$nbins < min.bins)
 
     ## reduce NA nodes
     na.nodes.gr = gg.nodes.gr %Q% (na.node == TRUE)
+    ## browser()
 
     ## if there are magically not any of these, return
     if (length(na.nodes.gr) == 0) {
@@ -2464,6 +2466,15 @@ find_na_ranges = function(gg, min.bins = 1, verbose = FALSE) {
     
     na.nodes.gr = gr.reduce(na.nodes.gr, by = "na.node")
 
+    ## na.nodes.gr = gr.val(na.nodes.gr, target = gg$nodes$gr[, "nbins"],
+    ##                      val = "nbins",
+    ##                      weighted = FALSE,
+    ##                      FUN = sum,
+    ##                      na.rm = TRUE)
+
+    ## only keep relatively large NA regions... for isolated small ROH we may not want to do this.
+    na.nodes.gr = na.nodes.gr %Q% (width(na.nodes.gr) > min.width)
+
     ## grab left and right endpoints (use OG node id)
     na.nodes.gr$left.node.id = gr.findoverlaps(gr.start(na.nodes.gr), gg$nodes$gr[, "og.node.id"], first = TRUE, scol = "og.node.id")$og.node.id
     na.nodes.gr$right.node.id = gr.findoverlaps(gr.end(na.nodes.gr), gg$nodes$gr[, "og.node.id"], first = TRUE, scol = "og.node.id")$og.node.id
@@ -2494,36 +2505,36 @@ find_na_ranges = function(gg, min.bins = 1, verbose = FALSE) {
     na.nodes.gr$alt.left.neighbor = na.nodes.gr$left.neighbor %in% og.alt.dt[alt.right == TRUE, og.node.id]
     na.nodes.gr$alt.right.neighbor = na.nodes.gr$right.neighbor %in% og.alt.dt[alt.left == TRUE, og.node.id]
 
-    ## get marginal copy number of left and right neighbor
-    marginal.cn.dt = gg$nodes$dt[allele == "major", .(og.node.id, marginal.cn, major.cn = cn)]
-    na.nodes.gr$left.neighbor.marginal.cn = marginal.cn.dt$marginal.cn[match(na.nodes.gr$left.neighbor, marginal.cn.dt$og.node.id)]
-    na.nodes.gr$right.neighbor.marginal.cn = marginal.cn.dt$marginal.cn[match(na.nodes.gr$right.neighbor, marginal.cn.dt$og.node.id)]
-    na.nodes.gr$left.neighbor.major.cn = marginal.cn.dt$major.cn[match(na.nodes.gr$left.neighbor, marginal.cn.dt$og.node.id)]
-    na.nodes.gr$right.neighbor.major.cn = marginal.cn.dt$major.cn[match(na.nodes.gr$right.neighbor, marginal.cn.dt$og.node.id)]
+    ## ## get marginal copy number of left and right neighbor
+    ## marginal.cn.dt = gg$nodes$dt[allele == "major", .(og.node.id, marginal.cn, major.cn = cn)]
+    ## na.nodes.gr$left.neighbor.marginal.cn = marginal.cn.dt$marginal.cn[match(na.nodes.gr$left.neighbor, marginal.cn.dt$og.node.id)]
+    ## na.nodes.gr$right.neighbor.marginal.cn = marginal.cn.dt$marginal.cn[match(na.nodes.gr$right.neighbor, marginal.cn.dt$og.node.id)]
+    ## na.nodes.gr$left.neighbor.major.cn = marginal.cn.dt$major.cn[match(na.nodes.gr$left.neighbor, marginal.cn.dt$og.node.id)]
+    ## na.nodes.gr$right.neighbor.major.cn = marginal.cn.dt$major.cn[match(na.nodes.gr$right.neighbor, marginal.cn.dt$og.node.id)]
 
-    ## get major and marginal copy number of left and right endpoints
-    na.nodes.gr$left.marginal.cn = marginal.cn.dt$marginal.cn[match(na.nodes.gr$left.node.id, marginal.cn.dt$og.node.id)]
-    na.nodes.gr$right.marginal.cn = marginal.cn.dt$marginal.cn[match(na.nodes.gr$right.node.id, marginal.cn.dt$og.node.id)]
-    na.nodes.gr$left.major.cn = marginal.cn.dt$major.cn[match(na.nodes.gr$left.node.id, marginal.cn.dt$og.node.id)]
-    na.nodes.gr$right.major.cn = marginal.cn.dt$major.cn[match(na.nodes.gr$right.node.id, marginal.cn.dt$og.node.id)]
+    ## ## get major and marginal copy number of left and right endpoints
+    ## na.nodes.gr$left.marginal.cn = marginal.cn.dt$marginal.cn[match(na.nodes.gr$left.node.id, marginal.cn.dt$og.node.id)]
+    ## na.nodes.gr$right.marginal.cn = marginal.cn.dt$marginal.cn[match(na.nodes.gr$right.node.id, marginal.cn.dt$og.node.id)]
+    ## na.nodes.gr$left.major.cn = marginal.cn.dt$major.cn[match(na.nodes.gr$left.node.id, marginal.cn.dt$og.node.id)]
+    ## na.nodes.gr$right.major.cn = marginal.cn.dt$major.cn[match(na.nodes.gr$right.node.id, marginal.cn.dt$og.node.id)]
 
-    ## easier manipulation
+    ## ## easier manipulation
     na.nodes.dt = as.data.table(na.nodes.gr)
 
-    ## filter by marginal CN (e.g. if left and right neighbors have to have the same marginal)
-    #' zchoo Friday, Jul 30, 2021 11:28:55 AM
-    ## removed this filter
-    ## na.nodes.dt = na.nodes.dt[(left.neighbor.marginal.cn == right.neighbor.marginal.cn) | is.na(left.marginal.cn) | is.na(right.marginal.cn),]
+    ## ## filter by marginal CN (e.g. if left and right neighbors have to have the same marginal)
+    ## #' zchoo Friday, Jul 30, 2021 11:28:55 AM
+    ## ## removed this filter
+    ## ## na.nodes.dt = na.nodes.dt[(left.neighbor.marginal.cn == right.neighbor.marginal.cn) | is.na(left.marginal.cn) | is.na(right.marginal.cn),]
 
-    if (!nrow(na.nodes.dt)) {
-        return(GRanges())
-    }
+    ## if (!nrow(na.nodes.dt)) {
+    ##     return(GRanges())
+    ## }
 
-    ## remove LOH ranges
-    na.nodes.dt = na.nodes.dt[(left.neighbor.marginal.cn != left.neighbor.major.cn) |
-                              (left.marginal.cn != left.major.cn) |
-                              (right.neighbor.marginal.cn != right.neighbor.major.cn) |
-                              (right.marginal.cn != right.major.cn),]
+    ## ## remove LOH ranges
+    ## na.nodes.dt = na.nodes.dt[(left.neighbor.marginal.cn != left.neighbor.major.cn) |
+    ##                           (left.marginal.cn != left.major.cn) |
+    ##                           (right.neighbor.marginal.cn != right.neighbor.major.cn) |
+    ##                           (right.marginal.cn != right.major.cn),]
     
     if (!nrow(na.nodes.dt)) {
         return(GRanges())
@@ -2535,14 +2546,144 @@ find_na_ranges = function(gg, min.bins = 1, verbose = FALSE) {
     na.nodes.dt[, right.telomeric := (end == sl)]
 
     ## resize
-    na.nodes.dt[right.telomeric == FALSE & (alt.right.neighbor == TRUE | loose.right.neighbor == TRUE), end := end + 1]
-    na.nodes.dt[left.telomeric == FALSE & (alt.left.neighbor == TRUE | loose.left.neighbor == TRUE), start := start - 1]
+    na.nodes.dt[right.telomeric == FALSE & (alt.right.neighbor == TRUE | loose.right.neighbor == TRUE),
+                end := end + 1]
+    na.nodes.dt[left.telomeric == FALSE & (alt.left.neighbor == TRUE | loose.left.neighbor == TRUE),
+                start := start - 1]
 
     ## filter by major CN (e.g. if left and right neighbors have LOH)
     return(dt2gr(na.nodes.dt[, .(seqnames, start, end)], seqlengths = seqlengths(gg$nodes$gr)))
 }
 
+#' @name unphase_na_ranges
+#' @title unphase_na_ranges
+#'
+#' @description
+#'
+#' collapse NA ranges before balance
+#' 
+#' @param gg phased gGraph. each node must have metadata og.nodes.id, allele, nbins, cn, weight
+#' @param min.bins (numeric) minimum number of bins to be marked as an NA node. default 1
+#' @param min.wdith (numeric)
+#' @param phase.blocks (GRanges) ## NOT IMPLEMENTED YET
+#' @param verbose (logical) verbose > 0 prints stuff. default 1.
+#'
+#' @return partially phased gGraph with NA ranges collapsed, and CN fixed to marginal.
+unphase_na_ranges = function(gg, min.bins = 1, min.width = 5e3, phase.blocks = NULL, verbose = 1)
+{
+    ## make a copy of balanced graph to prevent mutation
+    if (verbose) {
+        message("Making a copy of input gGraph")
+    }
+    gg = gg$copy
+
+    if (verbose) {
+        message("Disjoining input graph against unphased node GRanges")
+    }
+    
+    ## check whether there is an allelic CN change on either the left or right
+    ## merge with NA ranges
+    if (verbose) {
+        message("Identifying NA ranges")
+        }
+    na.gr = find_na_ranges(gg, min.bins = min.bins, min.width = min.width)
+
+    ## disjoin gGraph against this GRanges
+    gg = gg$disjoin(na.gr, collapse = FALSE)
+
+    ## any new edges introduced have to be straight
+    gg$edges[is.na(connection)]$mark(connection = "straight")
+
+    ## fill in other metadata
+    n1 = gg$edges$dt[, n1]
+    n2 = gg$edges$dt[, n2]
+
+    ## label n1/n2 allele and chromosome
+    gg$edges$mark(n1.allele = gg$nodes$dt$allele[match(n1, gg$nodes$dt$node.id)])
+    gg$edges$mark(n2.allele = gg$nodes$dt$allele[match(n2, gg$nodes$dt$node.id)])
+    gg$edges$mark(n1.chr = gg$nodes$dt$seqnames[match(n1, gg$nodes$dt$node.id)])
+    gg$edges$mark(n2.chr = gg$nodes$dt$seqnames[match(n2, gg$nodes$dt$node.id)])
 
+    ## reset og.node.ids and og.edge.ids
+    node.id.key = gg$nodes$dt[, .(seqnames, start, end, rg = paste0(seqnames, ":", start, "-", end), node.id)]
+    node.id.key[, rg := as.integer(factor(rg))]
+    gg$nodes$mark(og.node.id = node.id.key[, rg])
+
+    ## reset og edge ids
+    edge.id.key = gg$edges$dt[, .(n1, n1.side, n2, n2.side, type)]
+    edge.id.key[, ":="(n1.og = node.id.key$rg[match(n1, node.id.key$node.id)],
+                       n2.og = node.id.key$rg[match(n2, node.id.key$node.id)])]
+    edge.id.key[, rg := paste(n1.og, n2.og, n1.side, n2.side, type)]
+    edge.id.key[, rg := as.integer(factor(rg))]
+    gg$edges$mark(og.edge.id = edge.id.key[, rg])
+
+    if (verbose) {
+        message("Identifying nodes in NA stretches")
+    }
+    ov = gg$nodes$gr[, c("og.node.id", "node.id", "allele")] %&% na.gr
+    ov.dt = as.data.table(ov)[, .(node.id, allele, og.node.id)]
+
+    ## identify corresponding minor/major nodes
+    unphased.major.nodes = ov.dt[allele == "major", node.id]
+    unphased.minor.nodes = ov.dt[allele == "minor", node.id]
+
+    ## create new data.table for nodes
+    new.nodes.dt = gg$nodes$dt[!(node.id %in% unphased.minor.nodes),]
+
+    ## mark major nodes as unphased
+    new.nodes.dt[node.id %in% unphased.major.nodes, allele := "unphased"]
+
+    ## reset CN to total CN
+    new.nodes.dt[node.id %in% unphased.major.nodes, cn := marginal.cn]
+    new.nodes.dt[node.id %in% unphased.major.nodes, fix := 1]
+
+    ## reformat color
+    new.nodes.dt[allele == "unphased", col := alpha("gray", 0.5)]
+
+    ## get nodes as GRanges
+    new.nodes.dt = new.nodes.dt %>% split(new.nodes.dt$seqnames) %>%
+        lapply(function(dt) {dt[order(start),]}) %>% rbindlist
+    new.nodes.gr = dt2gr(new.nodes.dt[, .(seqnames, start, end,
+                                          og.node.id, marginal.cn, allele,
+                                          var, nbins, weight, col,
+                                          cn, fix, old.node.id = node.id)],
+                         seqinfo = seqinfo(gg$nodes$gr),
+                         seqlengths = seqlengths(gg$nodes$gr))
+
+    ## create new data.table for edges
+    new.edges.dt = gg$edges$dt
+    
+    ## reset edge endpoints
+    ## browser()
+    dt = gg$nodes$dt[, .(og.node.id, allele, node.id)] %>% dcast.data.table(og.node.id ~ allele, value.var = "node.id")
+    new.edges.dt[(n1 %in% unphased.minor.nodes), n1 := dt$major[match(n1, dt$minor)]]
+    new.edges.dt[(n2 %in% unphased.minor.nodes), n2 := dt$major[match(n2, dt$minor)]]
+    ## check if any NA?
+
+    ## reset all edge endpoints to new node.ids
+    ## browser()
+    new.edges.dt[, n1 := match(n1, new.nodes.gr$old.node.id)]
+    new.edges.dt[, n2 := match(n2, new.nodes.gr$old.node.id)]
+
+    ## deduplicate edges
+    ## new.edges.dt = unique(new.edges.dt[!is.na(n1) & !is.na(n2)], by = c("n1", "n1.side", "n2", "n2.side"))
+    new.edges.dt = rbind(new.edges.dt[cnloh == TRUE & class == "REF",],
+                         unique(new.edges.dt[is.na(cnloh) | cnloh == FALSE | class != "REF",],
+                                by = c("n1", "n1.side", "n2", "n2.side")))
+    
+    ## new.edges.dt[, tmp := paste(n1, n1.side, n2, n2.side)]
+    ## new.edges.dt[cnloh == TRUE,]
+    ## new.edges.dt[tmp == "169 right 170 left",]
+    ## new.edges.dt[!duplicated(tmp) & cnloh == TRUE,]
+    ## new.edges.dt$tmp = NULL
+
+    if (verbose) {
+        message("Creating new gGraph")
+    }
+
+    postprocessed.gg = gG(nodes = new.nodes.gr, edges = new.edges.dt)
+    return(postprocessed.gg)
+}
 
 #' @name phased.postprocess
 #' @title phased.postprocess
@@ -2641,6 +2782,7 @@ phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores =
             ## (major.right.cn == TRUE | minor.right.cn == TRUE) & (right.telomeric == FALSE)]
 
     ## shift the end points
+    ## browser()
     seed.dt[left.alt == TRUE & width > 1, start := start + 1]
     seed.dt[right.alt == TRUE & width > 1, end := end - 1]
 
@@ -2651,11 +2793,17 @@ phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores =
     if (verbose) {
         message("Identifying NA ranges")
         }
-    na.gr = find_na_ranges(gg, min.bins = min.bins)
-    all.seed.gr = gr.reduce(c(seed.gr, na.gr))
+
+    if (min.bins) {
+        na.gr = find_na_ranges(gg, min.bins = min.bins)
+        all.seed.gr = gr.reduce(c(seed.gr, na.gr))
+    } else {
+        na.gr = GRanges()
+        all.seed.gr = seed.gr
+    }
 
     ## disjoin gGraph against this GRanges
-    gg = gg$disjoin(all.seed.gr, collapse = FALSE)
+    gg = gg$copy$disjoin(all.seed.gr, collapse = FALSE)
 
     ## any new edges introduced have to be straight
     gg$edges[is.na(connection)]$mark(connection = "straight")
@@ -2696,15 +2844,18 @@ phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores =
         message("Identifying nodes without CN imbalance")
     }
     ## browser()
-    og.node.balance = dcast.data.table(gg$nodes$dt[, .(og.node.id, allele, cn)], og.node.id ~ allele, value.var = "cn") %>% merge.data.table(gg$nodes$dt[, .(og.node.id, width)], by = "og.node.id", all.x = TRUE)
+    og.node.balance = dcast.data.table(gg$nodes$dt[allele %in% c("major", "minor"),
+                                                   .(og.node.id, allele, cn)],
+                                       og.node.id ~ allele, value.var = "cn") %>%
+        merge.data.table(gg$nodes$dt[, .(og.node.id, width)], by = "og.node.id", all.x = TRUE)
 
     og.node.balance[, cn.imbalance := (major != minor)]
     og.node.balance[, cn.total := (major + minor)]
 
-    og.node.balance[, phased := ifelse(cn.imbalance == TRUE & width > 1, TRUE, FALSE)]
+    og.node.balance[, phased := ifelse(cn.imbalance == TRUE | width == 1, TRUE, FALSE)]
 
     ## mark these specifically as being allele-balanced
-    og.node.balance[, ab := phased == FALSE]
+    og.node.balance[, ab := (phased == FALSE)]
 
     if (verbose) {
         message("Identifying nodes in NA stretches")
@@ -2742,7 +2893,7 @@ phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores =
         }
     }
 
-    ## identify unphased og nodes
+    ## identify unphased og nodes that are NEW (ignore previously marked as unphased)
     unphased.og.nodes = og.node.balance[phased == FALSE, og.node.id]
 
     ## identify corresponding minor/major nodes
@@ -2763,10 +2914,8 @@ phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores =
     new.nodes.dt[node.id %in% ab.major.nodes, ab := TRUE]
 
     ## reset CN to total CN
-    new.nodes.dt[node.id %in% unphased.major.nodes, cn := og.node.balance$cn.total[match(og.node.id, og.node.balance$og.node.id)]]
-
-    ## fix the CN of all of these nodes
-    ## new.nodes.dt[, fix := 1]
+    new.nodes.dt[node.id %in% unphased.major.nodes,
+                 cn := og.node.balance$cn.total[match(og.node.id, og.node.balance$og.node.id)]]
 
     ## reformat nodes
     new.nodes.dt[allele == "unphased", col := alpha("gray", 0.5)]
@@ -2774,20 +2923,20 @@ phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores =
     ## get nodes as GRanges
     new.nodes.dt = new.nodes.dt %>% split(new.nodes.dt$seqnames) %>%
         lapply(function(dt) {dt[order(start),]}) %>% rbindlist
-    new.nodes.gr = dt2gr(new.nodes.dt[, .(seqnames, start, end,
-                                          og.node.id, marginal.cn, allele,
-                                          var, nbins, weight, index, col,
-                                          cn.old, cn, fix, ywid, ab,
-                                          old.node.id = node.id)],
-                         seqinfo = seqinfo(gg$nodes$gr),
-                         seqlengths = seqlengths(gg$nodes$gr))
+    sel.cols = intersect(colnames(new.nodes.dt),
+                         c("seqnames", "start", "end", "og.node.id", "marginal.cn", "allele",
+                           "var", "nbins", "index", "col", "cn", "cn.old", "fix", "ywid", "ab",
+                           "node.id"))
+    new.nodes.dt = new.nodes.dt[, ..sel.cols] %>% setnames("node.id", "old.node.id")
+    new.nodes.gr = dt2gr(new.nodes.dt, seqinfo = seqinfo(gg$nodes$gr), seqlengths = seqlengths(gg$nodes$gr))
 
     ## create new data.table for edges
     new.edges.dt = gg$edges$dt
-    
+
     ## reset edge endpoints
-    ## browser()
-    dt = gg$nodes$dt[, .(og.node.id, allele, node.id)] %>% dcast.data.table(og.node.id ~ allele, value.var = "node.id")
+    dt = gg$nodes$dt[allele %in% c("major", "minor"), .(og.node.id, allele, node.id)] %>%
+        dcast.data.table(og.node.id ~ allele, value.var = "node.id")
+    
     new.edges.dt[(n1 %in% unphased.minor.nodes), n1 := dt$major[match(n1, dt$minor)]]
     new.edges.dt[(n2 %in% unphased.minor.nodes), n2 := dt$major[match(n2, dt$minor)]]
 
@@ -2796,17 +2945,32 @@ phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores =
     new.edges.dt[, n2 := match(n2, new.nodes.gr$old.node.id)]
 
     ## label REF edges as straight or cross based on og.node.id
-    ## browser()
-    ## new.edges.dt[type == "REF" & cn > 0, length(unique(connection)), by = og.edge.id] %>% summary
-    new.edges.dt[type == "REF", orientation := .SD$connection[which(.SD$cn > 0)][1], by = og.edge.id]
+    new.edges.dt[type == "REF" &
+                 n1.allele %in% c("major", "minor") &
+                 n2.allele %in% c("major", "minor"),
+                 orientation := .SD$connection[which(.SD$cn > 0)][1], by = og.edge.id]
+
+    new.edges.dt[type == "REF" &
+                 n1.allele %in% c("major", "minor") &
+                 n2.allele %in% c("major", "minor") &
+                 is.na(orientation),
+                 orientation := "straight"]
 
     ## only keep REF edges in the correct orientation (regardless of CN)
-    ## only keep ALT edges with CN > 0
-    new.edges.dt = new.edges.dt[(type == "REF" & connection == orientation) |
-                                (type == "ALT" & cn > 0),]
+    new.edges.dt = new.edges.dt[(type == "REF" &
+                                 (connection == orientation | is.na(orientation))) |
+                                 (type == "ALT" & cn > 0),]
 
     ## deduplicate edges
-    new.edges.dt = new.edges.dt[, .(connection = connection[1], type = type[1], cn = sum(cn, na.rm = TRUE), col = col[1]), by = .(n1, n1.side, n2, n2.side)]
+    new.edges.dt = rbind(new.edges.dt[cnloh == TRUE & class == "REF",],
+                         new.edges.dt[is.na(cnloh) | cnloh == FALSE | class != "REF",
+                                      .(connection = connection[1], type = type[1],
+                                        cn = sum(cn, na.rm = TRUE),
+                                        col = col[1]),
+                                      by = c("n1", "n1.side", "n2", "n2.side")],
+                         fill = TRUE)
+                         
+    ## new.edges.dt = new.edges.dt[, .(connection = connection[1], type = type[1], cn = sum(cn, na.rm = TRUE), col = col[1]), by = .(n1, n1.side, n2, n2.side)]
 
     if (verbose) {
         message("Creating new gGraph")
@@ -2819,6 +2983,8 @@ phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores =
 }
 
 
+
+
 #' @name phased.binstats
 #' @title phased.binstats
 #' @description
@@ -2856,7 +3022,9 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL,
                            vbase.count.thres = 5, vbase.prop.thres = 0.9,
                            min.bins = 3, min.var = 1e-3,
                            max.span = 1e6,
-                           verbose = TRUE, min.width = 1e6, mc.cores = 8)
+                           min.width = 5e3,
+                           fix.tiny = TRUE,
+                           verbose = TRUE, mc.cores = 8)
 {
     if (verbose) {
         message("Checking inputs")
@@ -2962,6 +3130,8 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL,
         gg$edges$mark(cnloh = FALSE)
         gg$edges[internal.edges]$mark(cnloh = TRUE)
 
+        browser()
+
         if (verbose) {
             message("Number of internal edges marked in parent graph: ", length(internal.edges))
         }
@@ -2983,8 +3153,8 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL,
         pseudo.cnloh.junctions.dt[, span := pseudo.cnloh.junctions$span]
 
         ## compute node overlap with shadow
-        node.overlap = pseudo.cnloh.junctions$shadow %N% gg$nodes$gr
-        pseudo.cnloh.junctions.dt[, node.overlap.count := node.overlap]
+        ## node.overlap = pseudo.cnloh.junctions$shadow %N% gg$nodes$gr
+        ## pseudo.cnloh.junctions.dt[, node.overlap.count := node.overlap]
 
         ## mark candidates
         ## pseudo.cnloh.edges = c(pseudo.cnloh.junctions.dt[span < max.span &
@@ -3262,6 +3432,34 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL,
     }
 
     phased.gg = gG(nodes = phased.gg.nodes, edges = phased.gg.edges)
+    
+    if (fix.tiny) {
+
+        if (verbose) {
+            message("Fixing very tiny nodes")
+        }
+
+        ## browser()
+        ## tiny.nodes = phased.gg$nodes$dt[width < min.width, node.id]
+        ## tiny.edges = phased.gg$edges$dt[(n1 %in% tiny.nodes | n2 %in% tiny.nodes) &
+        ##                                 type == "REF" &
+        ##                                 connection == "cross", edge.id]
+
+        ## phased.gg$edges[tiny.edges]$mark(ub = 0, lb = 0)
+
+        tmp = phased.gg$junctions$dt[, .(edge.id, type, class, connection)]
+        tmp[, span := phased.gg$junctions$span]
+        
+        more.tiny.edges = tmp[class != "REF" & connection == "cross" & span < min.width, edge.id]
+
+        phased.gg$edges[more.tiny.edges]$mark(ub = 0, lb = 0)
+
+        if (verbose) {
+            ## message("Number of tiny nodes: ", length(tiny.nodes))
+            message("Number of tiny edges: ", length(more.tiny.edges))
+        }
+    }
+
 
     if (verbose) {
         message("Formatting gGraph")

From 1ed8c749e411ef173d3fa74c8f76f7306f636bb8 Mon Sep 17 00:00:00 2001
From: Zi-Ning Choo <zining01@gmail.com>
Date: Wed, 1 Sep 2021 08:54:37 -0400
Subject: [PATCH 29/35] stash new balance

---
 R/balance.R | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)
 create mode 100644 R/balance.R

diff --git a/R/balance.R b/R/balance.R
new file mode 100644
index 00000000..fc132db2
--- /dev/null
+++ b/R/balance.R
@@ -0,0 +1,63 @@
+#' @name balance2
+#' @title balance2
+#'
+#' @description
+#'
+#' beta reimplementation of balance
+#'
+#' @param gg (gGraph) with node fields
+#' - $cn (numeric) CN guess
+#' - $weight (numeric)
+#' @param lambda
+#' @param marginal
+#' @param phased (logical) default FALSE
+#' @param ism (logical) default TRUE
+#' @param M (numeric) default 1000
+#' @param epgap (numeric) default 0.1
+#' @param trelim (numeric) max size of uncompressed tree in GB
+#' @param nodefileind (numeric) one of 0 (no node file) 1 (in memory compressed) 2 (on disk uncompressed) 3 (on disk compressed) default 1
+#' @return junction-balanced gGraph
+balance2 = function(gg = NULL,
+                    lambda = 100,
+                    marginal = NULL,
+                    phased = FALSE,
+                    ism = TRUE,
+                    M = 1000,
+                    epgap = 0.1,
+                    trelim = 16,
+                    nodefileind = 1) {
+    
+}
+
+#' @name check_balance_inputs
+#' @title check_balance_inputs
+#'
+#' @description
+#'
+#' Validate input gGraph for balance
+#'
+#' @param gg (gGraph)
+check_balance_inputs = function(gg) {
+}
+
+#' @name create_node_variables
+#' @title create_node_variables
+#'
+#' @description
+#' create node CN, node residual, and helper variables for nodes
+#' 
+#' @param gg
+#' @return data.table
+create_node_variables = function(gg) {
+}
+
+#' @name create_edge_variables
+#' @title create_edge_variables
+#'
+#' @description
+#' edge CN, edge residual, and helper variables for edges
+#'
+#' @param gg
+#' @return data.table
+create_edge_variables = function(gg) {
+}

From d3ba43ba28e6f93d50449bf8d53c07267d1409fe Mon Sep 17 00:00:00 2001
From: Zi-Ning Choo <zining01@gmail.com>
Date: Wed, 1 Sep 2021 10:29:26 -0400
Subject: [PATCH 30/35] initial commit for phasing with haplotypes

---
 R/apps.R | 208 +++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 141 insertions(+), 67 deletions(-)

diff --git a/R/apps.R b/R/apps.R
index 60cdf063..930197f8 100644
--- a/R/apps.R
+++ b/R/apps.R
@@ -377,32 +377,6 @@ balance = function(gg,
         ## need delta plus and delta minus for nodes and edges
         delta.node = gg$dt[tight == FALSE, .(gid = index, cn, weight, vtype = 'C')]
         delta.edge = gg$sedgesdt[, .(gid = sedge.id, cn, weight, vtype = 'C')]
-
-        ## make sure deltas obey preset upper and lower bounds for edge id
-        ## if ("lb" %in% colnames(gg$dt)) {
-        ##     delta.node[, lb := gg$dt$lb[match(gid, gg$dt$index)]]
-        ## } else {
-        ##     delta.node[, lb := 0]
-        ## }
-
-        ## if ("ub" %in% colnames(gg$dt)) {
-        ##     delta.node[, ub := gg$dt$ub[match(gid, gg$dt$index)]]
-        ## } else {
-        ##     delta.node[, ub := M]
-        ## }
-
-        ## ## make sure deltas obey preset upper and lower bounds for edge id
-        ## if ("lb" %in% colnames(gg$sedgesdt)) {
-        ##     delta.edge[, lb := gg$sedgesdt$lb[match(gid, gg$sedgesdt$sedge.id)]]
-        ## } else {
-        ##     delta.edge[, lb := 0]
-        ## }
-
-        ## if ("ub" %in% colnames(gg$sedgesdt)) {
-        ##     delta.edge[, ub := gg$sedgesdt$ub[match(gid, gg$sedgesdt$sedge.id)]]
-        ## } else {
-        ##     delta.edge[, ub := M]
-        ## }
         
         deltas = rbind(
             delta.node[, .(gid, weight, vtype, type = "ndelta.plus")],
@@ -411,11 +385,6 @@ balance = function(gg,
             delta.edge[, .(gid, weight, vtype, type = "edelta.minus")]
         )
 
-        ## deltas[lb < 0, lb := 0]
-        ## deltas[ub > M, ub := M]
-        ## deltas[is.na(lb), lb := 0]
-        ## deltas[is.na(ub), ub := M]
-
         vars = rbind(
             vars,
             deltas,
@@ -448,14 +417,42 @@ balance = function(gg,
         vars[, ":="(allele = gg$dt$allele[node.match],
                     og.node.id = gg$dt$og.node.id[node.match])]
 
+        
+        
 
         ## add ref/alt information and og.edge.id
         edge.match = match(vars[, sedge.id], gg$sedgesdt$sedge.id)
-        vars[, ":="(ref.or.alt = gg$sedgesdt$type[edge.match], ## need type info but rename column...
-                    og.edge.id = gg$sedgesdt$og.edge.id[edge.match])]
+        vars[, ":="(ref.or.alt = gg$sedgesdt$type[edge.match],
+                    connection = gg$sedgesdt$connection[edge.match],
+                    og.edge.id = gg$sedgesdt$og.edge.id[edge.match],
+                    n1 = gg$dt$snode.id[gg$sedgesdt$from[edge.match]],
+                    n2 = gg$dt$snode.id[gg$sedgesdt$to[edge.match]])]
+
+        vars[, n1.side := ifelse(n1 > 0, "right", "left")]
+        vars[, n2.side := ifelse(n2 > 0, "left", "right")]
+        vars[, n1 := abs(n1)]
+        vars[, n2 := abs(n2)]
 
         edge.indicator.vars = vars[type == "edge"][, type := "edge.indicator"][, vtype := "B"][, gid := sedge.id]
         vars = rbind(vars, edge.indicator.vars, fill = TRUE)
+
+        ## add node haplotype indicators
+        ## these are binary indicators that determine whether a node belongs to H1
+        ## only constrain positive-stranded nodes due to skew symmetry
+        haplotype.indicators = gg$dt[allele == "major" | allele == "minor" & snode.id > 0,
+                                     .(cn, snode.id, lb, ub, weight, og.node.id,
+                                       allele, gid = index, type = 'haplotype', vtype = 'B')]
+        vars = rbind(vars, haplotype.indicators, fill = TRUE)
+
+        ## add H1 and H2 'AND' indicators which should have n1/n1.side/n2/n2.side metadata
+        h1.and.indicators = vars[sedge.id > 0 & type == "edge" & (connection == "straight" | connection == "cross"),][, vtype := "B"][, type := "h1.and.indicator"][, gid := sedge.id]
+        h2.and.indicators = vars[sedge.id > 0 & type == "edge" & (connection == "straight" | connection == "cross"),][, vtype := "B"][, type := "h2.and.indicator"][, gid := sedge.id]
+
+        vars = rbind(vars, h1.and.indicators, h2.and.indicators, fill = TRUE)
+
+        browser()
+        ## check h1/h2 indicators have n1/n1.side/n2/n2.side metadata
+                                 
     }
 
     if (ism) {
@@ -515,9 +512,6 @@ balance = function(gg,
             straight.config = unique(vars[type == "edge.indicator" & ref.or.alt == "REF" & sedge.id > 0, ][, type := "straight.config"][, config.id := paste("straight", og.edge.id)], by = "og.edge.id")
             cross.config = unique(vars[type == "edge.indicator" & ref.or.alt == "REF" & sedge.id > 0, ][, type := "cross.config"][, config.id := paste("cross", og.edge.id)], by = "og.edge.id")
 
-            ## add straight/cross to REF edges
-            vars[type == "edge.indicator" & ref.or.alt == "REF",
-                 connection := gg$sedgesdt$connection[match(sedge.id, gg$sedgesdt$sedge.id)]]
             
             ## add config ID's to corresponding edge indicators
             vars[type == "edge.indicator" & ref.or.alt == "REF" & sedge.id > 0,
@@ -718,7 +712,85 @@ balance = function(gg,
   }
 
     if (phased) {
-        
+
+        ## add haplotype indicator constraints
+        ## e.g. the haplotype indicators corresponding to the same og node must add up to 1
+        iconstraints = vars[type == "haplotype" & snode.id > 0,
+                            .(value = 1, id, cid = paste("haplotype.indicator", og.node.id))]
+        rhs = vars[type == "haplotype" & !duplicated(og.node.id),
+                   .(value = 1, sense = "E", cid = paste("haplotype.indicator", og.node.id))]
+
+        constraints = rbind(constraints, iconstraints, fill = TRUE)
+        b = rbind(b, rhs, fill = TRUE)
+
+        ## add H1 AND constraint
+        h1.and.ids = merge.data.table(vars[type == "h1.and.indicator", .(n1, n2, edge.id = id, sedge.id)],
+                                      vars[type == "haplotype", .(n1.snode.id = snode.id, n1.id = id)],
+                                      by.x = "n1",
+                                      by.y = "n1.snode.id") %>%
+            merge.data.table(vars[type == "haplotype", .(n2.snode.id = snode.id, n2.id = id)],
+                             by.x = "n1",
+                             by.y = "n2.snode.id")
+
+        h2.and.ids = merge.data.table(vars[type == "h2.and.indicator", .(n1, n2, edge.id = id, sedge.id)],
+                                      vars[type == "haplotype", .(n1.snode.id = snode.id, n1.id = id)],
+                                      by.x = "n1",
+                                      by.y = "n1.snode.id") %>%
+            merge.data.table(vars[type == "haplotype", .(n2.snode.id = snode.id, n2.id = id)],
+                             by.x = "n1",
+                             by.y = "n2.snode.id")
+
+        ## verify only + sedge id
+        browser()
+
+        ## there are four constraints that are needed to implement this first edge constraint (c1-3)
+        iconstraints = rbind(h1.and.ids[, .(value = 1, id = edge.id, cid = paste("h1.and.c1", sedge.id))],
+                             h1.and.ids[, .(value = -1, id = n1.id, cid = paste("h1.and.c1", sedge.id))],
+                             h1.and.ids[, .(value = 1, id = edge.id, cid = paste("h1.and.c2", sedge.id))],
+                             h1.and.ids[, .(value = -1, id = n2.id, cid = paste("h1.and.c2", sedge.id))],
+                             h1.and.ids[, .(value = 1, id = edge.id, cid = paste("h1.and.c3", sedge.id))],
+                             h1.and.ids[, .(value = -1, id = n1.id, cid = paste("h1.and.c3", sedge.id))],
+                             h1.and.ids[, .(value = -1, id = n2.id, cid = paste("h1.and.c3", sedge.id))])
+
+        rhs = rbind(h1.and.ids[, .(value = 0, sense = "L", cid = paste("h1.and.c1", sedge.id))],
+                    h1.and.ids[, .(value = 0, sense = "L", cid = paste("h1.and.c2", sedge.id))],
+                    h1.and.ids[, .(value = -1, sense = "G", cid = paste("h1.and.c3", sedge.id))])
+
+        constraints = rbind(constraints, iconstraints, fill = TRUE)
+        b = rbind(b, rhs, fill = TRUE)
+
+        iconstraints = rbind(h2.and.ids[, .(value = 1, id = edge.id, cid = paste("h2.and.c1", sedge.id))],
+                             h2.and.ids[, .(value = 1, id = n1.id, cid = paste("h2.and.c1", sedge.id))],
+                             h2.and.ids[, .(value = 1, id = edge.id, cid = paste("h2.and.c2", sedge.id))],
+                             h2.and.ids[, .(value = 1, id = n2.id, cid = paste("h2.and.c2", sedge.id))],
+                             h2.and.ids[, .(value = 1, id = edge.id, cid = paste("h2.and.c3", sedge.id))],
+                             h2.and.ids[, .(value = 1, id = n1.id, cid = paste("h2.and.c3", sedge.id))],
+                             h2.and.ids[, .(value = 1, id = n2.id, cid = paste("h2.and.c3", sedge.id))])
+
+        rhs = rbind(h2.and.ids[, .(value = 1, sense = "L", cid = paste("h2.and.c1", sedge.id))],
+                    h2.and.ids[, .(value = 1, sense = "L", cid = paste("h2.and.c2", sedge.id))],
+                    h2.and.ids[, .(value = 1, sense = "G", cid = paste("h2.and.c3", sedge.id))])
+
+        constraints = rbind(constraints, iconstraints, fill = TRUE)
+        b = rbind(b, rhs, fill = TRUE)
+
+        ## verify that there are no weird NA's and that there is only one set of constraints per sedge.id
+
+        ## connect edge indicators to the haplotype configuration of connected edges
+        iconstraints = rbind(vars[type == "h1.and.indicator",
+                                  .(value = -1, id, cid = paste("haplotype.indicator", sedge.id))],
+                             vars[type == "h2.and.indicator",
+                                  .(value = -1, id, cid = paste("haplotype.indicator", sedge.id))],
+                             vars[type == "edge.indicator" &
+                                  (sedge.id %in% vars[type == "h1.and.indicator",]$sedge.id),
+                                  .(value = 1, id, cid = paste("haplotype.indicator", sedge.id))])
+        rhs = unique(iconstraints[, .(value = 0, sense = "L", cid)], by = "cid")
+
+        constraints = rbind(constraints, iconstraints, fill = TRUE)
+        b = rbind(b, rhs, fill = TRUE)
+
+        ## verify that there are three of these per sedge.id!
+                                  
         ## add constraints that force indicators to be 1 if edge CN > 0
         
         ## add constraints for upper bound (same setup as L0 penalty) - one per edge
@@ -1143,42 +1215,44 @@ balance = function(gg,
         ## b = rbind(b, rhs, fill = TRUE)
 
         ## add ISM constraints for ALL REF edges (as CNLOH is now marked as ALT)
+        #' zchoo Wednesday, Sep 01, 2021 10:27:47 AM
+        ## this set of constraints is no longer necessary as node haplotypes have been added
         
-        iconstraints.from = unique(
-            vars[type == "edge.indicator" & ref.or.alt == "REF", ##& !(og.edge.id %in% cnloh.og.edges),
-                 .(value = 1, id,
-                   edge.id = abs(sedge.id),
-                   snode.id = from, ## this is actually a misleading name because from is the row in gg$dt
-                   cid = paste("ref.configuration.constraint.from", from))],
-            by = "edge.id"
-        )
+        ## iconstraints.from = unique(
+        ##     vars[type == "edge.indicator" & ref.or.alt == "REF", ##& !(og.edge.id %in% cnloh.og.edges),
+        ##          .(value = 1, id,
+        ##            edge.id = abs(sedge.id),
+        ##            snode.id = from, ## this is actually a misleading name because from is the row in gg$dt
+        ##            cid = paste("ref.configuration.constraint.from", from))],
+        ##     by = "edge.id"
+        ## )
 
-        iconstraints.to = unique(
-            vars[type == "edge.indicator" & ref.or.alt == "REF", ##& !(og.edge.id %in% cnloh.og.edges),
-                 .(value = 1, id,
-                   edge.id = abs(sedge.id),
-                   snode.id = to,
-                   cid = paste("ref.configuration.constraint.to", to))],
-            by = "edge.id"
-        )
+        ## iconstraints.to = unique(
+        ##     vars[type == "edge.indicator" & ref.or.alt == "REF", ##& !(og.edge.id %in% cnloh.og.edges),
+        ##          .(value = 1, id,
+        ##            edge.id = abs(sedge.id),
+        ##            snode.id = to,
+        ##            cid = paste("ref.configuration.constraint.to", to))],
+        ##     by = "edge.id"
+        ## )
 
-        iconstraints = rbind(iconstraints.from, iconstraints.to)
+        ## iconstraints = rbind(iconstraints.from, iconstraints.to)
 
-        ## sum to at most 1 if phased, unconstrained if unphased
-        iconstraints[, ":="(allele = gg$dt$allele[iconstraints$snode.id])]
+        ## ## sum to at most 1 if phased, unconstrained if unphased
+        ## iconstraints[, ":="(allele = gg$dt$allele[iconstraints$snode.id])]
         
-        edge.indicator.b = unique(iconstraints[allele %in% c("major", "minor"),
-                                               .(value = 1, sense = "L", cid)],
-                                  by = "cid")
-
-        constraints = rbind(
-            constraints,
-            iconstraints[allele %in% c("major", "minor"),
-                         .(value, id, cid)],
-            fill = TRUE)
+        ## edge.indicator.b = unique(iconstraints[allele %in% c("major", "minor"),
+        ##                                        .(value = 1, sense = "L", cid)],
+        ##                           by = "cid")
+
+        ## constraints = rbind(
+        ##     constraints,
+        ##     iconstraints[allele %in% c("major", "minor"),
+        ##                  .(value, id, cid)],
+        ##     fill = TRUE)
         
-        ## add to b
-        b = rbind(b, edge.indicator.b, fill = TRUE)
+        ## ## add to b
+        ## b = rbind(b, edge.indicator.b, fill = TRUE)
 
     }
 

From 63592c114449005d01b2c526dfed73758b52587d Mon Sep 17 00:00:00 2001
From: Zi-Ning Choo <zining01@gmail.com>
Date: Mon, 13 Sep 2021 14:06:44 -0400
Subject: [PATCH 31/35] stash changes for phasing

---
 R/apps.R | 1220 +++++++++++++++++++++++++++++-------------------------
 1 file changed, 659 insertions(+), 561 deletions(-)

diff --git a/R/apps.R b/R/apps.R
index 930197f8..a41a431d 100644
--- a/R/apps.R
+++ b/R/apps.R
@@ -69,8 +69,9 @@
 #' @param nodefileind (numeric) one of 0 (no node file) 1 (in memory compressed) 2 (on disk uncompressed) 3 (on disk compressed) default 1
 #' @param debug (logical) returns list with names gg and sol. sol contains full RCPLEX solution. (default FALSE)
 #' @param gurobi (logical) use gurobi if TRUE uses gurobi else CPLEX default FALSE
-#' 
-
+#' @param force.haplotypes (logical) default TRUE
+#' @param max.span (numeric) the maximum span of an edge below which both endpoints must be on the same parental haplotype default 1e9
+#'
 #' @return balanced gGraph maximally resembling input gg in CN while minimizing loose end penalty lambda.
 #' @author Marcin Imielinski
 #' 
@@ -85,9 +86,9 @@ balance = function(gg,
                    loose.collapse = FALSE,
                    M = 1e3,
                    phased = FALSE,
-                   ism = FALSE,
-                   force.major = FALSE,
-                   force.alt = TRUE,
+                   ism = TRUE,
+                   force.major = TRUE,
+                   force.alt = FALSE,
                    cnloh = FALSE,
                    lp = TRUE,
                    verbose = 1,
@@ -95,6 +96,7 @@ balance = function(gg,
                    trelim = 32e3,
                    nodefileind = 1,
                    epgap = 1e-3,
+                   max.span = 1e9, ## max span in bp
                    debug = FALSE)
 {
     if (verbose) {
@@ -138,7 +140,7 @@ balance = function(gg,
             stop("cannot run phased balance without $allele field in nodes")
         }
     }
-        
+    
     if (!is.null(marginal)) {
         if (!inherits(marginal, 'GRanges') || is.null(marginal$cn)) {
             stop('marginal must be a GRanges with field $cn')
@@ -176,7 +178,7 @@ balance = function(gg,
             emarginal$set(weight = 1)
         }
     }
-             
+    
     ## default local lambda: default local lambda is 1 for consistency with JaBbA
     if (!('lambda' %in% names(gg$nodes$dt)))
         gg$nodes$mark(lambda = 1)
@@ -199,7 +201,7 @@ balance = function(gg,
     {
         gg$edges$mark(reward = 0)
     }
-  
+    
     ## handle parsing of efix, nfix, nrelax, erelax
     if (!any(deparse(substitute(nfix)) == "NULL")) ## R voodo to allow "with" style evaluation 
         nfix = tryCatch(eval(eval(parse(text = substitute(deparse(substitute(nfix)))), parent.frame()), gg$nodes$dt, parent.frame(2)), error = function(e) NULL)
@@ -265,78 +267,78 @@ balance = function(gg,
     if (is.null(gg$edges$dt$ub))
         gg$edges$mark(ub = Inf)
 
-  if (loose.collapse)
-  {
-    if (verbose)
-      message('Collapsing loose ends')
-
-    uleft = unique(gr.start(gg$nodes$gr))
-    uright = unique(gr.end(gg$nodes$gr))
-    
-    gg$nodes$mark(loose.left.id = paste0(gr.match(gr.start(gg$nodes$gr), uleft), 'l'))
-    gg$nodes$mark(loose.right.id = paste0(gr.match(gr.end(gg$nodes$gr), uright), 'r'))      
-  }
-  else
-  {
-    gg$nodes$mark(loose.left.id = paste0(1:length(gg$nodes), 'l'))
-    gg$nodes$mark(loose.right.id = paste0(1:length(gg$nodes), 'r'))
-  }
+    if (loose.collapse)
+    {
+        if (verbose)
+            message('Collapsing loose ends')
 
-  ########
-  ## VARIABLES
-  ########
-
-  ## create state space, keeping track of graph ids
-  vars = rbind(
-    gg$dt[, .(cn, snode.id, lb, ub, weight, gid = index, type = 'node', vtype = 'I')], ## signed nodes
-    gg$sedgesdt[, .(from, to, lb, ub, sedge.id,  cn, reward, gid = sedge.id, type = 'edge', vtype = 'I')], ## signed edges
-
-    ## for loose ends lid marks all "unique" loose ends (which if loose.collapse = TRUE
-    ## will be defined on the basis of coordinate overlap)
-    gg$dt[tight == FALSE, .(cn = NA, snode.id, lambda, gid = index,
-                            ulid = paste0(index, 'i'),
-                            lid = ifelse(strand == '+', loose.left.id, paste0('-', loose.right.id)),
-                            type = 'loose.in', vtype = 'I')], ## incoming loose ends
-    gg$dt[tight == FALSE, .(cn = NA, snode.id, lambda, gid = index,
-                            ulid = paste0(index, 'o'),
-                            lid = ifelse(strand == '+', loose.right.id, paste0('-', loose.left.id)),
-                            type = 'loose.out', vtype = 'I')], ## outgoing loose ends
-    gg$dt[tight == FALSE, .(gid = index, cn, weight, type = 'nresidual', vtype = 'C')], ## node residual 
-    gg$sedgesdt[, .(gid = sedge.id, cn, weight, type = 'eresidual', vtype = 'C')], ## edge residual 
-    fill = TRUE)
-
-  if (L0)
-  {
-    ## loose ends are labeled with lid and ulid, lid is only relevant if loose.collapse is true
-    ## (i.e. we need indicator.sum and indicator.sum.indicator
-    if (verbose) {
-      message("adding l0 penalty indicator")
+        uleft = unique(gr.start(gg$nodes$gr))
+        uright = unique(gr.end(gg$nodes$gr))
+        
+        gg$nodes$mark(loose.left.id = paste0(gr.match(gr.start(gg$nodes$gr), uleft), 'l'))
+        gg$nodes$mark(loose.right.id = paste0(gr.match(gr.end(gg$nodes$gr), uright), 'r'))      
+    }
+    else
+    {
+        gg$nodes$mark(loose.left.id = paste0(1:length(gg$nodes), 'l'))
+        gg$nodes$mark(loose.right.id = paste0(1:length(gg$nodes), 'r'))
     }
 
-    vars = rbind(vars, 
-                 rbind( 
-                   vars[type == 'loose.in', ][ , type := 'loose.in.indicator'][, vtype := 'B'][, gid := lid],
-                   vars[type == 'loose.out', ][ , type := 'loose.out.indicator'][, vtype := 'B'][, gid := lid]
-                 ))
+########
+    ## VARIABLES
+########
+
+    ## create state space, keeping track of graph ids
+    vars = rbind(
+        gg$dt[, .(cn, snode.id, lb, ub, weight, gid = index, type = 'node', vtype = 'I')], ## signed nodes
+        gg$sedgesdt[, .(from, to, lb, ub, sedge.id,  cn, reward, gid = sedge.id, type = 'edge', vtype = 'I')], ## signed edges
+
+        ## for loose ends lid marks all "unique" loose ends (which if loose.collapse = TRUE
+        ## will be defined on the basis of coordinate overlap)
+        gg$dt[tight == FALSE, .(cn = NA, snode.id, lambda, gid = index,
+                                ulid = paste0(index, 'i'),
+                                lid = ifelse(strand == '+', loose.left.id, paste0('-', loose.right.id)),
+                                type = 'loose.in', vtype = 'I')], ## incoming loose ends
+        gg$dt[tight == FALSE, .(cn = NA, snode.id, lambda, gid = index,
+                                ulid = paste0(index, 'o'),
+                                lid = ifelse(strand == '+', loose.right.id, paste0('-', loose.left.id)),
+                                type = 'loose.out', vtype = 'I')], ## outgoing loose ends
+        gg$dt[tight == FALSE, .(gid = index, cn, weight, type = 'nresidual', vtype = 'C')], ## node residual 
+        gg$sedgesdt[, .(gid = sedge.id, cn, weight, type = 'eresidual', vtype = 'C')], ## edge residual 
+        fill = TRUE)
 
-    if (loose.collapse)
+    if (L0)
     {
-      ## sum will sum all the loose ends assocaited with the same lid
-      vars = rbind(vars, 
-                   unique(rbind( 
-                     vars[type == 'loose.in', ][ , type := 'loose.in.indicator.sum'][, vtype := 'I'][, gid := lid],
-                     vars[type == 'loose.out', ][ , type := 'loose.out.indicator.sum'][, vtype := 'I'][, gid := lid]
-                   ), by = 'gid'))
-      
-      ## sum.indicator is an binary indicator on the sum
-      vars = rbind(vars, 
-                   rbind( 
-                     vars[type == 'loose.in.indicator.sum', ][ , type := 'loose.in.indicator.sum.indicator'][, vtype := 'B'][, gid := lid],
-                     vars[type == 'loose.out.indicator.sum', ][ , type := 'loose.out.indicator.sum.indicator'][, vtype := 'B'][, gid := lid]
-                   ))        
-    }        
-  }
-  
+        ## loose ends are labeled with lid and ulid, lid is only relevant if loose.collapse is true
+        ## (i.e. we need indicator.sum and indicator.sum.indicator
+        if (verbose) {
+            message("adding l0 penalty indicator")
+        }
+
+        vars = rbind(vars, 
+                     rbind( 
+                         vars[type == 'loose.in', ][ , type := 'loose.in.indicator'][, vtype := 'B'][, gid := lid],
+                         vars[type == 'loose.out', ][ , type := 'loose.out.indicator'][, vtype := 'B'][, gid := lid]
+                     ))
+
+        if (loose.collapse)
+        {
+            ## sum will sum all the loose ends assocaited with the same lid
+            vars = rbind(vars, 
+                         unique(rbind( 
+                             vars[type == 'loose.in', ][ , type := 'loose.in.indicator.sum'][, vtype := 'I'][, gid := lid],
+                             vars[type == 'loose.out', ][ , type := 'loose.out.indicator.sum'][, vtype := 'I'][, gid := lid]
+                         ), by = 'gid'))
+            
+            ## sum.indicator is an binary indicator on the sum
+            vars = rbind(vars, 
+                         rbind( 
+                             vars[type == 'loose.in.indicator.sum', ][ , type := 'loose.in.indicator.sum.indicator'][, vtype := 'B'][, gid := lid],
+                             vars[type == 'loose.out.indicator.sum', ][ , type := 'loose.out.indicator.sum.indicator'][, vtype := 'B'][, gid := lid]
+                         ))        
+        }        
+    }
+    
     if (!is.null(marginal)) {
         ## first disjoin marginal against the nodes
         ## ie wee ned to create a separate residual variable for every unique
@@ -410,6 +412,23 @@ balance = function(gg,
         }
     }
 
+    #' zchoo Thursday, Sep 02, 2021 11:16:57 AM
+    ## moved from being ISM-specific annotation as this is more generally useful information
+    ## add telomeric annotation
+    qtips = gr.end(si2gr(seqlengths(gg$nodes))) ## location of q arm tips
+    term.in = c(which(start(gg$nodes$gr) == 1), ## beginning of chromosome
+                -which(gg$nodes$gr %^% qtips)) ## flip side of chromosome end
+    term.out = -term.in ## out is reciprocal of in
+
+    ## annotate loose indicators with this
+    vars[!is.na(snode.id), telomeric := ifelse(snode.id %in% term.in |
+                                               snode.id %in% term.out,
+                                               TRUE,
+                                               FALSE)]
+
+    
+
+
 
     if (phased) {
         ## add allele information and og.node.id
@@ -425,6 +444,7 @@ balance = function(gg,
         vars[, ":="(ref.or.alt = gg$sedgesdt$type[edge.match],
                     connection = gg$sedgesdt$connection[edge.match],
                     og.edge.id = gg$sedgesdt$og.edge.id[edge.match],
+                    span = gg$sedgesdt$span[edge.match], ## NEED SPAN
                     n1 = gg$dt$snode.id[gg$sedgesdt$from[edge.match]],
                     n2 = gg$dt$snode.id[gg$sedgesdt$to[edge.match]])]
 
@@ -436,35 +456,77 @@ balance = function(gg,
         edge.indicator.vars = vars[type == "edge"][, type := "edge.indicator"][, vtype := "B"][, gid := sedge.id]
         vars = rbind(vars, edge.indicator.vars, fill = TRUE)
 
+        #' zchoo Thursday, Sep 02, 2021 05:30:53 PM
+        #' moved to earlier
+        ## REF edge configuration constraint (added by default basically)
+        ## only add this if there are no unphased nodes
+        if (cnloh) {
+            
+            ## if allow CNLOH, the sum of edge indicators corresponding with og edge id is LEQ 2
+            ## this is only allowed in constant CN regions and if breakpoint is not shared with any ALT edges
+
+            ## penalize CNLOH edges
+
+            if (!is.null(gg$edges$dt$cnloh)) {
+                cnloh.edges = gg$edges$dt[cnloh == TRUE & type == "ALT", edge.id] %>% unique
+                if (verbose) {
+                    message("Number of marked CNLOH edges: ", length(cnloh.edges))
+                }
+
+                ## add CNLOH annotation to variables
+                ## browser()
+                vars[, cnloh := FALSE]
+                vars[(type == "edge.indicator" | type == "edge" | type == "eresidual") &
+                     ref.or.alt == "ALT" & (abs(sedge.id) %in% cnloh.edges),
+                     ":="(cnloh = TRUE)]
+                
+            } else {
+                warning("CNLOH not specified on edges. Disallowing!")
+                cnloh.og.edges = c()
+                vars[, cnloh := FALSE]
+            }
+        } else {
+
+            cnloh.og.edges = c()
+            vars[, cnloh := FALSE]
+            
+        }
+
         ## add node haplotype indicators
         ## these are binary indicators that determine whether a node belongs to H1
         ## only constrain positive-stranded nodes due to skew symmetry
-        haplotype.indicators = gg$dt[allele == "major" | allele == "minor" & snode.id > 0,
+        haplotype.indicators = gg$dt[(allele == "major" | allele == "minor") & snode.id > 0,
                                      .(cn, snode.id, lb, ub, weight, og.node.id,
                                        allele, gid = index, type = 'haplotype', vtype = 'B')]
         vars = rbind(vars, haplotype.indicators, fill = TRUE)
 
         ## add H1 and H2 'AND' indicators which should have n1/n1.side/n2/n2.side metadata
-        h1.and.indicators = vars[sedge.id > 0 & type == "edge" & (connection == "straight" | connection == "cross"),][, vtype := "B"][, type := "h1.and.indicator"][, gid := sedge.id]
-        h2.and.indicators = vars[sedge.id > 0 & type == "edge" & (connection == "straight" | connection == "cross"),][, vtype := "B"][, type := "h2.and.indicator"][, gid := sedge.id]
+        ## only add these for low-span edges
+        ## and where CNLOH is FALSE
+        h1.and.indicators = vars[sedge.id > 0 & type == "edge" & (connection == "straight" | connection == "cross") & span < max.span & cnloh == FALSE,][, vtype := "B"][, type := "h1.and.indicator"][, gid := sedge.id]
+        h2.and.indicators = vars[sedge.id > 0 & type == "edge" & (connection == "straight" | connection == "cross") & span < max.span & cnloh == FALSE,][, vtype := "B"][, type := "h2.and.indicator"][, gid := sedge.id]
 
         vars = rbind(vars, h1.and.indicators, h2.and.indicators, fill = TRUE)
 
-        browser()
+        ## browser()
+        ## h1.and.indicators
+        ## vars[cnloh,]
         ## check h1/h2 indicators have n1/n1.side/n2/n2.side metadata
-                                 
+        
     }
 
     if (ism) {
 
-        ## add telomeric annotation
-        qtips = gr.end(si2gr(seqlengths(gg$nodes))) ## location of q arm tips
-        term.in = c(which(start(gg$nodes$gr) == 1), ## beginning of chromosome
-                    -which(gg$nodes$gr %^% qtips)) ## flip side of chromosome end
-        term.out = -term.in ## out is reciprocal of in
+        #' zchoo Thursday, Sep 02, 2021 11:15:11 AM
+        ## moved telomeric annotation
+        ## ## add telomeric annotation
+        ## qtips = gr.end(si2gr(seqlengths(gg$nodes))) ## location of q arm tips
+        ## term.in = c(which(start(gg$nodes$gr) == 1), ## beginning of chromosome
+        ##             -which(gg$nodes$gr %^% qtips)) ## flip side of chromosome end
+        ## term.out = -term.in ## out is reciprocal of in
 
-        ## annotate loose indicators with this
-        vars[!is.na(snode.id), telomeric := ifelse(snode.id %in% term.in | snode.id %in% term.out, TRUE, FALSE)]
+        ## ## annotate loose indicators with this
+        ## vars[!is.na(snode.id), telomeric := ifelse(snode.id %in% term.in | snode.id %in% term.out, TRUE, FALSE)]
         
         ## if not phased, must add edge indicators (for just the ALT edges)
         if (!phased) {
@@ -581,7 +643,7 @@ balance = function(gg,
     ## vars[type %in% c('node', 'edge'), lb := ifelse(is.na(lb), 0, pmax(lb, 0, na.rm = TRUE)]
     ## vars[type %in% c('node', 'edge'), ub := ifelse(is.na(ub), M, pmin(ub, M, na.rm = TRUE))]
     vars[type %in% c('loose.in', 'loose.out'), ":="(lb = 0, ub = Inf)]
-  
+    
     vars[type %in% c('edge'), reward := pmax(reward, 0, na.rm = TRUE)]
 
 
@@ -601,135 +663,146 @@ balance = function(gg,
     vars[(type %in% c('loose.in', 'loose.in.indicator')) & (snode.id %in% term.in), terminal := TRUE]
     vars[(type %in% c('loose.out', 'loose.out.indicator')) & (snode.id %in% term.out), terminal := TRUE]
 
-  ########
-  ## CONSTRAINTS
-  ## the key principle behind this "melted" form of constraint building is the cid 
-  ## (constraint id) which is the key that will group coefficients into constraints
-  ## when we finally build the matrices.  So all we need to do is make sure that
-  ## that value / cid pairs make sense and that every cid has an entry in b
-  ########
-
-  ## we need one junction balance constraint per loose end
-
-  ## constraints indexed by cid
-  constraints = rbind(
-    vars[type == 'loose.in', .(value = 1, id, cid = paste('in', gid))],
-    vars[type == 'edge', .(value = 1, id, cid = paste('in', to))],
-    vars[type == 'node', .(value = -1, id, cid = paste('in', gid))],
-    vars[type == 'loose.out', .(value = 1, id, cid = paste('out', gid))],
-    vars[type == 'edge', .(value = 1, id, cid = paste('out', from))],
-    vars[type == 'node', .(value = -1, id, cid = paste('out', gid))],
-   fill = TRUE)
-
-  b = rbind(
-    vars[type == 'node', .(value = 0, sense = 'E', cid = paste('in', gid))],
-    vars[type == 'node', .(value = 0, sense = 'E', cid = paste('out', gid))],
-    fill = TRUE)
-
-  ## add to the constraints the definitions of the node and edge
-  ## residuals
-  constraints = rbind(
-    constraints,
-    rbind(
-      vars[type == 'node', .(value = 1, id, cid = paste('nresidual', gid))],
-      vars[type == 'nresidual', .(value = -1, id, cid = paste('nresidual', gid))],
-      vars[type == 'edge', .(value = 1, id, cid = paste('eresidual', gid))],
-      vars[type == 'eresidual', .(value = -1, id, cid = paste('eresidual', gid))],
-      fill = TRUE)
-  )
-
-  b = rbind(b,
-            vars[type == 'node', .(value = cn, sense = 'E', cid = paste('nresidual', gid))],
-            vars[type == 'edge', .(value = cn, sense = 'E', cid = paste('eresidual', gid))],
-            fill = TRUE)
-  
-  ## add the reverse complement equality constraints on nodes and edges
-  constraints = rbind(
-    constraints,
-    rbind( ## +1 coefficient for positive nodes, -1 for negative nodes, matched by abs (snode.id)
-      vars[type == 'node', .(value = sign(snode.id), id, cid = paste('nrc', abs(snode.id)))],
-      vars[type == 'edge', .(value = sign(sedge.id), id, cid = paste('erc', abs(sedge.id)))],
-      fill = TRUE)
-  )
-  
-  b = rbind(b,
-            vars[type == 'node' & snode.id>0, .(value = 0, sense = 'E', cid = paste('nrc', abs(snode.id)))],
-            vars[type == 'edge' & sedge.id>0, .(value = 0, sense = 'E', cid = paste('erc', abs(sedge.id)))],
-            fill = TRUE)
+    
 
 
-  ## if solving as LP, add deltas constraints (absolute value trick)
+########
+    ## CONSTRAINTS
+    ## the key principle behind this "melted" form of constraint building is the cid 
+    ## (constraint id) which is the key that will group coefficients into constraints
+    ## when we finally build the matrices.  So all we need to do is make sure that
+    ## that value / cid pairs make sense and that every cid has an entry in b
+########
 
-  if (lp) {
-    if (verbose) {
-      message("adding delta constraints for LP")
-    }
+    ## we need one junction balance constraint per loose end
 
-    vars[type %like% "delta.plus" | type %like% "delta.minus", ":="(ub = M, lb = 0)]
+    ## constraints indexed by cid
+    constraints = rbind(
+        vars[type == 'loose.in', .(value = 1, id, cid = paste('in', gid))],
+        vars[type == 'edge', .(value = 1, id, cid = paste('in', to))],
+        vars[type == 'node', .(value = -1, id, cid = paste('in', gid))],
+        vars[type == 'loose.out', .(value = 1, id, cid = paste('out', gid))],
+        vars[type == 'edge', .(value = 1, id, cid = paste('out', from))],
+        vars[type == 'node', .(value = -1, id, cid = paste('out', gid))],
+        fill = TRUE)
 
-    ## add the residual constraints
-    ndelta.slack = rbind(
-      vars[type == "nresidual", .(value = -1, id, cid = paste("ndelta.minus.slack", gid))],
-      vars[type == "ndelta.minus", .(value = -1, id, cid = paste("ndelta.minus.slack", gid))],
-      vars[type == "nresidual", .(value = 1, id, cid = paste("ndelta.plus.slack", gid))],
-      vars[type == "ndelta.plus", .(value = -1, id, cid = paste("ndelta.plus.slack", gid))]
-    )
+    b = rbind(
+        vars[type == 'node', .(value = 0, sense = 'E', cid = paste('in', gid))],
+        vars[type == 'node', .(value = 0, sense = 'E', cid = paste('out', gid))],
+        fill = TRUE)
 
-    ndelta.slack.rhs = rbind(
-      vars[type == "ndelta.minus", .(value = 0, sense = "L", cid = paste("ndelta.minus.slack", gid))],
-      vars[type == "ndelta.plus", .(value = 0, sense = "L", cid = paste("ndelta.plus.slack", gid))]
+    ## add to the constraints the definitions of the node and edge
+    ## residuals
+    constraints = rbind(
+        constraints,
+        rbind(
+            vars[type == 'node', .(value = 1, id, cid = paste('nresidual', gid))],
+            vars[type == 'nresidual', .(value = -1, id, cid = paste('nresidual', gid))],
+            vars[type == 'edge', .(value = 1, id, cid = paste('eresidual', gid))],
+            vars[type == 'eresidual', .(value = -1, id, cid = paste('eresidual', gid))],
+            fill = TRUE)
     )
 
-    edelta.slack = rbind(
-      vars[type == "eresidual", .(value = -1, id, cid = paste("edelta.minus.slack", gid))],
-      vars[type == "edelta.minus", .(value = -1, id, cid = paste("edelta.minus.slack", gid))],
-      vars[type == "eresidual", .(value = 1, id, cid = paste("edelta.plus.slack", gid))],
-      vars[type == "edelta.plus", .(value = -1, id, cid = paste("edelta.plus.slack", gid))]
+    b = rbind(b,
+              vars[type == 'node', .(value = cn, sense = 'E', cid = paste('nresidual', gid))],
+              vars[type == 'edge', .(value = cn, sense = 'E', cid = paste('eresidual', gid))],
+              fill = TRUE)
+    
+    ## add the reverse complement equality constraints on nodes and edges
+    constraints = rbind(
+        constraints,
+        rbind( ## +1 coefficient for positive nodes, -1 for negative nodes, matched by abs (snode.id)
+            vars[type == 'node', .(value = sign(snode.id), id, cid = paste('nrc', abs(snode.id)))],
+            vars[type == 'edge', .(value = sign(sedge.id), id, cid = paste('erc', abs(sedge.id)))],
+            fill = TRUE)
     )
+    
+    b = rbind(b,
+              vars[type == 'node' & snode.id>0, .(value = 0, sense = 'E', cid = paste('nrc', abs(snode.id)))],
+              vars[type == 'edge' & sedge.id>0, .(value = 0, sense = 'E', cid = paste('erc', abs(sedge.id)))],
+              fill = TRUE)
 
-    edelta.slack.rhs = rbind(
-      vars[type == "edelta.minus", .(value = 0, sense = "L", cid = paste("edelta.minus.slack", gid))],
-      vars[type == "edelta.plus", .(value = 0, sense = "L", cid = paste("edelta.plus.slack", gid))]
-    )
 
-    mdelta.slack = rbind(
-      vars[type == "mresidual", .(value = -1, id, cid = paste("mdelta.minus.slack", gid))],
-      vars[type == "mdelta.minus", .(value = -1, id, cid = paste("mdelta.minus.slack", gid))],
-      vars[type == "mresidual", .(value = 1, id, cid = paste("mdelta.plus.slack", gid))],
-      vars[type == "mdelta.plus", .(value = -1, id, cid = paste("mdelta.plus.slack", gid))]
-    )
+    ## if solving as LP, add deltas constraints (absolute value trick)
 
-    mdelta.slack.rhs = rbind(
-      vars[type == "mdelta.minus", .(value = 0, sense = "L", cid = paste("mdelta.minus.slack", gid))],
-      vars[type == "mdelta.plus", .(value = 0, sense = "L", cid = paste("mdelta.plus.slack", gid))]
-    )
+    if (lp) {
+        if (verbose) {
+            message("adding delta constraints for LP")
+        }
 
-    constraints = rbind(constraints, ndelta.slack, edelta.slack, mdelta.slack, fill = TRUE)
-    b = rbind(b, ndelta.slack.rhs, edelta.slack.rhs, mdelta.slack.rhs, fill = TRUE)
+        vars[type %like% "delta.plus" | type %like% "delta.minus", ":="(ub = M, lb = 0)]
 
-    ## browser()
+        ## add the residual constraints
+        ndelta.slack = rbind(
+            vars[type == "nresidual", .(value = -1, id, cid = paste("ndelta.minus.slack", gid))],
+            vars[type == "ndelta.minus", .(value = -1, id, cid = paste("ndelta.minus.slack", gid))],
+            vars[type == "nresidual", .(value = 1, id, cid = paste("ndelta.plus.slack", gid))],
+            vars[type == "ndelta.plus", .(value = -1, id, cid = paste("ndelta.plus.slack", gid))]
+        )
 
-  }
+        ndelta.slack.rhs = rbind(
+            vars[type == "ndelta.minus", .(value = 0, sense = "L", cid = paste("ndelta.minus.slack", gid))],
+            vars[type == "ndelta.plus", .(value = 0, sense = "L", cid = paste("ndelta.plus.slack", gid))]
+        )
+
+        edelta.slack = rbind(
+            vars[type == "eresidual", .(value = -1, id, cid = paste("edelta.minus.slack", gid))],
+            vars[type == "edelta.minus", .(value = -1, id, cid = paste("edelta.minus.slack", gid))],
+            vars[type == "eresidual", .(value = 1, id, cid = paste("edelta.plus.slack", gid))],
+            vars[type == "edelta.plus", .(value = -1, id, cid = paste("edelta.plus.slack", gid))]
+        )
+
+        edelta.slack.rhs = rbind(
+            vars[type == "edelta.minus", .(value = 0, sense = "L", cid = paste("edelta.minus.slack", gid))],
+            vars[type == "edelta.plus", .(value = 0, sense = "L", cid = paste("edelta.plus.slack", gid))]
+        )
+
+        mdelta.slack = rbind(
+            vars[type == "mresidual", .(value = -1, id, cid = paste("mdelta.minus.slack", gid))],
+            vars[type == "mdelta.minus", .(value = -1, id, cid = paste("mdelta.minus.slack", gid))],
+            vars[type == "mresidual", .(value = 1, id, cid = paste("mdelta.plus.slack", gid))],
+            vars[type == "mdelta.plus", .(value = -1, id, cid = paste("mdelta.plus.slack", gid))]
+        )
+
+        mdelta.slack.rhs = rbind(
+            vars[type == "mdelta.minus", .(value = 0, sense = "L", cid = paste("mdelta.minus.slack", gid))],
+            vars[type == "mdelta.plus", .(value = 0, sense = "L", cid = paste("mdelta.plus.slack", gid))]
+        )
+
+        constraints = rbind(constraints, ndelta.slack, edelta.slack, mdelta.slack, fill = TRUE)
+        b = rbind(b, ndelta.slack.rhs, edelta.slack.rhs, mdelta.slack.rhs, fill = TRUE)
+
+        ## browser()
+
+    }
 
     if (phased) {
 
         ## add haplotype indicator constraints
         ## e.g. the haplotype indicators corresponding to the same og node must add up to 1
         iconstraints = vars[type == "haplotype" & snode.id > 0,
-                            .(value = 1, id, cid = paste("haplotype.indicator", og.node.id))]
-        rhs = vars[type == "haplotype" & !duplicated(og.node.id),
-                   .(value = 1, sense = "E", cid = paste("haplotype.indicator", og.node.id))]
+                            .(value = 1, id, cid = paste("haplotype.node", og.node.id))]
+        rhs = unique(vars[type == "haplotype",
+                          .(value = 1, sense = "E", cid = paste("haplotype.node", og.node.id))],
+                     by = "cid")
 
         constraints = rbind(constraints, iconstraints, fill = TRUE)
         b = rbind(b, rhs, fill = TRUE)
 
+        ## check that there are two per og edge id
+        ## browser()
+        ## tmp = iconstraints[, .(count = .N), by = cid]
+        ## all(tmp[, count] == 2, na.rm = TRUE)
+        ## length(unique(rhs[, cid])) == length(unique(iconstraints[, cid]))
+        ## length(unique(rhs[, cid])) == length(unique(gg$nodes$dt[allele == "major", og.node.id]))
+
         ## add H1 AND constraint
         h1.and.ids = merge.data.table(vars[type == "h1.and.indicator", .(n1, n2, edge.id = id, sedge.id)],
                                       vars[type == "haplotype", .(n1.snode.id = snode.id, n1.id = id)],
                                       by.x = "n1",
                                       by.y = "n1.snode.id") %>%
             merge.data.table(vars[type == "haplotype", .(n2.snode.id = snode.id, n2.id = id)],
-                             by.x = "n1",
+                             by.x = "n2",
                              by.y = "n2.snode.id")
 
         h2.and.ids = merge.data.table(vars[type == "h2.and.indicator", .(n1, n2, edge.id = id, sedge.id)],
@@ -737,11 +810,11 @@ balance = function(gg,
                                       by.x = "n1",
                                       by.y = "n1.snode.id") %>%
             merge.data.table(vars[type == "haplotype", .(n2.snode.id = snode.id, n2.id = id)],
-                             by.x = "n1",
+                             by.x = "n2",
                              by.y = "n2.snode.id")
 
         ## verify only + sedge id
-        browser()
+        ## browser()
 
         ## there are four constraints that are needed to implement this first edge constraint (c1-3)
         iconstraints = rbind(h1.and.ids[, .(value = 1, id = edge.id, cid = paste("h1.and.c1", sedge.id))],
@@ -759,6 +832,12 @@ balance = function(gg,
         constraints = rbind(constraints, iconstraints, fill = TRUE)
         b = rbind(b, rhs, fill = TRUE)
 
+        ## tmp = iconstraints[, .(count = .N), by = cid]
+        ## all(tmp[cid %like% "c1", count] == 2)
+        ## all(tmp[cid %like% "c2", count] == 2)
+        ## all(tmp[cid %like% "c3", count] == 3)
+
+
         iconstraints = rbind(h2.and.ids[, .(value = 1, id = edge.id, cid = paste("h2.and.c1", sedge.id))],
                              h2.and.ids[, .(value = 1, id = n1.id, cid = paste("h2.and.c1", sedge.id))],
                              h2.and.ids[, .(value = 1, id = edge.id, cid = paste("h2.and.c2", sedge.id))],
@@ -767,14 +846,19 @@ balance = function(gg,
                              h2.and.ids[, .(value = 1, id = n1.id, cid = paste("h2.and.c3", sedge.id))],
                              h2.and.ids[, .(value = 1, id = n2.id, cid = paste("h2.and.c3", sedge.id))])
 
-        rhs = rbind(h2.and.ids[, .(value = 1, sense = "L", cid = paste("h2.and.c1", sedge.id))],
-                    h2.and.ids[, .(value = 1, sense = "L", cid = paste("h2.and.c2", sedge.id))],
-                    h2.and.ids[, .(value = 1, sense = "G", cid = paste("h2.and.c3", sedge.id))])
+        rhs = unique(rbind(h2.and.ids[, .(value = 1, sense = "L", cid = paste("h2.and.c1", sedge.id))],
+                           h2.and.ids[, .(value = 1, sense = "L", cid = paste("h2.and.c2", sedge.id))],
+                           h2.and.ids[, .(value = 1, sense = "G", cid = paste("h2.and.c3", sedge.id))]),
+                     by = "cid")
 
         constraints = rbind(constraints, iconstraints, fill = TRUE)
         b = rbind(b, rhs, fill = TRUE)
 
         ## verify that there are no weird NA's and that there is only one set of constraints per sedge.id
+        ## tmp = iconstraints[, .(count = .N), by = cid]
+        ## all(tmp[cid %like% "c1", count] == 2)
+        ## all(tmp[cid %like% "c2", count] == 2)
+        ## all(tmp[cid %like% "c3", count] == 3)
 
         ## connect edge indicators to the haplotype configuration of connected edges
         iconstraints = rbind(vars[type == "h1.and.indicator",
@@ -790,7 +874,9 @@ balance = function(gg,
         b = rbind(b, rhs, fill = TRUE)
 
         ## verify that there are three of these per sedge.id!
-                                  
+        ## tmp = iconstraints[, .(count = .N), by = cid]
+        ## all(tmp[, count] == 3)
+        
         ## add constraints that force indicators to be 1 if edge CN > 0
         
         ## add constraints for upper bound (same setup as L0 penalty) - one per edge
@@ -905,7 +991,7 @@ balance = function(gg,
         #' zchoo Tuesday, Jun 15, 2021 11:53:15 AM
         #' this constraint appears to be valid even if running phasing.
         ## if (!phased) {
-            ## extremity exclusivity (relevant for ALL graphs)
+        ## extremity exclusivity (relevant for ALL graphs)
         loose.constraints = rbind(
             vars[type == "loose.in.indicator" & sign(snode.id) == 1 & telomeric == FALSE,
                  .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))],
@@ -927,25 +1013,27 @@ balance = function(gg,
 
         b = rbind(b, edge.b, loose.b, fill = TRUE)
 
-            ## edge.ee.ids = unique(c(vars[type == "edge.indicator", ee.id.n1], vars[type == "edge.indicator", ee.id.n2]))
-            ## edge.ee.ids = edge.ee.ids[!is.na(edge.ee.ids)]
+        ## edge.ee.ids = unique(c(vars[type == "edge.indicator", ee.id.n1], vars[type == "edge.indicator", ee.id.n2]))
+        ## edge.ee.ids = edge.ee.ids[!is.na(edge.ee.ids)]
 
-            ## loose.zeros = rbind(
-            ##     vars[type == "loose.in.indicator" & sign(snode.id) == 1 & ee.id %in% edge.ee.ids,
-            ##          .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))],
-            ##     vars[type == "loose.out.indicator" & sign(snode.id) == 1 & ee.id %in% edge.ee.ids,
-            ##          .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))]
-            ## )
+        ## loose.zeros = rbind(
+        ##     vars[type == "loose.in.indicator" & sign(snode.id) == 1 & ee.id %in% edge.ee.ids,
+        ##          .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))],
+        ##     vars[type == "loose.out.indicator" & sign(snode.id) == 1 & ee.id %in% edge.ee.ids,
+        ##          .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))]
+        ## )
 
-            ## loose.zeros.rhs = unique(loose.zeros[, .(cid, value = 0, sense = "E")], by = "cid")
+        ## loose.zeros.rhs = unique(loose.zeros[, .(cid, value = 0, sense = "E")], by = "cid")
 
-            ## constraints = rbind(constraints, loose.zeros, fill = TRUE)
-            ## b = rbind(b, loose.zeros.rhs, fill = TRUE)
-        }
+        ## constraints = rbind(constraints, loose.zeros, fill = TRUE)
+        ## b = rbind(b, loose.zeros.rhs, fill = TRUE)
+    }
 
-        if (phased) {
-            ## homologous extremity exclusivity
-            ## this is actually redundant with previous constraints
+    if (phased) {
+        ## homologous extremity exclusivity
+        ## this is actually redundant with previous constraints
+
+        if (ism) {
             loose.constraints = rbind(
                 vars[type == "loose.in.indicator" & sign(snode.id)==1 & telomeric == FALSE,
                      .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id))],
@@ -977,26 +1065,28 @@ balance = function(gg,
             }
 
             ## grab node ids associated with ALT edges on the left
-            left.og.node.ids = c(gg$edges$dt[n1.side == "left" & type == "ALT", n1],
-                                 gg$edges$dt[n2.side == "left" & type == "ALT", n2])
-            right.og.node.ids = c(gg$edges$dt[n1.side == "right" & type == "ALT", n1],
-                                   gg$edges$dt[n2.side == "right" & type == "ALT", n2])
-
-            ## fix loose ends for these nodes to zero
-            vars[type == "loose.in.indicator" & (snode.id %in% left.og.node.ids),
-                 ":="(lb = 0, ub = 0)]
-            vars[type == "loose.out.indicator" & (snode.id %in% right.og.node.ids),
-                 ":="(lb = 0, ub = 0)]
-            vars[type == "loose.in" & (snode.id %in% left.og.node.ids),
-                 ":="(lb = 0, ub = 0)]
-            vars[type == "loose.out" & (snode.id %in% right.og.node.ids),
-                 ":="(lb = 0, ub = 0)]
-
-            if (verbose) {
-                message("Number of homologous loose ends: ",
-                        length(left.og.node.ids) + length(right.og.node.ids))
-            }
-        
+            ## left.og.node.ids = c(gg$edges$dt[n1.side == "left" & type == "ALT", n1],
+            ##                      gg$edges$dt[n2.side == "left" & type == "ALT", n2])
+            ## right.og.node.ids = c(gg$edges$dt[n1.side == "right" & type == "ALT", n1],
+            ##                       gg$edges$dt[n2.side == "right" & type == "ALT", n2])
+
+            ## ## fix loose ends for these nodes to zero
+            ## #' zchoo Thursday, Sep 02, 2021 11:13:44 AM
+            ## #' removed these constraints to see if feasibility improves
+            ## vars[type == "loose.in.indicator" & (snode.id %in% left.og.node.ids),
+            ##      ":="(lb = 0, ub = 0)]
+            ## vars[type == "loose.out.indicator" & (snode.id %in% right.og.node.ids),
+            ##      ":="(lb = 0, ub = 0)]
+            ## vars[type == "loose.in" & (snode.id %in% left.og.node.ids),
+            ##      ":="(lb = 0, ub = 0)]
+            ## vars[type == "loose.out" & (snode.id %in% right.og.node.ids),
+            ##      ":="(lb = 0, ub = 0)]
+
+            ## if (verbose) {
+            ##     message("Number of homologous loose ends: ",
+            ##             length(left.og.node.ids) + length(right.og.node.ids))
+            ## }
+            
             ## reciprocal homologous extremity exclusivity
             ## implement configuration indicators (OR constraint)
             config.dt = vars[type == "straight.config" | type == "cross.config",]
@@ -1066,7 +1156,7 @@ balance = function(gg,
                      .(value = 1, id, cid = paste("rhee", s))],
                 vars[type == "loose.out.indicator" & snode.id > 0 & !is.na(c) & telomeric == FALSE,
                      .(value = 1, id, cid = paste("rhee", c))]
-                )
+            )
 
             ## filter constraints to only include things with >= 4 entries (e.g. must have an ALT edge)
             ## rhomol.constraints[, n.entries := .N, by = cid]
@@ -1083,9 +1173,6 @@ balance = function(gg,
             b = rbind(b, rhs, fill = TRUE)
 
         }
-    }
-
-    if (phased) {
 
         ## add the edge indicator sum constraints (ISM consistency)
         iconstraints = unique(
@@ -1127,7 +1214,7 @@ balance = function(gg,
             b = rbind(b, rhs, fill = TRUE)
             
         }
-                
+        
 
         ## force nonzero CN for ALT edges (because these have nonzero CN in original JaBbA output)
         ## can become infeasible if original graph is not compatible with ISM
@@ -1159,39 +1246,6 @@ balance = function(gg,
             b = rbind(b, edge.indicator.b, fill = TRUE)
         }
 
-        ## REF edge configuration constraint (added by default basically)
-        ## only add this if there are no unphased nodes
-        if (cnloh) {
-            
-            ## if allow CNLOH, the sum of edge indicators corresponding with og edge id is LEQ 2
-            ## this is only allowed in constant CN regions and if breakpoint is not shared with any ALT edges
-
-            ## penalize CNLOH edges
-
-            if (!is.null(gg$edges$dt$cnloh)) {
-                cnloh.edges = gg$edges$dt[cnloh == TRUE & type == "ALT", edge.id] %>% unique
-                if (verbose) {
-                    message("Number of marked CNLOH edges: ", length(cnloh.edges))
-                }
-
-                ## add CNLOH annotation to variables
-                ## browser()
-                vars[, cnloh := FALSE]
-                vars[(type == "edge.indicator" | type == "edge" | type == "eresidual") &
-                     ref.or.alt == "ALT" & (abs(sedge.id) %in% cnloh.edges),
-                     ":="(cnloh = TRUE)]
-                
-            } else {
-                warning("CNLOH not specified on edges. Disallowing!")
-                cnloh.og.edges = c()
-                vars[, cnloh := FALSE]
-            }
-        } else {
-
-            cnloh.og.edges = c()
-            vars[, cnloh := FALSE]
-            
-        }
 
         ## add CNLOH constraints for applicable edges
         
@@ -1215,8 +1269,8 @@ balance = function(gg,
         ## b = rbind(b, rhs, fill = TRUE)
 
         ## add ISM constraints for ALL REF edges (as CNLOH is now marked as ALT)
-        #' zchoo Wednesday, Sep 01, 2021 10:27:47 AM
-        ## this set of constraints is no longer necessary as node haplotypes have been added
+        ## this should be no longer needed!!
+        ## this introduces redundant constraints for REF edges
         
         ## iconstraints.from = unique(
         ##     vars[type == "edge.indicator" & ref.or.alt == "REF", ##& !(og.edge.id %in% cnloh.og.edges),
@@ -1253,144 +1307,143 @@ balance = function(gg,
         
         ## ## add to b
         ## b = rbind(b, edge.indicator.b, fill = TRUE)
-
     }
 
-  if (L0) ## add "big M" constraints
-  {
-    ## indicator constraints ie on ulids 
-    iconstraints = rbind(
-      vars[type == 'loose.out', .(value = 1, id, ulid, cid = paste('loose.out.indicator.ub', ulid))],
-      vars[type == 'loose.in', .(value = 1, id, ulid, cid = paste('loose.in.indicator.ub', ulid))],
-      fill = TRUE)
-
-    ## add the matching indicator variables, matching to the cid from above
-    iconstraints = rbind(
-      iconstraints,
-      vars[type %in% c('loose.out.indicator', 'loose.in.indicator'), ][
-        match(iconstraints$ulid, ulid), .(value = -M, id, cid = iconstraints$cid)],
-      fill = TRUE)
-                         
-    ## upper bounds "infinity" ie M if indicator positive, 0 otherwise
-    constraints = rbind(
-      constraints,
-      iconstraints,
-      fill = TRUE)
+    if (L0) ## add "big M" constraints
+    {
+        ## indicator constraints ie on ulids 
+        iconstraints = rbind(
+            vars[type == 'loose.out', .(value = 1, id, ulid, cid = paste('loose.out.indicator.ub', ulid))],
+            vars[type == 'loose.in', .(value = 1, id, ulid, cid = paste('loose.in.indicator.ub', ulid))],
+            fill = TRUE)
 
-    ## upper bound sense is 'L' i.e. less than because -M on left hand side
-    b = rbind(b,
-              vars[type == 'loose.in', .(value = 0, sense = 'L', cid = paste('loose.in.indicator.ub', ulid))],
-              vars[type == 'loose.out', .(value = 0, sense = 'L', cid = paste('loose.out.indicator.ub', ulid))],
-              fill = TRUE)
+        ## add the matching indicator variables, matching to the cid from above
+        iconstraints = rbind(
+            iconstraints,
+            vars[type %in% c('loose.out.indicator', 'loose.in.indicator'), ][
+                match(iconstraints$ulid, ulid), .(value = -M, id, cid = iconstraints$cid)],
+            fill = TRUE)
+        
+        ## upper bounds "infinity" ie M if indicator positive, 0 otherwise
+        constraints = rbind(
+            constraints,
+            iconstraints,
+            fill = TRUE)
 
-    ## lower bound 0.1 if indicator positive, 0 otherwise
-    iconstraints = rbind(
-      vars[type == 'loose.out', .(value = 1, id, ulid, cid = paste('loose.out.indicator.lb', ulid))],
-      vars[type == 'loose.in', .(value = 1, id, ulid, cid = paste('loose.in.indicator.lb', ulid))],
-      fill = TRUE)
-    
-    ## add the matching indicator variables, matching to the cid from above
-    iconstraints = rbind(
-      iconstraints,
-      vars[type %in% c('loose.out.indicator', 'loose.in.indicator'), ][
-        match(iconstraints$ulid, ulid), .(value = -.1, id, cid = iconstraints$cid)],
-      fill = TRUE)
-
-    ## upper bounds "infinity" ie M if indicator positive, 0 otherwise
-    constraints = rbind(
-      constraints,
-      iconstraints,
-      fill = TRUE)
+        ## upper bound sense is 'L' i.e. less than because -M on left hand side
+        b = rbind(b,
+                  vars[type == 'loose.in', .(value = 0, sense = 'L', cid = paste('loose.in.indicator.ub', ulid))],
+                  vars[type == 'loose.out', .(value = 0, sense = 'L', cid = paste('loose.out.indicator.ub', ulid))],
+                  fill = TRUE)
 
-    ## lower bound sense is 'G' i.e. greater than because -M on left hand side
-    b = rbind(b,
-              vars[type == 'loose.in', .(value = 0, sense = 'G', cid = paste('loose.in.indicator.lb', ulid))],
-              vars[type == 'loose.out', .(value = 0, sense = 'G', cid = paste('loose.out.indicator.lb', ulid))],
-              fill = TRUE)
+        ## lower bound 0.1 if indicator positive, 0 otherwise
+        iconstraints = rbind(
+            vars[type == 'loose.out', .(value = 1, id, ulid, cid = paste('loose.out.indicator.lb', ulid))],
+            vars[type == 'loose.in', .(value = 1, id, ulid, cid = paste('loose.in.indicator.lb', ulid))],
+            fill = TRUE)
+        
+        ## add the matching indicator variables, matching to the cid from above
+        iconstraints = rbind(
+            iconstraints,
+            vars[type %in% c('loose.out.indicator', 'loose.in.indicator'), ][
+                match(iconstraints$ulid, ulid), .(value = -.1, id, cid = iconstraints$cid)],
+            fill = TRUE)
 
-    if (loose.collapse)
-    {
-      ##################
-      ## loose indicator sum  = sum of indicators
-      ##################
-      iconstraints = rbind(
-        vars[type == 'loose.out.indicator', .(value = 1, id, lid, cid = paste('loose.out.indicator.sum', lid))],
-        vars[type == 'loose.in.indicator', .(value = 1, id, lid, cid = paste('loose.in.indicator.sum', lid))],
-        fill = TRUE)
-      
-      ## indicator sum is the sum of all indicators mapping to that loose end
-      iconstraints = rbind(
-          iconstraints,
-        unique(vars[type %in% c('loose.out.indicator.sum', 'loose.in.indicator.sum'), ][
-          match(iconstraints$lid, lid), .(value = -1, id, lid, cid = iconstraints$cid)], by = 'lid'),
-        fill = TRUE)
-      
-      constraints = rbind(
-        constraints,
-        iconstraints,
-        fill = TRUE)
-      
-      b = rbind(b,
-                vars[type == 'loose.in.indicator.sum', .(value = 0, sense = 'E', cid = paste('loose.in.indicator.sum', lid))],
-                vars[type == 'loose.out.indicator.sum', .(value = 0, sense = 'E', cid = paste('loose.out.indicator.sum', lid))],
+        ## upper bounds "infinity" ie M if indicator positive, 0 otherwise
+        constraints = rbind(
+            constraints,
+            iconstraints,
+            fill = TRUE)
+
+        ## lower bound sense is 'G' i.e. greater than because -M on left hand side
+        b = rbind(b,
+                  vars[type == 'loose.in', .(value = 0, sense = 'G', cid = paste('loose.in.indicator.lb', ulid))],
+                  vars[type == 'loose.out', .(value = 0, sense = 'G', cid = paste('loose.out.indicator.lb', ulid))],
+                  fill = TRUE)
+
+        if (loose.collapse)
+        {
+##################
+            ## loose indicator sum  = sum of indicators
+##################
+            iconstraints = rbind(
+                vars[type == 'loose.out.indicator', .(value = 1, id, lid, cid = paste('loose.out.indicator.sum', lid))],
+                vars[type == 'loose.in.indicator', .(value = 1, id, lid, cid = paste('loose.in.indicator.sum', lid))],
                 fill = TRUE)
-     
-      ##################
-      ## now we make new indicator variables on the sum of the individual loose end indicators      
-      ## upper bound bound 0.1 if indicator positive, 0 otherwise
-      ##################
-
-      iconstraints = rbind(
-        vars[type == 'loose.out.indicator.sum', .(value = 1, id, lid, cid = paste('loose.out.indicator.sum.indicator.ub', lid))],
-        vars[type == 'loose.in.indicator.sum', .(value = 1, id, lid, cid = paste('loose.in.indicator.sum.indicator.ub', lid))],
-        fill = TRUE)
+            
+            ## indicator sum is the sum of all indicators mapping to that loose end
+            iconstraints = rbind(
+                iconstraints,
+                unique(vars[type %in% c('loose.out.indicator.sum', 'loose.in.indicator.sum'), ][
+                    match(iconstraints$lid, lid), .(value = -1, id, lid, cid = iconstraints$cid)], by = 'lid'),
+                fill = TRUE)
+            
+            constraints = rbind(
+                constraints,
+                iconstraints,
+                fill = TRUE)
+            
+            b = rbind(b,
+                      vars[type == 'loose.in.indicator.sum', .(value = 0, sense = 'E', cid = paste('loose.in.indicator.sum', lid))],
+                      vars[type == 'loose.out.indicator.sum', .(value = 0, sense = 'E', cid = paste('loose.out.indicator.sum', lid))],
+                      fill = TRUE)
+            
+##################
+            ## now we make new indicator variables on the sum of the individual loose end indicators      
+            ## upper bound bound 0.1 if indicator positive, 0 otherwise
+##################
 
-      ## add the matching indicator variables, matching to the cid from above
-      iconstraints = rbind(
-        iconstraints,
-        vars[type %in% c('loose.out.indicator.sum.indicator', 'loose.in.indicator.sum.indicator'), ][
-          match(iconstraints$lid, lid), .(value = -M, id, lid, cid = iconstraints$cid)],
-        fill = TRUE)
-      
-      ## upper bounds "infinity" ie M if indicator positive, 0 otherwise
-      constraints = rbind(
-        constraints,
-        iconstraints,
-        fill = TRUE)
+            iconstraints = rbind(
+                vars[type == 'loose.out.indicator.sum', .(value = 1, id, lid, cid = paste('loose.out.indicator.sum.indicator.ub', lid))],
+                vars[type == 'loose.in.indicator.sum', .(value = 1, id, lid, cid = paste('loose.in.indicator.sum.indicator.ub', lid))],
+                fill = TRUE)
 
-      ## upper bound sense is 'L' i.e. less than because -M on left hand side
-      b = rbind(b,
-                vars[type == 'loose.in.indicator.sum', .(value = 0, sense = 'L', cid = paste('loose.in.indicator.sum.indicator.ub', lid))],
-                vars[type == 'loose.out.indicator.sum', .(value = 0, sense = 'L', cid = paste('loose.out.indicator.sum.indicator.ub', lid))],
+            ## add the matching indicator variables, matching to the cid from above
+            iconstraints = rbind(
+                iconstraints,
+                vars[type %in% c('loose.out.indicator.sum.indicator', 'loose.in.indicator.sum.indicator'), ][
+                    match(iconstraints$lid, lid), .(value = -M, id, lid, cid = iconstraints$cid)],
+                fill = TRUE)
+            
+            ## upper bounds "infinity" ie M if indicator positive, 0 otherwise
+            constraints = rbind(
+                constraints,
+                iconstraints,
                 fill = TRUE)
 
-      ## lower bound 0.1 if indicator positive, 0 otherwise
-      iconstraints = rbind(
-        vars[type == 'loose.out.indicator.sum', .(value = 1, id, lid, cid = paste('loose.out.indicator.sum.indicator.lb', lid))],
-        vars[type == 'loose.in.indicator.sum', .(value = 1, id, lid, cid = paste('loose.in.indicator.sum.indicator.lb', lid))],
-        fill = TRUE)
-      
-      ## add the matching indicator variables, matching to the cid from above
-      iconstraints = rbind(
-        iconstraints,
-        vars[type %in% c('loose.out.indicator.sum', 'loose.in.indicator.sum'), ][
-          match(iconstraints$lid, lid), .(value = -.1, id, lid, cid = iconstraints$cid)],
-        fill = TRUE)
+            ## upper bound sense is 'L' i.e. less than because -M on left hand side
+            b = rbind(b,
+                      vars[type == 'loose.in.indicator.sum', .(value = 0, sense = 'L', cid = paste('loose.in.indicator.sum.indicator.ub', lid))],
+                      vars[type == 'loose.out.indicator.sum', .(value = 0, sense = 'L', cid = paste('loose.out.indicator.sum.indicator.ub', lid))],
+                      fill = TRUE)
 
-      ## upper bounds "infinity" ie M if indicator positive, 0 otherwise
-      constraints = rbind(
-        constraints,
-        iconstraints,
-        fill = TRUE)
+            ## lower bound 0.1 if indicator positive, 0 otherwise
+            iconstraints = rbind(
+                vars[type == 'loose.out.indicator.sum', .(value = 1, id, lid, cid = paste('loose.out.indicator.sum.indicator.lb', lid))],
+                vars[type == 'loose.in.indicator.sum', .(value = 1, id, lid, cid = paste('loose.in.indicator.sum.indicator.lb', lid))],
+                fill = TRUE)
+            
+            ## add the matching indicator variables, matching to the cid from above
+            iconstraints = rbind(
+                iconstraints,
+                vars[type %in% c('loose.out.indicator.sum', 'loose.in.indicator.sum'), ][
+                    match(iconstraints$lid, lid), .(value = -.1, id, lid, cid = iconstraints$cid)],
+                fill = TRUE)
 
-      ## lower bound sense is 'G' i.e. greater than because -M on left hand side
-      b = rbind(b,
-                vars[type == 'loose.in.indicator.sum', .(value = 0, sense = 'G', cid = paste('loose.in.indicator.sum.indicator.lb', lid))],
-                vars[type == 'loose.out.indicator.sum', .(value = 0, sense = 'G', cid = paste('loose.out.indicator.sum.indicator.lb', lid))],
+            ## upper bounds "infinity" ie M if indicator positive, 0 otherwise
+            constraints = rbind(
+                constraints,
+                iconstraints,
                 fill = TRUE)
 
-    }    
-  }
+            ## lower bound sense is 'G' i.e. greater than because -M on left hand side
+            b = rbind(b,
+                      vars[type == 'loose.in.indicator.sum', .(value = 0, sense = 'G', cid = paste('loose.in.indicator.sum.indicator.lb', lid))],
+                      vars[type == 'loose.out.indicator.sum', .(value = 0, sense = 'G', cid = paste('loose.out.indicator.sum.indicator.lb', lid))],
+                      fill = TRUE)
+
+        }    
+    }
 
 
     if (!is.null(marginal) && length(dmarginal)) 
@@ -1432,12 +1485,12 @@ balance = function(gg,
         b = rbind(emb, b, fill = TRUE)
     }
 
-  ########
-  ## MAKE MATRICES
-  ########
+########
+    ## MAKE MATRICES
+########
 
-  ## now Rcplex time
-  ## remove any rows with b = NA
+    ## now Rcplex time
+    ## remove any rows with b = NA
 
     ## get rid of any constraints with NA values
     keep.constraints = intersect(b[!is.na(value), cid], constraints[!is.na(value), cid])
@@ -1464,50 +1517,50 @@ balance = function(gg,
     Amat = sparseMatrix(constraints$cid, constraints$id, x = constraints$value, dims = c(length(ucid), nrow(vars)))
     vars[is.na(weight), weight := 0]
 
-  if (verbose) {
+    if (verbose) {
 
-    message("bvec length: ", length(bvec))
-    message("Amat nrow: ", nrow(Amat))
+        message("bvec length: ", length(bvec))
+        message("Amat nrow: ", nrow(Amat))
 
-  }
-  if (any(ix <- is.infinite(vars$weight)))
-  {
-    warning('nodes with infinite weight, setting to 0, please check inputs')
-    vars[ix, weight := 0]
-  }
-  Qmat = vars[, weight * (type %in% c('nresidual', 'eresidual', 'mresidual'))] %>% as.numeric %>% Diagonal(x = .) %>% as('CsparseMatrix')
+    }
+    if (any(ix <- is.infinite(vars$weight)))
+    {
+        warning('nodes with infinite weight, setting to 0, please check inputs')
+        vars[ix, weight := 0]
+    }
+    Qmat = vars[, weight * (type %in% c('nresidual', 'eresidual', 'mresidual'))] %>% as.numeric %>% Diagonal(x = .) %>% as('CsparseMatrix')
 
-  ## set lambda to 0 at terminal or other non NA nodes
-  vars[is.na(lambda), lambda := 0]
+    ## set lambda to 0 at terminal or other non NA nodes
+    vars[is.na(lambda), lambda := 0]
 
- 
-  ## set cvec by multiplying global lambda by local lambda for non-terminal loose end
-  ## vars (or their indicators if L0 is TRUE)
-  if (L0)
+    
+    ## set cvec by multiplying global lambda by local lambda for non-terminal loose end
+    ## vars (or their indicators if L0 is TRUE)
+    if (L0)
     {
-      if (loose.collapse)
-      {
-          cvec = lambda*(vars[, lambda*(type %in% c('loose.in.indicator.sum.indicator', 'loose.out.indicator.sum.indicator') & !terminal)] %>% as.numeric)
-          ## cvec = lambda*(vars[, lambda*(type %in% c('loose.in.indicator.sum.indicator', 'loose.out.indicator.sum.indicator', 'loose.in.indicator', 'loose.out.indicator') & !terminal)] %>% as.numeric)
+        if (loose.collapse)
+        {
+            cvec = lambda*(vars[, lambda*(type %in% c('loose.in.indicator.sum.indicator', 'loose.out.indicator.sum.indicator') & !terminal)] %>% as.numeric)
+            ## cvec = lambda*(vars[, lambda*(type %in% c('loose.in.indicator.sum.indicator', 'loose.out.indicator.sum.indicator', 'loose.in.indicator', 'loose.out.indicator') & !terminal)] %>% as.numeric)
         }
-      else
+        else
         {
-          cvec = lambda*(vars[, lambda * (type %in% c('loose.in.indicator', 'loose.out.indicator') & !terminal)] %>% as.numeric)
+            cvec = lambda*(vars[, lambda * (type %in% c('loose.in.indicator', 'loose.out.indicator') & !terminal)] %>% as.numeric)
         }
     } else {
-    cvec = lambda*(vars[, lambda*(type %in% c('loose.in', 'loose.out') & !terminal)] %>% as.numeric)
-  }
+        cvec = lambda*(vars[, lambda*(type %in% c('loose.in', 'loose.out') & !terminal)] %>% as.numeric)
+    }
 
-  ## message("CVEC: ", length(cvec))
+    ## message("CVEC: ", length(cvec))
 
-  ## implement reward if provided
-  if (length(ix <- which(vars$reward!=0)))
-  {
-    if (verbose)
-      message('Applying reward')
-    cvec[ix] = -vars$reward[ix]
+    ## implement reward if provided
+    if (length(ix <- which(vars$reward!=0)))
+    {
+        if (verbose)
+            message('Applying reward')
+        cvec[ix] = -vars$reward[ix]
 
-  }
+    }
 
     if (lp) {
         ## add weights of stuff
@@ -1534,12 +1587,12 @@ balance = function(gg,
     ## browser()
     ## vars[type == "edge.indicator" & cnloh == TRUE]
     ## vars[type == "edge.indicator" & cnloh == TRUE, .N, by = og.edge.id]
-  
+    
 
-  lb = vars$lb
-  ub = vars$ub
+    lb = vars$lb
+    ub = vars$ub
 
-  control = list(trace = ifelse(verbose>=2, 1, 0), tilim = tilim, epgap = epgap, round = 1, trelim = trelim, nodefileind = nodefileind)
+    control = list(trace = ifelse(verbose>=2, 1, 0), tilim = tilim, epgap = epgap, round = 1, trelim = trelim, nodefileind = nodefileind)
 
     ## call our wrapper for CPLEX
     sol =  Rcplex2(cvec,
@@ -1554,82 +1607,92 @@ balance = function(gg,
                    control = control,
                    tuning = FALSE)
     
-  vars$cvec = cvec
-  vars$x = sol$x
+    vars$cvec = cvec
+    vars$x = sol$x
 
-  ## for debugging
-  ppc = function(x) (x %>% merge.data.table(vars, by = 'id') %>% merge.data.table(b, by = 'cid.char'))[, paste(paste(round(value.x, 1), '*', paste(type, gid, sep=  '_'), '(', signif(x, 2), ')', collapse = ' + '), ifelse(sense[1] == 'E', '=', ifelse(sense[1] == 'G', '>=', '<=')), round(value.y[1],2)), by = cid.char]
-  
-  ppv = function(x) {tmp = x %>% merge.data.table(constraints, by = 'id'); constraints[cid %in% tmp$cid, ] %>% ppc}
+    ## for debugging
+    ppc = function(x) (x %>% merge.data.table(vars, by = 'id') %>% merge.data.table(b, by = 'cid.char'))[, paste(paste(round(value.x, 1), '*', paste(type, gid, sep=  '_'), '(', signif(x, 2), ')', collapse = ' + '), ifelse(sense[1] == 'E', '=', ifelse(sense[1] == 'G', '>=', '<=')), round(value.y[1],2)), by = cid.char]
+    
+    ppv = function(x) {tmp = x %>% merge.data.table(constraints, by = 'id'); constraints[cid %in% tmp$cid, ] %>% ppc}
 
-  .check = function(x) data.table(obs = sign(as.numeric(round(Amat %*% x - bvec))),
-                                  sense)
-  chk = .check(sol$x)
+    .check = function(x) data.table(obs = sign(as.numeric(round(Amat %*% x - bvec))),
+                                    sense)
+    chk = .check(sol$x)
 
-  if (any(is.na(sol$x)))
-    stop('Rcplex did not converge or failed to find a solution, please run with verbose = 2 to get more detailed output')
+    if (any(is.na(sol$x)))
+        stop('Rcplex did not converge or failed to find a solution, please run with verbose = 2 to get more detailed output')
 
-  if (chk[sense == 'E', any(obs != 0, na.rm = TRUE)] |
-      chk[sense == 'G', any(obs < 0, na.rm = TRUE)] |
-      chk[sense == 'L', any(obs > 0, na.rm = TRUE)])
-    stop('Constraint violation likely due to M parameter being too large for problem causing CPLEX numerical instability, consider lowering M parameter')
+    if (chk[sense == 'E', any(obs != 0, na.rm = TRUE)] |
+        chk[sense == 'G', any(obs < 0, na.rm = TRUE)] |
+        chk[sense == 'L', any(obs > 0, na.rm = TRUE)])
+        stop('Constraint violation likely due to M parameter being too large for problem causing CPLEX numerical instability, consider lowering M parameter')
 
-   ##.obj = function(x) 0.5 * rbind(x) %*% Qmat %*% cbind(x) + cvec %*% x
+    ##.obj = function(x) 0.5 * rbind(x) %*% Qmat %*% cbind(x) + cvec %*% x
 
- 
-  ## update graph  
-  nmark = vars[type == 'node', .(nid = abs(snode.id), cn = round(x))]
-  emark = vars[type == 'edge', .(eid = abs(sedge.id), cn = round(x))]
+    
+    ## update graph  
+    nmark = vars[type == 'node', .(nid = abs(snode.id), cn = round(x))]
+    emark = vars[type == 'edge', .(eid = abs(sedge.id), cn = round(x))]
 
-  loosei = vars[type == 'loose.in' & snode.id>0, .(cn = round(x)), keyby = snode.id]
-  looseo = vars[type == 'loose.out' & snode.id>0, .(cn = round(x)), keyby = snode.id]
+    ## add haplotype label if available
+    if (phased) {
+        ## browser()
+        tmp = vars[type == "haplotype", .(sum = sum(x)), by = og.node.id]
+        ## vars[type == "haplotype", .(sum = sum(x)), by = og.node.id][sum != 1, og.node.id]
+        ## vars[og.node.id == 1 & type == "haplotype"]
+        haplotypes = vars[type == "haplotype", .(nid = abs(snode.id), haplotype = ifelse(x == 1, "h1", "h2"))]
+        gg$nodes[haplotypes$nid]$mark(haplotype = haplotypes$haplotype)
+    }
 
-  nodes = gg$nodes[loosei$snode.id] ## need to do this to use nodes active binding settings
-  nodes$loose.left = loosei$cn>0
+    loosei = vars[type == 'loose.in' & snode.id>0, .(cn = round(x)), keyby = snode.id]
+    looseo = vars[type == 'loose.out' & snode.id>0, .(cn = round(x)), keyby = snode.id]
 
-  nodes = gg$nodes[looseo$snode.id] ## need to do this to use nodes active binding settings
-  nodes$loose.right = looseo$cn>0
+    nodes = gg$nodes[loosei$snode.id] ## need to do this to use nodes active binding settings
+    nodes$loose.left = loosei$cn>0
 
-  gg$nodes$mark(loose.cn.left = 0, loose.cn.right = 0)
-  gg$nodes[loosei$snode.id]$mark(loose.cn.left = loosei$cn)
-  gg$nodes[looseo$snode.id]$mark(loose.cn.right = looseo$cn)
+    nodes = gg$nodes[looseo$snode.id] ## need to do this to use nodes active binding settings
+    nodes$loose.right = looseo$cn>0
 
-  ## cache old cn values
-  gg$nodes$mark(cn.old = gg$nodes$dt$cn)
-  gg$edges$mark(cn.old = gg$edges$dt$cn)
-  gg$nodes$mark(cn = NULL) ## reset to avoid weird type casting issue
-  gg$edges$mark(cn = NULL) ## reset to avoid weird type casting issue
-  gg$nodes[nmark$nid]$mark(cn = nmark$cn)
-  gg$edges[emark$eid]$mark(cn = emark$cn)
-  gg$set(y.field = 'cn')
+    gg$nodes$mark(loose.cn.left = 0, loose.cn.right = 0)
+    gg$nodes[loosei$snode.id]$mark(loose.cn.left = loosei$cn)
+    gg$nodes[looseo$snode.id]$mark(loose.cn.right = looseo$cn)
 
-  gg$set(obj = sol$obj)
+    ## cache old cn values
+    gg$nodes$mark(cn.old = gg$nodes$dt$cn)
+    gg$edges$mark(cn.old = gg$edges$dt$cn)
+    gg$nodes$mark(cn = NULL) ## reset to avoid weird type casting issue
+    gg$edges$mark(cn = NULL) ## reset to avoid weird type casting issue
+    gg$nodes[nmark$nid]$mark(cn = nmark$cn)
+    gg$edges[emark$eid]$mark(cn = emark$cn)
+    gg$set(y.field = 'cn')
 
-##  fix loose ends
-  nodes = gg$nodes 
-  nodes$loose.left = nodes$dt$loose.cn.left>0
-  nodes$loose.right = nodes$dt$loose.cn.right>0
+    gg$set(obj = sol$obj)
 
-  ## if phased, mark edges with different colors to make it easier to visualize
-  if (phased) {
-    if (verbose) {
-      message("formatting phased graph...")
+    ##  fix loose ends
+    nodes = gg$nodes 
+    nodes$loose.left = nodes$dt$loose.cn.left>0
+    nodes$loose.right = nodes$dt$loose.cn.right>0
+
+    ## if phased, mark edges with different colors to make it easier to visualize
+    if (phased) {
+        if (verbose) {
+            message("formatting phased graph...")
+        }
+        ## edge formatting
+        ref.edge.col = alpha("blue", 0.2)
+        alt.edge.col = alpha("red", 0.4)
+        ref.edge.lwd = 0.5
+        alt.edge.lwd = 1.0
+        edge.col = ifelse(gg$edges$dt$type == "REF", ref.edge.col, alt.edge.col)
+        edge.lwd = ifelse(gg$edges$dt$type == "REF", ref.edge.lwd, alt.edge.lwd)
+        gg$edges$mark(col = edge.col, lwd = edge.lwd)
+
+        ## mark zero cn edges
+        zero.cn.col = alpha("gray", 0)
+        zero.cn.lwd = 0.5
+        zero.cn.edges = which(gg$edges$dt$cn == 0)
+        gg$edges[zero.cn.edges]$mark(col = zero.cn.col, lwd = zero.cn.lwd)
     }
-    ## edge formatting
-    ref.edge.col = alpha("blue", 0.2)
-    alt.edge.col = alpha("red", 0.4)
-    ref.edge.lwd = 0.5
-    alt.edge.lwd = 1.0
-    edge.col = ifelse(gg$edges$dt$type == "REF", ref.edge.col, alt.edge.col)
-    edge.lwd = ifelse(gg$edges$dt$type == "REF", ref.edge.lwd, alt.edge.lwd)
-    gg$edges$mark(col = edge.col, lwd = edge.lwd)
-
-    ## mark zero cn edges
-    zero.cn.col = alpha("gray", 0)
-    zero.cn.lwd = 0.5
-    zero.cn.edges = which(gg$edges$dt$cn == 0)
-    gg$edges[zero.cn.edges]$mark(col = zero.cn.col, lwd = zero.cn.lwd)
-  }
     if (debug) {
         return(list(gg = gg, sol = sol))
     }    
@@ -2563,7 +2626,6 @@ find_na_ranges = function(gg, min.bins = 1, min.width = 5e3, verbose = FALSE) {
 
     ## reduce NA nodes
     na.nodes.gr = gg.nodes.gr %Q% (na.node == TRUE)
-    ## browser()
 
     ## if there are magically not any of these, return
     if (length(na.nodes.gr) == 0) {
@@ -2572,12 +2634,6 @@ find_na_ranges = function(gg, min.bins = 1, min.width = 5e3, verbose = FALSE) {
     
     na.nodes.gr = gr.reduce(na.nodes.gr, by = "na.node")
 
-    ## na.nodes.gr = gr.val(na.nodes.gr, target = gg$nodes$gr[, "nbins"],
-    ##                      val = "nbins",
-    ##                      weighted = FALSE,
-    ##                      FUN = sum,
-    ##                      na.rm = TRUE)
-
     ## only keep relatively large NA regions... for isolated small ROH we may not want to do this.
     na.nodes.gr = na.nodes.gr %Q% (width(na.nodes.gr) > min.width)
 
@@ -2661,6 +2717,37 @@ find_na_ranges = function(gg, min.bins = 1, min.width = 5e3, verbose = FALSE) {
     return(dt2gr(na.nodes.dt[, .(seqnames, start, end)], seqlengths = seqlengths(gg$nodes$gr)))
 }
 
+#' @name find_na_nodes_helper
+#' @title find_na_nodes_helper
+#'
+#' @ description
+#'
+#' An extremely simple function that calls ROH based on gaps between hets
+#'
+#' @param gg (gGraph)
+#' @param hets (GRanges)
+#' @param min.width (numeric) minimum ROH width
+#' @param phase.blocks (GRanges) NOT IMPLEMENTED YET
+#' @param verbose (logical) default 1
+#'
+#' @return GRanges representing nodes falling within ROH with width > min.width
+find_na_nodes_helper = function(gg, hets,
+                           min.width = 1e4,
+                           phase.blocks = NULL,
+                           verbose = TRUE) {
+
+    ## get unique GRanges for hets
+    unique.hets = gr.stripstrand(unique(hets[, c()]))
+    all.gaps = gaps(unique.hets)
+    big.gaps = all.gaps %Q% (strand(all.gaps)=="*" & width(all.gaps) > min.width)
+    big.gaps$roh.id = 1:length(big.gaps)
+    start.gaps = gr.start(gg$nodes$gr[, c()]) %$% big.gapsn
+    end.gaps = gr.end(gg$nodes$gr[, c()]) %$% big.gaps
+    unphased.nodes = which(start.gaps$roh.id == end.gaps$roh.id)
+    unphased.gr = gg$nodes[unphased.nodes]$gr[, c()]
+    return(unphased.gr)
+}
+
 #' @name unphase_na_ranges
 #' @title unphase_na_ranges
 #'
@@ -2800,6 +2887,7 @@ unphase_na_ranges = function(gg, min.bins = 1, min.width = 5e3, phase.blocks = N
 #'
 #' @param gg junction-balanced phased gGraph. each node must have metadata og.nodes.id, allele, nbins, cn
 #' @param min.bins (numeric) minimum number of bins to be marked as an NA node. default 1
+#' @param min.width (numeric) min width (bp) for a region to be considered a ROH
 #' @param phase.blocks (GRanges) granges of phase blocks from linked reads. default = NULL
 #' @param mc.cores (int) number of cores. default = 8.
 #' @param verbose (bool) verbose > 0 prints stuff. default 1.
@@ -2807,7 +2895,11 @@ unphase_na_ranges = function(gg, min.bins = 1, min.width = 5e3, phase.blocks = N
 #' @return balanced gGraph with unphased nodes marked and compressed
 #'
 #' @export
-phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores = 8, verbose = 1)
+phased.postprocess = function(gg,
+                              min.bins = 1,
+                              min.width = 1e4,
+                              phase.blocks = NULL,
+                              mc.cores = 8, verbose = 1)
 {
     ## check that gg nodes and edges have og node
     if (!("og.node.id" %in% colnames(gg$nodes$dt)) | !("og.edge.id" %in% colnames(gg$edges$dt))) {
@@ -2901,7 +2993,7 @@ phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores =
         }
 
     if (min.bins) {
-        na.gr = find_na_ranges(gg, min.bins = min.bins)
+        na.gr = find_na_ranges(gg, min.bins = min.bins, min.width = min.width)
         all.seed.gr = gr.reduce(c(seed.gr, na.gr))
     } else {
         na.gr = GRanges()
@@ -3112,7 +3204,7 @@ phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores =
 #' @param vbase.prop.thres (float) proportion of allele excess required to phase edges (default 0.9)
 #' @param min.bins (numeric) minimum number of bins for intra segment variance (default 3)
 #' @param min.var (numeric) min allowable variance (default 0.1)
-#' @param max.span (numeric) max span before penalizing CNLOH
+#' @param max.span (numeric) max span before penalizing CNLOH (NOT USED??)
 #' @param verbose (bool) default TRUE for debugging
 #' @param min.width (numeric) min allowable width for cnloh-adjacent node. default 1 Mbp
 #' @param mc.cores (int) number of cores
@@ -3127,9 +3219,9 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL,
                            edge.phase.dt = NULL,
                            vbase.count.thres = 5, vbase.prop.thres = 0.9,
                            min.bins = 3, min.var = 1e-3,
-                           max.span = 1e6,
+                           max.span = 0,
                            min.width = 5e3,
-                           fix.tiny = TRUE,
+                           fix.tiny = FALSE,
                            verbose = TRUE, mc.cores = 8)
 {
     if (verbose) {
@@ -3236,7 +3328,7 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL,
         gg$edges$mark(cnloh = FALSE)
         gg$edges[internal.edges]$mark(cnloh = TRUE)
 
-        browser()
+        ## browser()
 
         if (verbose) {
             message("Number of internal edges marked in parent graph: ", length(internal.edges))
@@ -3245,59 +3337,59 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL,
         gg$edges$mark(cnloh = FALSE)
     }
 
-    if (verbose) {
-        message("Marking pseudo-CNLOH")
-    }
-
-    ## pull intra-chromosomal ALT edges, because these are cnloh candidates
-    pseudo.cnloh.junctions = gg$junctions[class %in% c("DEL-like", "INV-like", "DUP-like")]
-    pseudo.cnloh.junctions.dt = pseudo.cnloh.junctions$dt
-
-    if (nrow(pseudo.cnloh.junctions.dt)) {
-
-        ## grab span - limit to below max.span
-        pseudo.cnloh.junctions.dt[, span := pseudo.cnloh.junctions$span]
-
-        ## compute node overlap with shadow
-        ## node.overlap = pseudo.cnloh.junctions$shadow %N% gg$nodes$gr
-        ## pseudo.cnloh.junctions.dt[, node.overlap.count := node.overlap]
-
-        ## mark candidates
-        ## pseudo.cnloh.edges = c(pseudo.cnloh.junctions.dt[span < max.span &
-        ##                                                  class == "DEL-like" &
-        ##                                                  node.overlap.count <= 3, edge.id],
-        ##                        pseudo.cnloh.junctions.dt[span < max.span &
-        ##                                                  class == "INV-like" &
-        ##                                                  node.overlap.count <= 2, edge.id],
-        ##                        pseudo.cnloh.junctions.dt[span < max.span &
-        ##                                                  class == "DUP-like" &
-        ##                                                  node.overlap.count <= 1, edge.id])
-
-        #' zchoo Friday, Jul 30, 2021 11:31:11 AM
-        ## changed this to include any junction with span under max.span
-        pseudo.cnloh.edges = c(pseudo.cnloh.junctions.dt[span < max.span &
-                                                         class == "DEL-like", edge.id],
-                               pseudo.cnloh.junctions.dt[span < max.span &
-                                                         class == "INV-like", edge.id],
-                               pseudo.cnloh.junctions.dt[span < max.span &
-                                                         class == "DUP-like", edge.id])
+    ## if (verbose) {
+    ##     message("Marking pseudo-CNLOH")
+    ## }
+
+    ## ## pull intra-chromosomal ALT edges, because these are cnloh candidates
+    ## pseudo.cnloh.junctions = gg$junctions[class %in% c("DEL-like", "INV-like", "DUP-like")]
+    ## pseudo.cnloh.junctions.dt = pseudo.cnloh.junctions$dt
+
+    ## if (nrow(pseudo.cnloh.junctions.dt)) {
+
+    ##     ## grab span - limit to below max.span
+    ##     pseudo.cnloh.junctions.dt[, span := pseudo.cnloh.junctions$span]
+
+    ##     ## compute node overlap with shadow
+    ##     ## node.overlap = pseudo.cnloh.junctions$shadow %N% gg$nodes$gr
+    ##     ## pseudo.cnloh.junctions.dt[, node.overlap.count := node.overlap]
+
+    ##     ## mark candidates
+    ##     ## pseudo.cnloh.edges = c(pseudo.cnloh.junctions.dt[span < max.span &
+    ##     ##                                                  class == "DEL-like" &
+    ##     ##                                                  node.overlap.count <= 3, edge.id],
+    ##     ##                        pseudo.cnloh.junctions.dt[span < max.span &
+    ##     ##                                                  class == "INV-like" &
+    ##     ##                                                  node.overlap.count <= 2, edge.id],
+    ##     ##                        pseudo.cnloh.junctions.dt[span < max.span &
+    ##     ##                                                  class == "DUP-like" &
+    ##     ##                                                  node.overlap.count <= 1, edge.id])
+
+    ##     #' zchoo Friday, Jul 30, 2021 11:31:11 AM
+    ##     ## changed this to include any junction with span under max.span
+    ##     pseudo.cnloh.edges = c(pseudo.cnloh.junctions.dt[span < max.span &
+    ##                                                      class == "DEL-like", edge.id],
+    ##                            pseudo.cnloh.junctions.dt[span < max.span &
+    ##                                                      class == "INV-like", edge.id],
+    ##                            pseudo.cnloh.junctions.dt[span < max.span &
+    ##                                                      class == "DUP-like", edge.id])
         
-        ## pseudo.cnloh.edges = pseudo.cnloh[span < max.span & type == "ALT", edge.id]
-        gg$edges[pseudo.cnloh.edges]$mark(cnloh = TRUE)
+    ##     ## pseudo.cnloh.edges = pseudo.cnloh[span < max.span & type == "ALT", edge.id]
+    ##     gg$edges[pseudo.cnloh.edges]$mark(cnloh = TRUE)
 
-        if (verbose) {
-            message("Number of pseudo-CNLOH edges marked:", length(pseudo.cnloh.edges))
-        }
+    ##     if (verbose) {
+    ##         message("Number of pseudo-CNLOH edges marked:", length(pseudo.cnloh.edges))
+    ##     }
 
-    } else {
+    ## } else {
 
-        gg$edges$mark(cnloh = FALSE)
+    ##     gg$edges$mark(cnloh = FALSE)
 
-        if (verbose) {
-            message("No pseudo-CNLOH edges detected.")
-        }
+    ##     if (verbose) {
+    ##         message("No pseudo-CNLOH edges detected.")
+    ##     }
 
-    }
+    ## }
 
     
 
@@ -3390,6 +3482,9 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL,
     dt = ov[, .(mean = mean(cn, na.rm = T), var = var(cn, na.rm = T), nbins = .N), by = query.id]
     ov.match = match(1:length(phased.gg.nodes), dt$query.id)
 
+    ## make sure that the variance is at least as large as the mean
+    dt[var < mean, var := mean]
+
     ## add information to nodes
     phased.gg.nodes$cn = dt$mean[ov.match]
     phased.gg.nodes$var = dt$var[ov.match]
@@ -3402,6 +3497,14 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL,
                                     (phased.gg.nodes$nbins / (2 * sqrt(phased.gg.nodes$var))),
                                     NA)
 
+    ## unset weights if far from marginal
+    marginal.dt = as.data.table(phased.gg.nodes)[, .(marginal.cn, og.node.id, cn, allele)]
+    marginal.dt[, delta.estimate := marginal.cn - sum(.SD$cn), by = og.node.id]
+    bad.major.nodes = which(marginal.dt[, delta.estimate] > 5 & marginal.dt[, allele == "major"])
+    ## browser()
+    phased.gg.nodes$weight[bad.major.nodes] = 0
+    phased.gg.nodes$cn[bad.major.nodes] = NA
+
     if (verbose) {
         message("Preparing phased gGraph edges")
     }
@@ -3545,14 +3648,6 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL,
             message("Fixing very tiny nodes")
         }
 
-        ## browser()
-        ## tiny.nodes = phased.gg$nodes$dt[width < min.width, node.id]
-        ## tiny.edges = phased.gg$edges$dt[(n1 %in% tiny.nodes | n2 %in% tiny.nodes) &
-        ##                                 type == "REF" &
-        ##                                 connection == "cross", edge.id]
-
-        ## phased.gg$edges[tiny.edges]$mark(ub = 0, lb = 0)
-
         tmp = phased.gg$junctions$dt[, .(edge.id, type, class, connection)]
         tmp[, span := phased.gg$junctions$span]
         
@@ -3583,6 +3678,9 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL,
     phased.gg$nodes$mark(col = ifelse(phased.gg$nodes$dt$allele == "major", major.node.col, minor.node.col),
                          ywid = 0.8)
 
+    ## mark edge span
+    phased.gg$edges$mark(span = phased.gg$junctions$span)
+
     return(phased.gg)
 }
 

From bbc6ffdc224fb7cf9e877a735f45f946d11e7e7d Mon Sep 17 00:00:00 2001
From: Zi-Ning Choo <zining01@gmail.com>
Date: Tue, 22 Mar 2022 18:33:44 -0400
Subject: [PATCH 32/35] remove some merge conflict symbols

---
 R/apps.R | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/R/apps.R b/R/apps.R
index 34c8faef..755cab5f 100644
--- a/R/apps.R
+++ b/R/apps.R
@@ -2175,7 +2175,6 @@ binstats = function(gg, bins, by = NULL, field = NULL, purity = gg$meta$purity,
   return(gg)
 }
 
-<<<<<<< HEAD
 #' @name find_na_ranges
 #' @title find_na_ranges
 #'
@@ -3077,7 +3076,7 @@ phased.binstats = function(gg, bins = NULL, purity = NULL, ploidy = NULL,
     phased.gg.edges[n1.chr == n2.chr & n1.allele == n2.allele, connection := "straight"]
     phased.gg.edges[n1.chr == n2.chr & n1.allele != n2.allele, connection := "cross"]
 
-<<<<<<< HEAD
+
     ## add phase block information to edges (for linked reads)
     if (!is.null(phase.blocks)) {
 

From 858c3a6f38d4aae5489825a17b184187d0dfdb05 Mon Sep 17 00:00:00 2001
From: Zi-Ning Choo <zining01@gmail.com>
Date: Thu, 21 Apr 2022 15:47:23 -0400
Subject: [PATCH 33/35] update binstats

---
 R/apps.R | 189 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 188 insertions(+), 1 deletion(-)

diff --git a/R/apps.R b/R/apps.R
index fb883010..7452473b 100644
--- a/R/apps.R
+++ b/R/apps.R
@@ -442,6 +442,14 @@ balance = function(gg,
         }
     }
 
+    ## add telomeric annotation
+    qtips = gr.end(si2gr(seqlengths(gg$nodes))) ## location of q arm tips
+    term.in = c(which(start(gg$nodes$gr) == 1), ## beginning of chromosome
+                -which(gg$nodes$gr %^% qtips)) ## flip side of chromosome end
+    term.out = -term.in ## out is reciprocal of in
+
+    ## annotate loose indicators with this
+    vars[!is.na(snode.id), telomeric := ifelse(snode.id %in% term.in | snode.id %in% term.out, TRUE, FALSE)]
 
     if (phased) {
         ## add allele information and og.node.id
@@ -2834,8 +2842,187 @@ phased.postprocess = function(gg, min.bins = 1, phase.blocks = NULL, mc.cores =
 #' Given a balanced unphased genome graph with field $cn populated with integer values
 #' Constructs a "melted" haplotype graph with fields $cn and $allele
 #'
+#' In addition the following fields are populated in the node metadata
+#' - cn
+#' - allele
+#' - weight
+#'
+#' And, the following fields are populated in the edge metadata:
+#' - connection (either straight or cross, depending on whether the edge goes from major to minor
+#' - n1.allele (either major or minor)
+#' - n2.allele (either major or minor)
+#' - n1.chr (n1 seqname)
+#' - n2.chr (n2 seqname)
+#'
 #' @param gg (gGraph) gGraph with field $cn populated with integer copy number
-#' @param hets
+#' @param bins (GRanges) with fields count.field and allele.field
+#' @param purity (numeric) sample purity, default gg$meta$purity
+#' @param ploidy (numeric) sample ploidy, default gg$meta$ploidy (not needed?)
+#' @param allele.field (character) the field in bins giving major/minor allele, default $allele
+#' @param count.field (character) the field in bins giving SNP read counts, default $count
+#' @param verbose (logical) print stuff? default FALSE
+#'
+#' @return melted gGraph
+phased.binstats = function(gg, bins,
+                           purity = gg$meta$purity,
+                           ploidy = gg$meta$ploidy,
+                           count.field = "count",
+                           allele.field = "allele",
+                           verbose = FALSE)
+{
+
+    ## helper function to compute ploidy at het SNPs
+    ## (might be slightly different from overall ploidy)
+    ## (due to uneven distribution of hets)
+    local.ploidy = function(gg, sites) {
+        sites = sites[, c()] %$% gg$nodes$gr[, "cn"]
+        return(mean(values(sites)[, "cn"], na.rm = TRUE))
+    }
+
+    ## transform relative CN (SNP counts) to absolute multiplicity
+    ## given purity and ploidy
+    reads.to.allele.cn = function(bins, count.field, purity, ploidy) {
+        y = values(bins)[[count.field]]
+        y.bar = 2 * mean(y, na.rm = TRUE)
+
+        ## purity and ploidy
+        alpha = purity
+        tau = ploidy
+
+        ## linear equation
+        denom = alpha * tau + 2 * (1 - alpha)
+        beta = (y.bar * alpha) / denom
+        gamma =(y.bar * (1 - alpha)) / denom
+
+        cn = (y - gamma) / beta
+        return(cn)
+    }
+
+    if (verbose) {
+        message("Checking inputs")
+    }
+    if (is.null(purity)) {
+        warning("Purity not provided, setting to 1.0")
+        purity = 1
+    }
+    if (is.null(ploidy)) {
+        warning("Ploidy not provided, setting to 2.0")
+        ploidy = 2
+    }
+    if (!inherits(bins, 'GRanges')) {
+        stop("bins must be GRanges")
+    }
+    if (!(count.field %in% names(values(bins)))) {
+        stop("count.field not found in bins metadata")
+    }
+    if (!(allele.field %in% names(values(bins)))) {
+        stop("allele.field not found in bins metadata")
+    }
+    allele.values = unique(values(bins)[[allele.field]])
+    if (!("major" %in% allele.values) | !("minor" %in% allele.values)) {
+        stop("allele.field must contain labels 'major' and 'minor'")
+    }
+
+    if (verbose) { message("Getting ploidy from gGraph") }
+    ploidy = local.ploidy(gg, bins %Q% (values(bins)[, allele.field] == "major"))
+
+    if (verbose) { message("Performing allelic rel2abs transformation") }
+    allele.cn.bins = bins[, c()]
+    values(allele.cn.bins)[, "acn"] = reads.to.allele.cn(bins, count.field, purity, ploidy)
+    values(allele.cn.bins)[, "allele"] = values(bins)[, allele.field]
+
+    ## STRETCH OUT THE BINS!
+    if (verbose) { message("Stretching out bins") }
+    allele.cn.bins = gr.stripstrand(allele.cn.bins)
+    bin.gaps = gaps(allele.cn.bins)
+    bin.gaps = bin.gaps %Q% (strand(bin.gaps) == "*")
+    bin.gaps = resize(bin.gaps, width = width(bin.gaps) + 1, fix = "start")
+    bin.gaps = gr.val(query = bin.gaps[, c()],
+                      target = allele.cn.bins %Q% (allele == "minor"),
+                      val = "acn",
+                      mean = TRUE,
+                      na.rm = TRUE)
+    
+    if (verbose) { message("Evaluating allelic bins over graph") }
+    cn.dt = gr.findoverlaps(query = gg$nodes$gr[, c()],
+                            subject = bin.gaps,
+                            return.type = "data.table")
+
+    cn.dt[, minor.cn := values(bin.gaps)[, "acn"][subject.id]]
+
+    ## collapse by node
+    collapsed.cn.dt = cn.dt[, .(minor.cn = mean(minor.cn, na.rm = TRUE),
+                                nsites = .N,
+                                variance = var(minor.cn, na.rm = TRUE)),
+                            by = query.id]
+                                
+    collapsed.cn.dt[, cn.total := values(gg$nodes$gr)[, "cn"][query.id]]
+    collapsed.cn.dt[, major.cn := cn.total - minor.cn]
+    collapsed.cn.dt[, corrected.variance := pmax(pmax(variance, minor.cn), 1)]
+    collapsed.cn.dt[, weight := nsites / sqrt(corrected.variance)]
+
+    ## duplicate ranges to make phased gGraph nodes
+    og.nodes.gr = gg$nodes$gr[, c()]
+    values(og.nodes.gr)[, "og.node.id"] = values(gg$nodes$gr)[, "node.id"]
+    pmt = match(1:length(og.nodes.gr), collapsed.cn.dt[, query.id])
+    values(og.nodes.gr)[, "cn.total"] = collapsed.cn.dt[, cn.total][pmt]
+    values(og.nodes.gr)[, "cn.high"] = collapsed.cn.dt[, major.cn][pmt]
+    values(og.nodes.gr)[, "cn.low"] = collapsed.cn.dt[, minor.cn][pmt]
+    values(og.nodes.gr)[, "weight"] = collapsed.cn.dt[, weight][pmt]
+
+    ## prepare nodes for melted graph
+    phased.gg.nodes = c(og.nodes.gr, og.nodes.gr)
+    values(phased.gg.nodes)[, "cn"] = c(values(og.nodes.gr)[, "cn.high"], values(og.nodes.gr)[, "cn.low"])
+    values(phased.gg.nodes)[, "allele"] = c(rep("major", length(og.nodes.gr)), rep("minor", length(og.nodes.gr)))
+
+    ## prepare edges for melted graph
+    phased.gg.edges = rbind(
+        gg$edges$dt[, .(n1, n2, n1.side, n2.side, type,
+                        og.edge.id = edge.id,
+                        n1.allele = "major",
+                        n2.allele = "major")],
+        gg$edges$dt[, .(n1 = n1 + length(og.nodes.gr), n2 = n2 + length(og.nodes.gr), type,
+                        n1.side, n2.side,
+                        og.edge.id = edge.id,
+                        n1.allele = "minor",
+                        n2.allele = "minor")],
+        gg$edges$dt[, .(n1, n2 = n2 + length(og.nodes.gr), type,
+                        n1.side, n2.side,
+                        og.edge.id = edge.id,
+                        n1.allele = "major",
+                        n2.allele = "minor")],
+        gg$edges$dt[, .(n1 = n1 + length(og.nodes.gr), n2, type,
+                        n1.side, n2.side,
+                        og.edge.id = edge.id,
+                        n1.allele = "minor",
+                        n2.allele = "major")]
+    )
+
+    ## add n1/n2 chromosome information
+    phased.gg.edges[, ":="(n1.chr = seqnames(phased.gg.nodes)[n1] %>% as.character,
+                           n2.chr = seqnames(phased.gg.nodes)[n2] %>% as.character)]
+
+    ## add edge connection type (straight/cross)
+    phased.gg.edges[n1.chr == n2.chr & n1.allele == n2.allele, connection := "straight"]
+    phased.gg.edges[n1.chr == n2.chr & n1.allele != n2.allele, connection := "cross"]
+
+    phased.gg = gG(nodes = phased.gg.nodes, edges = phased.gg.edges)
+
+    ref.edge.col = alpha("blue", 0.3)
+    alt.edge.col = alpha("red", 0.3)
+    ref.edge.lwd = 0.5
+    alt.edge.lwd = 1.0
+    phased.gg$edges$mark(col = ifelse(phased.gg$edges$dt$type == "REF", ref.edge.col, alt.edge.col),
+                         lwd = ifelse(phased.gg$edges$dt$type == "REF", ref.edge.lwd, alt.edge.lwd))
+
+    major.node.col = alpha("red", 0.5)
+    minor.node.col = alpha("blue", 0.5)
+    phased.gg$nodes$mark(col = ifelse(phased.gg$nodes$dt$allele == "major", major.node.col, minor.node.col),
+                         ywid = 0.8)
+
+    return(phased.gg)
+}    
+                           
 
 
 #' @name phased.binstats.legacy

From a8929eb8642449d9ad4189467647d3e87a3eaf03 Mon Sep 17 00:00:00 2001
From: Zi-Ning Choo <zining01@gmail.com>
Date: Mon, 30 May 2022 16:29:44 -0400
Subject: [PATCH 34/35] check in changes for phasing

---
 R/apps.R | 106 ++++++++++++++++++++++---------------------------------
 1 file changed, 42 insertions(+), 64 deletions(-)

diff --git a/R/apps.R b/R/apps.R
index ca14d751..5e9755a6 100644
--- a/R/apps.R
+++ b/R/apps.R
@@ -509,7 +509,7 @@ balance = function(gg,
                 ## browser()
                 vars[, cnloh := FALSE]
                 vars[(type == "edge.indicator" | type == "edge" | type == "eresidual") &
-                     ref.or.alt == "ALT" & (abs(sedge.id) %in% cnloh.edges),
+                     ref.or.alt == "ALT" & (abs(sedge.id) %in% cnloh.edges) & (sedge.id > 0),
                      ":="(cnloh = TRUE)]
                 
             } else {
@@ -674,9 +674,9 @@ balance = function(gg,
     ## vars[type %in% c('node', 'edge'), lb := ifelse(is.na(lb), 0, pmax(lb, 0, na.rm = TRUE)]
     ## vars[type %in% c('node', 'edge'), ub := ifelse(is.na(ub), M, pmin(ub, M, na.rm = TRUE))]
     vars[type %in% c('loose.in', 'loose.out'), ":="(lb = 0, ub = Inf)]
-    
-    vars[type %in% c('edge'), reward := pmax(reward, 0, na.rm = TRUE)]
 
+    ## reward shouldn't have to be positive
+    ## vars[type %in% c('edge'), reward := pmax(reward, 0, na.rm = TRUE)]
 
 
     ## figure out junctions and nodes to fix
@@ -1089,42 +1089,44 @@ balance = function(gg,
         ## if (!phased) {
         ## extremity exclusivity (relevant for ALL graphs)
 
-        loose.constraints = rbind(
-            vars[type == "loose.in.indicator" & sign(snode.id) == 1 & telomeric == FALSE,
-                 .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))],
-            vars[type == "loose.out.indicator" & sign(snode.id) == 1 & telomeric == FALSE,
-                 .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))]
-        )
+        if (ism) {
+            loose.constraints = rbind(
+                vars[type == "loose.in.indicator" & sign(snode.id) == 1 & telomeric == FALSE,
+                     .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))],
+                vars[type == "loose.out.indicator" & sign(snode.id) == 1 & telomeric == FALSE,
+                     .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))]
+            )
 
-        edge.constraints = rbind(
-            vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id) == 1,
-                 .(value = 1, id, cid = paste("extremity.exclusivity", ee.id.n1))],
-            vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id) == 1,
-                 .(value = 1, id, cid = paste("extremity.exclusivity", ee.id.n2))]
-        )
+            edge.constraints = rbind(
+                vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id) == 1,
+                     .(value = 1, id, cid = paste("extremity.exclusivity", ee.id.n1))],
+                vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id) == 1,
+                     .(value = 1, id, cid = paste("extremity.exclusivity", ee.id.n2))]
+            )
 
-        constraints = rbind(constraints, loose.constraints, edge.constraints, fill = TRUE)
+            constraints = rbind(constraints, loose.constraints, edge.constraints, fill = TRUE)
 
-        loose.b = unique(loose.constraints[, .(cid, value = 1, sense = "L")], by = "cid")
-        edge.b = unique(edge.constraints[, .(cid, value = 1, sense = "L")], by = "cid")
+            loose.b = unique(loose.constraints[, .(cid, value = 1, sense = "L")], by = "cid")
+            edge.b = unique(edge.constraints[, .(cid, value = 1, sense = "L")], by = "cid")
 
-        b = rbind(b, edge.b, loose.b, fill = TRUE)
+            b = rbind(b, edge.b, loose.b, fill = TRUE)
 
-        ## fix loose ends at zero if they coincide with a called junction
-        edge.ee.ids = unique(c(vars[type == "edge.indicator", ee.id.n1], vars[type == "edge.indicator", ee.id.n2]))
-        edge.ee.ids = edge.ee.ids[!is.na(edge.ee.ids)]
+            ## fix loose ends at zero if they coincide with a called junction
+            edge.ee.ids = unique(c(vars[type == "edge.indicator", ee.id.n1], vars[type == "edge.indicator", ee.id.n2]))
+            edge.ee.ids = edge.ee.ids[!is.na(edge.ee.ids)]
 
-        loose.zeros = rbind(
-            vars[type == "loose.in.indicator" & sign(snode.id) == 1 & ee.id %in% edge.ee.ids,
-                 .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))],
-            vars[type == "loose.out.indicator" & sign(snode.id) == 1 & ee.id %in% edge.ee.ids,
-                 .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))]
-        )
+            loose.zeros = rbind(
+                vars[type == "loose.in.indicator" & sign(snode.id) == 1 & ee.id %in% edge.ee.ids,
+                     .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))],
+                vars[type == "loose.out.indicator" & sign(snode.id) == 1 & ee.id %in% edge.ee.ids,
+                     .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))]
+            )
 
-        loose.zeros.rhs = unique(loose.zeros[, .(cid, value = 0, sense = "E")], by = "cid")
+            loose.zeros.rhs = unique(loose.zeros[, .(cid, value = 0, sense = "E")], by = "cid")
 
-        constraints = rbind(constraints, loose.zeros, fill = TRUE)
-        b = rbind(b, loose.zeros.rhs, fill = TRUE)
+            constraints = rbind(constraints, loose.zeros, fill = TRUE)
+            b = rbind(b, loose.zeros.rhs, fill = TRUE)
+        }
     }
 
     if (phased) {
@@ -1162,31 +1164,6 @@ balance = function(gg,
                         nrow(rhs))
             }
 
-            ## grab node ids associated with ALT edges on the left
-            ## left.og.node.ids = c(gg$edges$dt[n1.side == "left" & type == "ALT", n1],
-            ##                      gg$edges$dt[n2.side == "left" & type == "ALT", n2])
-            ## right.og.node.ids = c(gg$edges$dt[n1.side == "right" & type == "ALT", n1],
-            ##                       gg$edges$dt[n2.side == "right" & type == "ALT", n2])
-
-            ## ## fix loose ends for these nodes to zero
-            ## #' zchoo Thursday, Sep 02, 2021 11:13:44 AM
-            ## #' removed these constraints to see if feasibility improves
-            ## vars[type == "loose.in.indicator" & (snode.id %in% left.og.node.ids),
-            ##      ":="(lb = 0, ub = 0)]
-            ## vars[type == "loose.out.indicator" & (snode.id %in% right.og.node.ids),
-            ##      ":="(lb = 0, ub = 0)]
-            ## vars[type == "loose.in" & (snode.id %in% left.og.node.ids),
-            ##      ":="(lb = 0, ub = 0)]
-            ## vars[type == "loose.out" & (snode.id %in% right.og.node.ids),
-            ##      ":="(lb = 0, ub = 0)]
-
-            ## if (verbose) {
-            ##     message("Number of homologous loose ends: ",
-            ##             length(left.og.node.ids) + length(right.og.node.ids))
-            ## }
-            
-            ## reciprocal homologous extremity exclusivity
-            ## implement configuration indicators (OR constraint)
             config.dt = vars[type == "straight.config" | type == "cross.config",]
             
             config.constraints.lt = rbind(
@@ -1256,11 +1233,6 @@ balance = function(gg,
                      .(value = 1, id, cid = paste("rhee", c))]
             )
 
-            ## filter constraints to only include things with >= 4 entries (e.g. must have an ALT edge)
-            ## rhomol.constraints[, n.entries := .N, by = cid]
-            ## remove this filter! due to some loose end violations!
-            ## rhomol.constraints = rhomol.constraints[n.entries > 3, .(value, id, cid)]
-
             rhs = unique(rhomol.constraints[, .(value = 2, sense = "L", cid)], by = "cid")
 
             if (verbose) {
@@ -1323,7 +1295,7 @@ balance = function(gg,
             }
             
             iconstraints = unique(
-                vars[type == "edge.indicator" & ref.or.alt == "ALT" & cnloh != TRUE,
+                vars[type == "edge.indicator" & ref.or.alt == "ALT" & cnloh != TRUE & sedge.id > 0,
                      .(value = 1, id, og.edge.id,
                        edge.id = abs(sedge.id),
                        cid = paste("edge.indicator.sum.lb", og.edge.id))],
@@ -1336,7 +1308,7 @@ balance = function(gg,
                 fill = TRUE)
 
             edge.indicator.b = unique(
-                vars[type == "edge.indicator" & ref.or.alt == "ALT" & cnloh != TRUE,
+                vars[type == "edge.indicator" & ref.or.alt == "ALT" & cnloh != TRUE & sedge.id > 0,
                      .(value = 1, sense = "G", cid = paste("edge.indicator.sum.lb", og.edge.id))],
                 by = "cid"
             )
@@ -1551,7 +1523,6 @@ balance = function(gg,
 
     ## now Rcplex time
     ## remove any rows with b = NA
-
     ## get rid of any constraints with NA values
     keep.constraints = intersect(b[!is.na(value), cid], constraints[!is.na(value), cid])
     b = b[cid %in% keep.constraints,]
@@ -3140,6 +3111,13 @@ phased.binstats = function(gg, bins,
                         n2.allele = "major")]
     )
 
+    ## transfer edge metadata
+    cols = intersect(setdiff(names(gg$edges$dt), names(phased.gg.edges)), c("cnloh"))
+    if (length(cols)) {
+        pmt = match(phased.gg.edges[, og.edge.id], gg$edges$dt[, edge.id])
+        phased.gg.edges = cbind(phased.gg.edges, gg$edges$dt[pmt, ..cols])
+    }
+    
     ## add n1/n2 chromosome information
     phased.gg.edges[, ":="(n1.chr = seqnames(phased.gg.nodes)[n1] %>% as.character,
                            n2.chr = seqnames(phased.gg.nodes)[n2] %>% as.character)]

From b9359b1768419483d66e4a76e8de0197a35e341f Mon Sep 17 00:00:00 2001
From: SplitInf <Bann13@gmail.com>
Date: Tue, 24 May 2022 23:56:00 -0400
Subject: [PATCH 35/35] Update converters.R

add break condition when reached the end of bedpe file
---
 R/converters.R | 1 +
 1 file changed, 1 insertion(+)

diff --git a/R/converters.R b/R/converters.R
index f406049b..89587950 100755
--- a/R/converters.R
+++ b/R/converters.R
@@ -937,6 +937,7 @@ read.juncs = function(rafile,
             while (grepl("^((#)|(chrom)|(chr))", thisline)) {
                 headers = c(headers, thisline)
                 thisline = readLines(f, 1)
+                if(length(thisline)==0) break
             }
             ln = sum(length(headers), length(thisline))
             while (length(thisline) > 0) {