From b6dd8aea38c504528a4e5f367e3ead9b1e85f64e Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Tue, 24 Sep 2019 23:37:30 -0700 Subject: [PATCH 001/187] updating .travis.yml to do a doc build --- .travis.yml | 14 ++ doc-requirements.txt | 2 +- doc/Makefile | 5 + doc/source/learning/Basic_example.Rmd | 106 ++++++++ doc/source/learning/Basic_example.ipynb | 199 +++++++++++++++ doc/source/learning/Full_model_LASSO.Rmd | 145 +++++++++++ doc/source/learning/Full_model_LASSO.ipynb | 276 +++++++++++++++++++++ doc/source/learning/Learning1.Rmd | 26 -- doc/source/learning/Learning1.ipynb | 63 ----- doc/source/learning/Learning2.Rmd | 26 -- doc/source/learning/Learning2.ipynb | 63 ----- doc/source/learning/index.rst | 4 +- selectinf/learning/core.py | 25 +- selectinf/learning/learners.py | 8 +- selectinf/learning/utils.py | 82 +++--- 15 files changed, 812 insertions(+), 232 deletions(-) create mode 100644 doc/source/learning/Basic_example.Rmd create mode 100644 doc/source/learning/Basic_example.ipynb create mode 100644 doc/source/learning/Full_model_LASSO.Rmd create mode 100644 doc/source/learning/Full_model_LASSO.ipynb delete mode 100644 doc/source/learning/Learning1.Rmd delete mode 100644 doc/source/learning/Learning1.ipynb delete mode 100644 doc/source/learning/Learning2.Rmd delete mode 100644 doc/source/learning/Learning2.ipynb diff --git a/.travis.yml b/.travis.yml index 11e16d88b..881190701 100644 --- a/.travis.yml +++ b/.travis.yml @@ -69,6 +69,12 @@ matrix: env: - INSTALL_TYPE=requirements - DEPENDS= + - python: 3.6 + sudo: true + dist: trusty + env: + - DOC_BUILD=1 + before_install: - source travis-tools/utils.sh - travis_before_install @@ -84,6 +90,14 @@ before_install: install: # Install selectinf + - | + echo "backend : agg" > matplotlibrc + if [ "$DOC_BUILD" ]; then # doc build + pip install -r doc-requirements.txt + cd doc + jupytext --sync source/*/*.ipynb + # Build without the API documentation, for the doctests + make html - if [ "$RUN_R_TESTS" ]; then sudo apt-get install -y r-base r-base-dev r-cran-devtools r-cran-rcpp; pip install rpy2 statsmodels -c constraints.txt ; diff --git a/doc-requirements.txt b/doc-requirements.txt index 864bedd86..37dc7d0d8 100644 --- a/doc-requirements.txt +++ b/doc-requirements.txt @@ -6,9 +6,9 @@ numpydoc matplotlib texext nb2plots -rpy2 seaborn statsmodels tensorflow keras nbsphinx +jupytext diff --git a/doc/Makefile b/doc/Makefile index 1f39aad81..7e84d387b 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -122,3 +122,8 @@ doctest: @echo @echo "The overview file is in build/doctest." +github: html + # Needs ghp-import (pip install ghp-import) + ghp-import -n -p $(BUILDROOT)/html/ + @echo + @echo "Published to Github" diff --git a/doc/source/learning/Basic_example.Rmd b/doc/source/learning/Basic_example.Rmd new file mode 100644 index 000000000..e57d8d571 --- /dev/null +++ b/doc/source/learning/Basic_example.Rmd @@ -0,0 +1,106 @@ +--- +jupyter: + jupytext: + cell_metadata_filter: all,-slideshow + formats: ipynb,Rmd + text_representation: + extension: .Rmd + format_name: rmarkdown + format_version: '1.1' + jupytext_version: 1.1.1 + kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +# Simple example + +Here we run a simple linear regression model (even without intercept) +and make a selection when the $Z$ score is larger than 2. + +The functions `partial_model_inference` and `pivot_plot` below are just simulation utilities +used to simulate results in least squares regression. 
The underlying functionality +is contained in the function `selectinf.learning.core.infer_general_target`. + + +```{python collapsed=TRUE} +import functools + +import numpy as np, pandas as pd +import matplotlib.pyplot as plt +# %matplotlib inline + +from selectinf.tests.instance import gaussian_instance + +from selectinf.learning.utils import partial_model_inference, pivot_plot +from selectinf.learning.core import normal_sampler +from selectinf.learning.Rfitters import logit_fit +``` + +```{python} +np.random.seed(0) # for replicability +def simulate(n=20, p=1, s=1, signal=1, sigma=2, alpha=0.1, B=2000): + + # description of statistical problem + + X, y, truth = gaussian_instance(n=n, + p=p, + s=s, + equicorrelated=False, + rho=0.5, + sigma=sigma, + signal=signal, + random_signs=True, + scale=False)[:3] + + dispersion = sigma**2 + + S = X.T.dot(y) + covS = dispersion * X.T.dot(X) + sampler = normal_sampler(S, covS) + + def base_algorithm(X, dispersion, sampler): + + success = np.zeros(p) + + scale = 0. + noisy_S = sampler(scale=scale) + + Z = noisy_S / np.sqrt(np.linalg.norm(X)**2 * dispersion) + if Z > 2: + return set([0]) + else: + return set([]) + + selection_algorithm = functools.partial(base_algorithm, X, dispersion) + + # run selection algorithm + + return partial_model_inference(X, + y, + truth, + selection_algorithm, + sampler, + B=B, + fit_probability=logit_fit, + fit_args={'df':20}) +``` + +```{python} +dfs = [] +for i in range(1000): + df = simulate() + if df is not None: + dfs.append(df) +``` + +```{python} +fig = plt.figure(figsize=(8, 8)) +results = pd.concat(dfs) +pivot_plot(results, fig=fig); +``` + +```{python collapsed=TRUE} + +``` diff --git a/doc/source/learning/Basic_example.ipynb b/doc/source/learning/Basic_example.ipynb new file mode 100644 index 000000000..6b9989c17 --- /dev/null +++ b/doc/source/learning/Basic_example.ipynb @@ -0,0 +1,199 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Simple example\n", + "\n", + "Here we run a simple linear regression model (even without intercept) \n", + "and make a selection when the $Z$ score is larger than 2.\n", + "\n", + "The functions `partial_model_inference` and `pivot_plot` below are just simulation utilities\n", + "used to simulate results in least squares regression. The underlying functionality\n", + "is contained in the function `selectinf.learning.core.infer_general_target`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import functools\n", + "\n", + "import numpy as np, pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "\n", + "from selectinf.tests.instance import gaussian_instance\n", + "\n", + "from selectinf.learning.utils import partial_model_inference, pivot_plot\n", + "from selectinf.learning.core import normal_sampler\n", + "from selectinf.learning.Rfitters import logit_fit" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jonathantaylor/anaconda/envs/py36/lib/python3.6/site-packages/sklearn/ensemble/weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. 
It will be removed in a future NumPy release.\n", + " from numpy.core.umath_tests import inner1d\n", + "Using TensorFlow backend.\n", + "/Users/jonathantaylor/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:455: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", + " _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n", + "/Users/jonathantaylor/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:456: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", + " _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n", + "/Users/jonathantaylor/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:457: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", + " _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n", + "/Users/jonathantaylor/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:458: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", + " _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n", + "/Users/jonathantaylor/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:459: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", + " _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n", + "/Users/jonathantaylor/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:462: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", + " np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n", + "R[write to console]: Loaded gbm 2.1.5\n", + "\n", + "R[write to console]: randomForest 4.6-14\n", + "\n", + "R[write to console]: Type rfNews() to see new features/changes/bug fixes.\n", + "\n" + ] + } + ], + "source": [ + "np.random.seed(0) # for replicability\n", + "def simulate(n=20, p=1, s=1, signal=1, sigma=2, alpha=0.1, B=2000):\n", + "\n", + " # description of statistical problem\n", + "\n", + " X, y, truth = gaussian_instance(n=n,\n", + " p=p, \n", + " s=s,\n", + " equicorrelated=False,\n", + " rho=0.5, \n", + " sigma=sigma,\n", + " signal=signal,\n", + " random_signs=True,\n", + " scale=False)[:3]\n", + "\n", + " dispersion = sigma**2\n", + "\n", + " S = X.T.dot(y)\n", + " covS = dispersion * X.T.dot(X)\n", + " sampler = normal_sampler(S, covS)\n", + "\n", + " def base_algorithm(X, dispersion, sampler):\n", + "\n", + " success = np.zeros(p)\n", + "\n", + " scale = 0.\n", + " noisy_S = sampler(scale=scale)\n", + " \n", + " Z = noisy_S / np.sqrt(np.linalg.norm(X)**2 * dispersion)\n", + " if Z > 2:\n", + " return set([0])\n", + " else:\n", + " return set([])\n", + "\n", + " selection_algorithm = functools.partial(base_algorithm, X, dispersion)\n", + "\n", + " # run selection algorithm\n", + "\n", + " return partial_model_inference(X,\n", + " y,\n", + " truth,\n", + " selection_algorithm,\n", + " sampler,\n", + " B=B,\n", + " 
fit_probability=logit_fit,\n", + " fit_args={'df':20})" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jonathantaylor/git-repos/selectinf/selectinf/distributions/discrete_family.py:86: RuntimeWarning: divide by zero encountered in log\n", + " self._lw = np.array([np.log(v) for v in xw[:,1]])\n" + ] + } + ], + "source": [ + "dfs = []\n", + "for i in range(1000):\n", + " df = simulate()\n", + " if df is not None:\n", + " dfs.append(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAfoAAAHpCAYAAABqV/58AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0\ndHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nOzdd3hUxdfA8e+kklClhCZVuliJiAUI\nhN6rIIK0nyDSXiwgIE1BsKIoxaBURZDeexUQAUWQJkU60jsJIWXeP24ISXaTbDbb93yeJw/ZuXPv\nHgzx7J05d0ZprRFCCCGEZ/JxdgBCCCGEsB9J9EIIIYQHk0QvhBBCeDBJ9EIIIYQHk0QvhBBCeDBJ\n9EIIIYQH83N2APaQN29eXbx4cWeHIYQQQjjEH3/8cUVrnc/cMY9M9MWLF2f37t3ODkMIIYRwCKXU\nqdSOydC9EEII4cEk0QshhBAeTBK9EEII4cEk0QshhBAeTBK9EEII4cE8sureErdu3eLSpUvExMQ4\nOxRhBX9/f0JCQsiRI4ezQxFCCJfmlYn+1q1bXLx4kcKFCxMUFIRSytkhiQzQWhMVFcW5c+cAJNkL\nIUQavHLo/tKlSxQuXJjg4GBJ8m5IKUVwcDCFCxfm0qVLzg5HCCFcmlcm+piYGIKCgpwdhsikoKAg\nmXoRQoh0eGWiB+RO3gPIz1AIIdLntYleCCGE8AaS6IUQQggPJoneTQ0fPpy8efM6Owy72b9/P0op\nNm3a5OxQhBDCrUmiF0IIITyYUxO9UmqKUuqSUmp/KseVUmqcUuqYUmqfUupZR8cokouJiSEuLs7Z\nYQghhNsaNw5Wr3bc+zl7wZxpwLfAjFSO1wdKJ3w9D0xM+FOk49q1a7z//vssXryYmzdv8uyzzzJ2\n7Fief/7hf74vvviC2bNnc+TIEbJkyULlypUZO3YspUqVSuwTFhZG3rx5qVOnDp988gknT57k5MmT\n/PDDD3z77besXbuWHj16sG/fPsqWLcu4ceOoWrVqsli+//57xo4dy7FjxyhQoAA9e/akf//+yfpM\nmDCB0aNHc+3aNWrWrEmfPn3s+x9I2NbZszB4MPz9N8THOzsaIVzWrdtQ7V/j+4sFIeSVGqivxtr1\nPZ2a6LXWW5RSxdPo0hSYobXWwA6lVC6lVEGt9X+2jMOVntLSOvPXiI6OplatWty4cYPPPvuMkJAQ\nJk6cSK1atTh69CgFChQA4OzZs/Tq1YtixYpx69YtJk2axIsvvsjRo0fJmTNn4vW2bdvG8ePH+eST\nTwgODk48FhkZSceOHenXrx8FChRgxIgRtGjRglOnThEcHAzAZ599xqBBg+jfvz9hYWH88ccfDBky\nhODgYHr16gXA4sWL6dmzJ2+++SbNmjVj8+bNdOnSJfP/IYRjXLwIL7xgJHshRJr8gdxAUYD/YOfc\nElQYCdmy2fFNtdZO/QKKA/tTObYMeDnJ6/VAaHrXrFSpkk7LwYMHk7020qtrfFlq2LBhOk+ePGaP\nff/999rf318fOXIksS0mJkaXLFlSv/vuu2bPiY2N1ZGRkTpbtmx6+vTpie3Vq1fXWbJk0RcuXDB5\nf0CvX78+sW3Pnj0a0CtXrtRaa33z5k2dNWtWPXz48GTnDhkyROfPn1/HxsZqrbV+7rnndL169ZL1\n+d///qcBvXHjxjT/O6T8WQoHi47W+uWXnf+LI1/y5QZfd0DXBF0U9MmEtgU00198kflfRWC31uZz\noscU4ymluimldiuldl++fNnZ4TjVunXrqFSpEiVKlCA2NpbY2FgAqlevzu7duxP77dixg9q1a5Mn\nTx78/PwIDg7mzp07HDlyJNn1KlWqRP78+U3eJyAggLCwsMTXFSpUAIyRAoDffvuNu3fv0rp168Q4\nYmNjqVmzJhcvXuTs2bPExsby559/0rRp02TXbtGihU3+Wwg769cPtm51dhRCuLzbGHPRG4DTQA3g\nFhASAn372ve9nT1Hn55zQJEkrx9NaDOhtY4AIgBCQ0O1/UNzXVeuXGHHjh34+/ubHHvssccAOH36\nNHXq1KFy5cp89913FCpUiICAABo2bMi9e/eSnWMuyQNkz54dH5+HnxUDAgIAEs+/cuUKAI8//rjZ\n88+cOUNgYCBxcXGEhIQkO5bytXBBU6bAhAnOjkIIl3cTI8n/lqTtDSAH8Fwo+Pra9/1dPdEvAXop\npWZjFOHd1Daen/dEuXPnJjQ0lIkTJ5ocCwwMBGDVqlVERkayePFismbNCkBsbCzXrl0zOcfapWZz\n584NwLJly8x+WChbtixBQUH4+vqabE4jm9WkY9EimDULnDl6tX27aZufH/z8MyQp6BTCkx0+DPPm\nwcmT5o9Hx95k/t63iOJgYlsh3mZJlg60mgGlQ3OaP9GGnJrolVI/A2FAXqXUWWAYRq0CWutJwAqg\nAXAMiAQ62yMO7WH3/+Hh4axZs4aiRYumemccFRWFj48Pfn4P/wn88ssvicP8tvDCCy8QFBTE+fPn\nadiwYar9nnnmGRYvXsybb76Z2LZgwQKbxeFxFi+G5s2dHYV5X34JrVo5OwohHOLUKXipJ5i5P0pw\nBSNtHUzS9g3n6cXY6VC6td1DBJxfdf9qOsc10NNB4bid+/fvM2/ePJP2+vXrM2nSJMLCwnj33Xcp\nWbIkV69eZefOnRQoUIB+/fpRs2ZN4uLi6Ny5M127duXAgQN8/vnn5MqVy2bx5cqVi+HDh9O3b19O\nnTpFtWrViI+P58iRI2zcuJGFCxcCMGjQIFq0aEGPHj1o3rw5mzdvZtWqVTaLw6No
...[remaining base64 PNG data omitted; this output cell renders the pivot plot produced by pivot_plot]...",
      "text/plain": [
       "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig = plt.figure(figsize=(8, 8))\n", + "results = pd.concat(dfs)\n", + "pivot_plot(results, fig=fig);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "all,-slideshow", + "formats": "ipynb,Rmd" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/doc/source/learning/Full_model_LASSO.Rmd b/doc/source/learning/Full_model_LASSO.Rmd new file mode 100644 index 000000000..31c9d66a2 --- /dev/null +++ b/doc/source/learning/Full_model_LASSO.Rmd @@ -0,0 +1,145 @@ +--- +jupyter: + jupytext: + cell_metadata_filter: all,-slideshow + formats: ipynb,Rmd + text_representation: + extension: .Rmd + format_name: rmarkdown + format_version: '1.1' + jupytext_version: 1.1.1 + kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +# Inference in the full model + +This is the same example as considered in [Liu et al.](https://arxiv.org/abs/1801.09037) though we +do not consider the special analysis in that paper. We let the computer +guide us in correcting for selection. + +The functions `full_model_inference` and `pivot_plot` below are just simulation utilities +used to simulate results in least squares regression. The underlying functionality +is contained in the function `selectinf.learning.core.infer_full_target`. + +```{python} +import functools + +import numpy as np, pandas as pd +import matplotlib.pyplot as plt +# %matplotlib inline +import regreg.api as rr + +from selectinf.tests.instance import gaussian_instance # to generate the data +from selectinf.learning.core import normal_sampler # our representation of the (limiting) Gaussian data + +from selectinf.learning.utils import full_model_inference, pivot_plot +from selectinf.learning.Rfitters import logit_fit +``` + +We will know generate some data from an OLS regression model and fit the LASSO +with a fixed value of $\lambda$. In the simulation world, we know the +true parameters, hence we can then return +pivots for each variable selected by the LASSO. These pivots should look +(marginally) like a draw from `np.random.sample`. This is the plot below. + +```{python} +np.random.seed(0) # for replicability + +def simulate(n=100, + p=20, + s=5, + signal=(0.5, 1), + sigma=2, + alpha=0.1, + B=4000, + verbose=False): + + # description of statistical problem + + X, y, truth = gaussian_instance(n=n, + p=p, + s=s, + equicorrelated=False, + rho=0.5, + sigma=sigma, + signal=signal, + random_signs=True, + scale=False)[:3] + + dispersion = sigma**2 + + S = X.T.dot(y) + covS = dispersion * X.T.dot(X) + + # this declares our target as linear in S where S has a given covariance + sampler = normal_sampler(S, covS) + + def base_algorithm(XTX, lam, sampler): + + p = XTX.shape[0] + success = np.zeros(p) + + loss = rr.quadratic_loss((p,), Q=XTX) + pen = rr.l1norm(p, lagrange=lam) + + scale = 0. 
+ noisy_S = sampler(scale=scale) + loss.quadratic = rr.identity_quadratic(0, 0, -noisy_S, 0) + problem = rr.simple_problem(loss, pen) + soln = problem.solve(max_its=50, tol=1.e-6) + success += soln != 0 + + return set(np.nonzero(success)[0]) + + XTX = X.T.dot(X) + XTXi = np.linalg.inv(XTX) + resid = y - X.dot(XTXi.dot(X.T.dot(y))) + dispersion = np.linalg.norm(resid)**2 / (n-p) + + lam = 3.5 * np.sqrt(n) + selection_algorithm = functools.partial(base_algorithm, XTX, lam) + if verbose: + print(selection_algorithm(sampler)) + # run selection algorithm + + return full_model_inference(X, + y, + truth, + selection_algorithm, + sampler, + success_params=(1, 1), + B=B, + fit_probability=logit_fit, + fit_args={'df':20}) +``` + +Let's take a look at what we get as a return value: + +```{python} +while True: + df = simulate(verbose=True) + if df is not None: + break +df.columns +``` + +```{python} +dfs = [] +for i in range(10): + df = simulate() + if df is not None: + dfs.append(df) +``` + +```{python} +fig = plt.figure(figsize=(8, 8)) +results = pd.concat(dfs) +pivot_plot(results, fig=fig); +``` + +```{python collapsed=TRUE} + +``` diff --git a/doc/source/learning/Full_model_LASSO.ipynb b/doc/source/learning/Full_model_LASSO.ipynb new file mode 100644 index 000000000..49845025b --- /dev/null +++ b/doc/source/learning/Full_model_LASSO.ipynb @@ -0,0 +1,276 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Inference in the full model\n", + "\n", + "This is the same example as considered in [Liu et al.](https://arxiv.org/abs/1801.09037) though we\n", + "do not consider the special analysis in that paper. We let the computer\n", + "guide us in correcting for selection.\n", + "\n", + "The functions `full_model_inference` and `pivot_plot` below are just simulation utilities\n", + "used to simulate results in least squares regression. The underlying functionality\n", + "is contained in the function `selectinf.learning.core.infer_full_target`." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jonathantaylor/anaconda/envs/py36/lib/python3.6/site-packages/sklearn/ensemble/weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. 
It will be removed in a future NumPy release.\n", + " from numpy.core.umath_tests import inner1d\n", + "Using TensorFlow backend.\n", + "/Users/jonathantaylor/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:455: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", + " _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n", + "/Users/jonathantaylor/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:456: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", + " _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n", + "/Users/jonathantaylor/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:457: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", + " _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n", + "/Users/jonathantaylor/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:458: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", + " _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n", + "/Users/jonathantaylor/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:459: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", + " _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n", + "/Users/jonathantaylor/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:462: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", + " np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n", + "R[write to console]: Loaded gbm 2.1.5\n", + "\n", + "R[write to console]: randomForest 4.6-14\n", + "\n", + "R[write to console]: Type rfNews() to see new features/changes/bug fixes.\n", + "\n" + ] + } + ], + "source": [ + "import functools\n", + "\n", + "import numpy as np, pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "import regreg.api as rr\n", + "\n", + "from selectinf.tests.instance import gaussian_instance # to generate the data\n", + "from selectinf.learning.core import normal_sampler # our representation of the (limiting) Gaussian data\n", + "\n", + "from selectinf.learning.utils import full_model_inference, pivot_plot\n", + "from selectinf.learning.Rfitters import logit_fit" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will know generate some data from an OLS regression model and fit the LASSO\n", + "with a fixed value of $\\lambda$. In the simulation world, we know the\n", + "true parameters, hence we can then return\n", + "pivots for each variable selected by the LASSO. These pivots should look\n", + "(marginally) like a draw from `np.random.sample`. This is the plot below." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(0) # for replicability\n", + "\n", + "def simulate(n=100, \n", + " p=20, \n", + " s=5, \n", + " signal=(0.5, 1), \n", + " sigma=2, \n", + " alpha=0.1, \n", + " B=4000,\n", + " verbose=False):\n", + "\n", + " # description of statistical problem\n", + "\n", + " X, y, truth = gaussian_instance(n=n,\n", + " p=p, \n", + " s=s,\n", + " equicorrelated=False,\n", + " rho=0.5, \n", + " sigma=sigma,\n", + " signal=signal,\n", + " random_signs=True,\n", + " scale=False)[:3]\n", + "\n", + " dispersion = sigma**2\n", + "\n", + " S = X.T.dot(y)\n", + " covS = dispersion * X.T.dot(X)\n", + " \n", + " # this declares our target as linear in S where S has a given covariance\n", + " sampler = normal_sampler(S, covS) \n", + "\n", + " def base_algorithm(XTX, lam, sampler):\n", + "\n", + " p = XTX.shape[0]\n", + " success = np.zeros(p)\n", + "\n", + " loss = rr.quadratic_loss((p,), Q=XTX)\n", + " pen = rr.l1norm(p, lagrange=lam)\n", + "\n", + " scale = 0.\n", + " noisy_S = sampler(scale=scale)\n", + " loss.quadratic = rr.identity_quadratic(0, 0, -noisy_S, 0)\n", + " problem = rr.simple_problem(loss, pen)\n", + " soln = problem.solve(max_its=50, tol=1.e-6)\n", + " success += soln != 0\n", + " \n", + " return set(np.nonzero(success)[0])\n", + "\n", + " XTX = X.T.dot(X)\n", + " XTXi = np.linalg.inv(XTX)\n", + " resid = y - X.dot(XTXi.dot(X.T.dot(y)))\n", + " dispersion = np.linalg.norm(resid)**2 / (n-p)\n", + " \n", + " lam = 3.5 * np.sqrt(n)\n", + " selection_algorithm = functools.partial(base_algorithm, XTX, lam)\n", + " if verbose:\n", + " print(selection_algorithm(sampler))\n", + " # run selection algorithm\n", + "\n", + " return full_model_inference(X,\n", + " y,\n", + " truth,\n", + " selection_algorithm,\n", + " sampler,\n", + " success_params=(1, 1),\n", + " B=B,\n", + " fit_probability=logit_fit,\n", + " fit_args={'df':20})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's take a look at what we get as a return value:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{18, 13, 14}\n" + ] + }, + { + "data": { + "text/plain": [ + "Index(['B', 'alpha', 'coverage', 'id', 'length', 'lower', 'nfeature',\n", + " 'nsample', 'pivot', 'pvalue', 'target', 'upper', 'variable',\n", + " 'bonferroni_coverage', 'bonferroni_length', 'bonferroni_lower',\n", + " 'bonferroni_pvalue', 'bonferroni_upper', 'naive_coverage',\n", + " 'naive_length', 'naive_lower', 'naive_pivot', 'naive_pvalue',\n", + " 'naive_upper'],\n", + " dtype='object')" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "while True:\n", + " df = simulate(verbose=True)\n", + " if df is not None:\n", + " break\n", + "df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jonathantaylor/git-repos/selectinf/selectinf/distributions/discrete_family.py:86: RuntimeWarning: divide by zero encountered in log\n", + " self._lw = np.array([np.log(v) for v in xw[:,1]])\n" + ] + } + ], + "source": [ + "dfs = []\n", + "for i in range(10):\n", + " df = simulate()\n", + " if df is not None:\n", + " dfs.append(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + 
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAfoAAAHpCAYAAABqV/58AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0\ndHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nOzdd3xUVfrH8c9JJUGKdAQiooBrV1DX\nBjEJCBFpgmJDxMqKujYUFcQuylpwsWIBlQVlQXqREOBnQSy4qKiISm/SWwgp5/fHxJEkEzJJZube\nmfm+X6+8wpy5ufcJM5nnPvece46x1iIiIiKRKcbpAERERCR4lOhFREQimBK9iIhIBFOiFxERiWBK\n9CIiIhFMiV5ERCSCxTkdQDDUq1fPNm/e3OkwREREQuLrr7/eaq2t7+u5iEz0zZs356uvvnI6DBER\nkZAwxqwu6zlduhcREYlgSvQiIiIRTIleREQkginRi4iIRDAlehERkQgWkaPu/bF79262bNlCXl6e\n06FIJcTHx9OgQQNq1qzpdCgiIq4WlYl+9+7dbN68mSZNmpCUlIQxxumQpAKsteTk5LB+/XoAJXsR\nkcOIykv3W7ZsoUmTJiQnJyvJhyFjDMnJyTRp0oQtW7Y4HY6IiKtFZaLPy8sjKSnJ6TCkipKSktT1\nIiJSjqhM9IAq+Qig11BEpHxRm+hFRESigRK9iIhIBFOiD1PDhg2jXr16TocRNN9//z3GGBYsWOB0\nKCIiYU2JXkREJII5muiNMW8ZY7YYY74v43ljjBlpjFlpjFlmjDkj1DFKcXl5eRQUFDgdhoiI+Mnp\nCXPeAf4NjC3j+c5Ay6Kvs4FXir5LObZv387999/PlClT2LVrF2eccQbPP/88Z5/913/fv/71L8aP\nH8+KFSuoVq0aZ511Fs8//zzHHXecd5vU1FTq1atHx44dGT58OKtWrWLVqlW8+eab/Pvf/+bjjz9m\nwIABLFu2jNatWzNy5EguuOCCYrGMHj2a559/npUrV9KoUSNuvfVWBg0aVGybl19+maeeeort27eT\nlpbG7bffHtz/IAlv1sKLL8KkSbB7t9PRiJQp5wBs3gS5ub6f3/S3C2n/zfNBjcHRRG+tXWSMaX6Y\nTboBY621FlhsjKltjGlsrd0YyDjcdJeWtVXfR25uLhkZGezcuZNnn32WBg0a8Morr5CRkcEvv/xC\no0aNAFi3bh0DBw7k6KOPZvfu3bz66quce+65/PLLL9SqVcu7v08//ZRff/2V4cOHk5yc7H1u//79\nXHvttdx55500atSIRx55hJ49e7J69WqSk5MBePbZZ3nggQcYNGgQqampfP311wwZMoTk5GQGDhwI\nwJQpU7j11lu55ZZb6N69OwsXLqR///5V/4+QyPXCC3DXXU5HIVKuJKD5IY9zgD+AlKLHOzYdE/wg\nrLWOfhX9H3xfxnPTgfMPeZwFtC1vn23atLGHs3z58mKPPenVHV/+evjhh23dunV9Pjd69GgbHx9v\nV6xY4W3Ly8uzLVq0sPfcc4/Pn8nPz7f79++3RxxxhB0zZoy3vX379rZatWp206ZNpY4P2KysLG/b\n0qVLLWBnzZplrbV2165dtnr16nbYsGHFfnbIkCG2YcOGNj8/31pr7Zlnnmk7depUbJsbbrjBAjY7\nO/uw/w8lX0uJEmec4fwfq770VcGvvWDTwKaAXVXU9nnj7gH5kwC+stZ3ToyYwXjGmJuMMV8ZY776\n448/nA7HUfPmzaNNmzYcc8wx5Ofnk5+fD0D79u356quvvNstXryYDh06ULduXeLi4khOTmbv3r2s\nWLGi2P7atGlDw4YNSx0nISGB1NRU7+MTTjgB8FwpAPj888/Zt28fvXv39saRn59PWloamzdvZt26\ndeTn5/PNN9/QrVu3Yvvu2bNnQP4vJEKtXOl0BCIVsgdPX/R8YA1wIRCqTien++jLsx5odsjjpkVt\npVhrXwdeB2jbtq0NfmjutXXrVhYvXkx8fHyp54499lgA1qxZQ8eOHTnrrLN47bXXOOqoo0hISODi\niy/mwIEDxX7GV5IHqFGjBjExf50rJiQkAHh/fuvWrQCceOKJPn9+7dq1JCYmUlBQQIMGDYo9V/Kx\niNfOneqXl7CyC0+S//yQthuBUC3H5fZEPxUYaIwZj2cQ3i4b4P75SFSnTh3atm3LK6+8Uuq5xMRE\nAGbPns3+/fuZMmUK1atXByA/P5/t27eX+pnKTjVbp04dAKZPn+7zZKF169YkJSURGxtbanEaLVYj\nZVq1qnTb0UfDRx+FPBSRw7nxRvj8q138xj/IYbm3/b5r7qLnxdfwM9Ckca2ydxAgjiZ6Y8x/gFSg\nnjFmHfAwEA9grX0VmAlkAiuB/cB1wYjDRlj9n56ezty5c0lJSSmzMs7JySEmJoa4uL/eAh988IH3\nMn8gnHPOOSQlJbFhwwYuvvjiMrc7/fTTmTJlCrfccou3bdKkSQGLQyLM6tWl21q2hNNOC30sIocx\nd8tW1nAdHJLkBw9+iSefHBjSOJwedX9FOc9b4NYQhRN2Dh48yMSJE0u1d+7cmVdffZXU1FTuuece\nWrRowbZt21iyZAmNGjXizjvvJC0tjYKCAq677jquv/56fvjhB0aMGEHt2rUDFl/t2rUZNmwYd9xx\nB6tXr6Zdu3YUFhayYsUKsrOzmTx5MgAPPPAAPXv2ZMCAAfTo0YOFCxcye/bsgMUhEaasil7ERTZs\n2MyaNRnAodPEvMbgwTeFPBa3X7qXw9izZw+9e/cu1Z6dnU12djZDhw7l4YcfZvPmzTRo0ICzzjqL\nrl27AnDyySfzzjvvMGzYMCZPnsypp57Khx9+yOWXXx7QGAcNGsRRRx3F888/z7/+9S+qVatGq1at\nih2nR48evPTSSzz99NOMGTOG1NRU3nzzTS666KKAxiIRwleib9481FGIlKmwsJDOnbvwV5I3wJvU\nqXMdNWqEPh5jI+26NZ7BeIeOLi/pxx9/5G9/+1sII5Jg0WsZhXr2hKKrQV5jx8I11zgTj4gPL744\nn3/+82LgIJ454a7i9NPhm2+CczxjzNfW2ra+nlNFLyLhRRW9hIG6ddOAKXjG3HuuvDr1NlWiF5Hw\n4mswnvroxWHW2mJ3KHneph2LbePU2zRiJswRkSiwZw+UvAU0Lg6OOsqZeESAn3/+mfPPP59Vh1xt\nctOFJyV6EQkfvqr5pk09yV7EAcuXL6d9+/Z89tlnpKWlsXbtWsBdF56U6EUkfLipTJKot2zZMlJT\nU9m8eTMAmzdv9lb1bnqrKtGLSPjwVSYp0YsDvvnmGy688EL+XFvliCOOYPbs2VxwwQUUFqqiFxGp\nHE2WIy6wZMkS0tPTvVOG16pVi48//pgLLrgAgM2b4eDB4j9TsyYEcD6yClHHloiED1X04rBPP/2U\nzp07s2fPHgCOPPJIPv74Y9q0aePdpqz
z0UouG1JlquhFJHyoohcHLViwgIsuusib5OvVq0d2dnax\nJA/uOx9Vog9Tw4YNwxjjc5rYXr16FVsnvjyrVq3CGMP06dMDGKFIELjtE1SixtKlS8nMzGTfvn2A\nZ/nuBQsWcOqpp5ba1m3no0r0YW7u3Ll8+eWXVdpH48aN+fzzzzn//PMDFJVIEOzfDyWXL46J8dxe\nJxJkJ510krewOuqoo1i4cCEnnniiz23ddj6qRB/G6tSpw8knn8wTTzxRpf0kJiby97//PaAr14kE\n3Jo1pduaNIH4+NDHIlEnPj6e8ePHc+ONN7Jw4UJat25d5raq6CVgjDE8+OCDTJ06le+++87nNhs3\nbqR///60aNGCpKQkWrVqxUMPPcTBQ4aElrx0369fP84888xS+xo1ahTJycne/qnCwkKefvppjjvu\nOBITE2nVqhVjxowJwm8qgvs+PSXqJCYm8vrrr3PccccddjtV9BJQvXv3pmXLlmVW9Vu3bqVOnTo8\n99xzzJ49m3vvvZe3336b2267rcx9Xn755Xz11Vf8/vvvxdonTJhAZmYmNYrWWbztttt4/PHHuemm\nm5gxYwY9evSgf//+6uuX4HDbp6dEtHHjxjFkyBAqusKrte47J9XtdeDcPQ++VPBNFRMTw+DBg7n+\n+ut59NFHadWqVbHnTz75ZEaMGOF9fN5551G9enX69+/PSy+9REJCQql9dujQgbp16zJhwgTuv/9+\nANavX88nn3zCBx98AMDKlSt55ZVXePvtt7n22msByMjIYOPGjTzyyCN06dKlQr+HSLnc9ukpEWvM\nmDFcd911WGuJj49n6NChfv/s1q2Qk1O8LTkZ6tULcJAVoIo+Alx99dWkpKTw1FNPlXrOWssLL7zA\nCSecQFJSEvHx8Vx11VXk5uayxlefJxAXF0fPnj2ZMGGCt+3DDz+kevXqXHzxxQBkZWURExNDjx49\nyM/P936lp6fz7bffUlBQEJxfVqKXKnoJgTfeeMOb5MHz2ffnSHt/uO0eelCijwhxcXEMGjSI9957\nj9UlPgxfeOEF7rnnHnr06MGUKVNYsmQJo0aNAuDAgQNl7rNPnz58++23rFixAvBctu/atStJSUmA\np0ugoKCAWrVqER8f7/3q168f+fn5bNy4MUi/rUQtN00eLhFp1KhR3HTTTd4kf9ppp5GdnU316tX9\n3ocbz0d16T5C9O/fn8cff5zhw4cXa//www/p1atXsT785cuXl7u/9u3b07BhQyZMmEDfvn1ZvHgx\ngwcP9j5fp04d4uLi+PTTT4mJKX2+2KBBgyr8NiI+uGnycIk4zz33HHfffbf3cdu2bZkzZw516tSp\n0H7c2MOkRB8hEhMTueeeexg8eDBt2rQhvuiWo5ycHBITE4tt+/7775e7v9jYWHr37s2ECROoVq0a\ntWvXplOnTt7n09LSKCgoYNeuXXTo0CGwv4xISbm5sGFD6faUlNDHIhHn6aefLlbI/P3vf2f27NnU\nqlWrwvty44UnXboHzwA4t3xVwc0330yNGjX47LPPvG0dOnRgwoQJvPzyy8yZM4e+ffuycuVKv/Z3\n+eWX88MPP/D888/TvXv3YgP3WrduzS233EKfPn0YPnw4WVlZzJgxg2eeeYYbbrihSr+HSClFa3wX\n07gxlDiJFakIay2PPvposSR/wQUXMHfu3EoleXDnhScl+giSnJzMnXfeWaxt6NChXHHFFTz00ENc\nccUVJCQkMHLkSL/2d95559GsWTM2btxInz59Sj0/atQohgwZwtixY8nMzKRfv37MmDGDdu3aBeT3\nEfFyY5kkYe+JJ57g4Ycf9j6+8MILmTVrlvcW4spw41vVVPQewXDQtm1b+9VXX5X5/I8//sjf/va3\nEEYkwaLXMkq8+SaUvFLUpw/85z/OxCMR4ZNPPqFTp07s27ePjh07MnnyZJKTkyu9P2uhVi0omlPM\na8MGzwWoYDLGfG2tbevrOVX0IuJ+biyTJOydf/75TJ8+nd69ezNlypQqJXmAHTtKJ/nERGjYsEq7\nrTINxpPIN2MGPPus77nSJTxs3Vq6zemOzwg0bhyMGgXRdXdsKpDKCSdUfU95eaXbUlI8ay85SYle\nItv330PXrlBY6HQkEmiq6ANqwQK46iqnowimAmAwcCPQMmRHdcPbVJfuJbL9979K8pFKFX1AHTIR\nZgTKB/oBzwJpwO+H3TqQ3PA2VaKXyPbrr05HIMHQrBmUWNdBqiZy/1TygKuA94oerwNeD9nR3TDN\nSNReurfWYty0mI1UmF93jPi6qVXCW6NG8N57EBvrdCQRJTL/VA4CfYDJh7TdDPhe7TOQYmPhxhuh\nZ8+gH6pcUZno4+PjycnJqfIIS3FWTk6OdwbAMvkarf3xx9CiRVBikiCLi/NU8zpJD6jCQt+JfulS\nqFkz9PEEQm7uAW69tRfZ2TO8bX373sbQoS+GpMhr0ACOOCLoh/FLVCb6Bg0asH79epo0aUJSUpIq\n+zBjrSUnJ4f169fT8HD3reTnw/r1pdvPOQcqsEiFSKTbssUzy/ChatSAU08Nz3Oq/fv306NHD7Kz\n53rb7rnnHp555pmo/LyPykRfs+gUdcOGDeT5uh9CXC8+Pp6GDRt6X0uf1q2Dksvl1q+vJC9SQlnT\nFIRjTty3bx+XXHIJ2dnZ3rYHH3yQxx57LCqTPERpogdPsj9skpDw58ZJp0VcKFL+VPbu3Uvnzp35\n5JNPvG2PPPIIQ4cOdTAq50VtopcooNnURPwSKX8q1apVo0mTJt7HTz31FPfff7+DEbmDEr1Erkj5\n9BIJskip6OPi4nj33XfJy8vj/PPPL7XIV7RSopfIFSmfXiJBFknnxPHx8Xz44YfEOD3vrIvof0Ii\nVyR9eokEUbieE2/evJkRI0aUmlNDSb44VfQSucL100skhKwNz3PiDRs2kJ6ezk8//cTWrVt56qmn\nonZUfXl02iORqaDA92p1SvQixWzbBvv3F29LSoJ69ZyJxx9r166lffv2/PTTTwCMGDGC7777zuGo\n3EuJXiLTxo2eCXMOdeSR4TvNl0iQhNs99L///jvt2rVj5cqVgGcA3vjx4znllFMcjsy9dOleIlM4\nXosUcUA49XCtXLmStLQ01q5dC/w18K5bt24OR+ZuSvQSmcLp00vEQeFyTvzTTz+Rnp7Ohg0bAEhM\nTGTy5Ml07tzZ4cjcT4leIlO4fHqJOCwczom///57MjIy2Lx5MwBJSUlMnTqVjIwMhyMLD0r0EpnC\n4dNLxAXcfk78v//9j4yMDLZu3QpA9erVmT59Oqmpqc4GFkaU6CUyuf3TS8QlfJ0Tu+lP5eDBgxw8\neBCAGjVqMGvWLM477zyHowovGnUvkcntn14iLuHrnNhNF7/OPPNMZs2aRdOmTfn444+V5CtBFb1E\nnsJCXboX8cPOnbB7d/G2xERo2NCZeMpy7rnnsnLlShITE50OJSypopfIs3kz5OYWb6tZE2rXdiYe
\nEZfyVc2npICTM8guWLCA5cuXl2pXkq88JXqJPGVV826dAUTEIW7r4Zo7dy6dO3cmPT2dFStWOBdI\nhFGil8ijgXgifnFT//yMGTO45JJLOHDgAJs2baJv376lFquRylGil8ij/nkRv7ilov/oo4/o0aOH\nd3R9SkoK77//vhapCRAleok8quhF/OKGiv7DDz+kd+/e5OXlAXDMMcewaNEijj322NAGEsGU6CXy\nqKIX8YvTFf37779Pnz59yC9agKply5YsWrSIo/X3GlBK9BJ5VNGL+MXJiv7tt9/mmmuuobCwEIC/\n/e1vLFy4kKZNm4YmgCiiRC+RxVpV9CJ+2LMHtm8v3hYXB0cdFfxjv/baa/Tv39872O7kk09mwYIF\nNG7cOPgHj0JK9BJZtm6F/fuLtyUnQ716zsQj4lK+zoebNYPY2OAe11rLF1984X182mmnMX/+fBo0\naBDcA0cxzYwnkaWsTkeN3hUpxqkeLmMMb7zxBrm5ufzyyy/MmTOHI488MvgHjmJK9OKs3Fy4916Y\nOhX27q36/opuzylGl+1db+dOGDgQsrNLT2oowXHgQOm2UP2pxMbGMmbMGHJycqhRo0ZoDhrFlOjF\nWUOHwksvBfcYGojnejfcAP/9r9NRSDD+VKy1zJw5k8zMzGL3xcfFxSnJh4j66MVZkyYF/xiq6F0t\nL89zQUecF+g/FWstDz74IF26dOGf//ynZrpziBK9OKegwHdHYaB16hT8Y0ilrV/vSfbirJgY6NAh\ncPuz1nLPPffw1FNPATBy5EjeeuutwB1A/KZEL87ZsAGKJsoIitq14YUX4NRTg3cMqbJQnOvJ4dWv\nD++8A02aBGZ/hYWF3H777Tz33HPeti5dunDVVVcF5gBSIeqjF+f4+oQ/5RTIygrM/o88Mvj3CkmV\n+XobdOsGo0eHPJSoVbdu4G5MKSws5JZbbuGNN97wtvXs2ZP//Oc/JCQkBOYgUiFK9OIcX7fCHXus\n7nmPMr7eBi1b6m0QjgoKCrjhhht45513vG2XX3457777LvHx8c4FFuV06V6co6lqBb0NIkV+fj59\n+/YtluSvueYa3nvvPSV5hynRi3M0Va2gt0EkyMvL48orr2TcuHHetv79+/P2228TF6cLx05Tohfn\nqJQT9DaIBKtWrWLevHnex3/20cdqjIwrKNGLc1TKRb2CAli7tnS73gbhpWXLlnz88cfUqlWL22+/\nnZdffpmYGKUXt9A1FXFGYaHzi2GL43zdYVmnDmjCtPDTpk0bvv32W44++uhiM+CJ8xw/5TLGdDLG\n/GyMWWmMud/H8ynGmGxjzFJjzDJjTKYTcUqAbd5cel76mjU9975L1NBFnfC0d+9efvzxx1LtzZs3\nV5J3IUcTvTEmFhgFdAZOAK4wxpxQYrOHgA+stacDfYCXQxulBIU6ZgW9DcLR7t276dSpE+3ateP7\n7793Ohzxg9MV/VnASmvtb9bag8B4oFuJbSxQs+jftYANIYxPgkWlnKC3QbjZuXMnHTt25NNPP2Xr\n1q1kZGSwY8cOp8OScjjdR98EOHQozjrg7BLbDAPmGmNuA6oDGaEJTYJKpZygt0E42bZtGx07duSb\nb77xtg0ePFhryYcBpyt6f1wBvGOtbQpkAu8aY0rFbYy5yRjzlTHmqz/++CPkQUoF+fqEVykXdZTo\nw8Mff/xBWlpasST/8ssvc8cddzgYlfjL6US/Hmh2yOOmRW2Huh74AMBa+zlQDSg1Oaa19nVrbVtr\nbdv69esHKVwJGI24F3TpPhxs2rSJ1NRUli1bBoAxhtGjRzNgwACHIxN/OZ3ovwRaGmOOMcYk4Bls\nV3Jl6jVAOoAx5m94Er1K9nCnij7q6Q5L91u/fj3t27dn+fLlAMTExDBmzBiuv/56hyOTinC0j95a\nm2+MGQjMAWKBt6y1PxhjHgW+stZOBe4G3jDG3IlnYF4/a611LmqpMmv1CS+6w9Ll1qxZQ1paGr/+\n+isAsbGxvPfee/Tp08fhyKSinB6Mh7V2JjCzRNvQQ/69HDgv1HFJEP3xB+TkFG9LTvaslSlRQ/3z\n7jZ69Ghvko+Li2P8+PFceumlDkclleH0pXuJRmVV85poI6qof97dhg0bRv/+/UlISGDSpElK8mFM\niV5CT/3zgip6t4uJieH1119n8eLFXHLJJU6HI1WgRC+hp/55QRW926xevZrCwsJibbGxsZx++ukO\nRSSBokQvoaeKXlBF7yZLly6lTZs23HzzzaWSvYQ/JXoJPVX0gip6t/jyyy9JS0tj27ZtjB49mvvv\nL7W2mIQ5JXoJPVX0Uc9aVfRu8Pnnn5ORkcHOnTsBqF27Nr1793Y4Kgk0JXoJLd1DL8DWraXvsKxe\nXXdYhtKiRYvo2LEju3fvBqBu3brMnz+fM8880+HIJNCU6CW0duyAPXuKtyUmQoMGzsQjjijroo7u\nsAyNrKwsOnfuzN69ewGoX78+2dnZGngXoZToJbTK+oSP0VsxmuiyvXPmzJlDly5d2L9/PwCNGjVi\nwYIFnHzyyQ5HJsGiT1cJLY3AEvQ2cMq0adPo2rUrBw4cAKBJkyYsXLiQE044weHIJJiU6CW0VMoJ\nehs4IScnhwEDBnCwaIGBo48+mkWLFtGqVSuHI5NgU6KX0FIpJ+ht4ISkpCRmzpxJnTp1aNGiBQsX\nLqRFixZOhyUh4PiiNhJmfv4ZbrwRvvwSCgoq/vP5+aXbVMqFnU8+gdtvhx9+8NxIUVF5eaXb9DYI\nvlNOOYWsrCzq169PkyZNnA5HQkSJXvxnLXTtCitWBHa/KuXCyt69kJlZ+uaJqtLbIPC2b99OnTp1\nirWddtppDkUjTtGle/Hfzz8HPsmDSrkws2BB4JN8tWq6wzLQXn31VVq2bMnSpUudDkUcpkQv/vv9\n98Dv89RTQZcQw0ow3gaZmbrDMpBGjhzJgAED2L59Ox06dGD58uVOhyQO0p+W+M/XUOnKiomBs8+G\nCRM0S0qYCeTbID4eLroIXn45cPuMds8++yx33HGH9/Gxxx5L48aNHYxInKY+evGfr6HSDzwADz9c\n8X3FxECc3n7hyNfbYPRouOaaiu8rNtbzJYHx+OOPM2TIEO/jc889l1mzZlGzZk0HoxKn6ZNW/Oer\nlDv2WEhICHko4hxfb4PjjtPbwEnWWh5++GEee+wxb1u7du2YMWMGRxxxhIORiRso0Yv/dPOzoLeB\n21hrGTx4MMOHD/e2paenM2XKFKpXr+5gZOIWSvTiP01nFvX27fOsPHeo2Fho2tSZeKKdtZa77rqL\nF154wdvWqVMnJk2aRFJSkoORiZtoMJ7458AB2LSpeJsx0KyZM/GII3xV802aaLiFUyZPnlwsyXft\n2pWPPvpISV6KUaIX/6xZU7rtqKPUMRtldFHHXXr06MGtt94KwKWXXsqHH35IYmKiw1GJ2+g8XPxT\n1vKyElV8VfRK9M4xxjBy5EhOP/10rr32WuJ0aUV80Lt
C/KNPeEHne07LL1or4tCEHhMTw/XXX+9U\nSBIGdOle/KNPeEHne07Ky8ujT58+9O/fn4LKLCglUUsVvfhHn/CCzveckpuby2WXXcbUqVMBiI+P\n54033iBG8waLH5ToxT8ahSXofM8JOTk5XHrppcyaNcvbVrNmTYymjhY/6XRQ/KNZUqKe7rAMvf37\n99O1a9diSf6+++7jueeeU6IXvynRS/kOHoT160u3p6SEPhZxjO6wDK29e/eSmZnJvHnzvG1Dhw7l\nqaeeUpKXCtGleynf2rVgbfG2hg1Bk3JEFfXPh86uXbvIzMzks88+87Y9/vjjPPjggw5GJeFKiV7K\np45ZQW+DUNmxYwcXXXQRX375pbft2Wef5Z577nEwKglnSvRSPpVygt4GoXLNNdcUS/Ivvvgit99+\nu4MRSbhTH72UT6WcoLdBqIwYMYIGDRoA8OqrryrJS5WpopfyqZQTdIdlqBx//PFkZWWxdOlSrrnm\nGqfDkQigRC/lUykn6A7LYCksLCw18c1JJ53ESSed5FBEEml06V7Kp4o+6ukOy+BYvXo1bdq0YfHi\nxU6HIhFMiV4OLz8f1q0r3a5EH1XWrdMdloH222+/0a5dO7799ls6derE119/7XRIEqGU6OXw1q+H\nkgto1K0LRxzhTDziCPXPB9aKFSto164da4pmIcrJyWHz5s0ORyWRSoleDk+f8IL65wNp+fLltG/f\nnvVFfSHVqlVj6tSpZGZmOrRq57QAACAASURBVByZRCoNxpPD0ye8oPO9QFm2bBkZGRn88ccfACQn\nJzNt2jTS0tIcjkwimRK9HJ4+4QWd7wXCN998Q4cOHdi+fTsARxxxBDNnzuSCCy5wODKJdLp0L4en\nT3hB53tVtWTJEtLT071JvmbNmsydO1dJXkJCFb0cnj7hBZ3vVcXq1avJyMhgz549ANSuXZuPP/6Y\ntm3bOhyZRAtV9HJ4+oSPWNZ61pgv72vfPs8ChiXpbeCflJQUbr75ZgDq1q1Ldna2kryElCp6KVth\noe9FyPUJH/aeew6eeQYqe0dXvXq6w9JfxhieeeYZkpKSuOyyyzTjnYScEr2UbeNGyMsr3larFtSu\n7Uw8EhCLFsHdd1dtHzrXqxhjDI8++qjTYUiU0qV7KZtmxItIH39c9X0cc0zV9xGppk2bxtVXX01+\nfr7ToYgAqujlcHbuLN1Wr17o45CA2rat6vvo1q3q+4hEkyZN4vLLLyc/Px9rLWPHjiU2NtbpsCTK\nKdFL2XbtKt2my/Zhz9f5W1wc+JOPGjSAG26Aq64KfFzhbvz48Vx99dUUFE0Z/cUXX7Bt2zbv2vIi\nTlGil7L5yghK9GHP18s6aRJccknoY4kUY8eO5brrrqOwsBCA1q1bk5WVpSQvrqA+eimbr4xQq1bo\n45CA0vlbYL355pv069fPm+RPOOEEFixYQJMmTRyOTMRDiV7KpowQkXz1yOj8rXJefvllbrjhBmzR\nGr6nnHIKCxYsoFGjRg5HJvIXJXopm/roI5LO3wLjhRde4NZbb/U+PuOMM5g/fz7169d3MCqR0pTo\npWzKCBFJL2vVvfLKK9x5553ex2effTZZWVnUrVvXwahEfFOil7Kpjz7i5OXB/v3F22JiNMtdRXXs\n2NHbB3/eeecxd+5cautsSVxKiV7KptIv4vjqjalZ05PsxX/HHnssWVlZXH755cyePZuaNWs6HZJI\nmXR7nZRNffQRR+dugdO6dWvGjx/vdBgi5apUojfGnAC0A1KAekAOsAX4Flhkrd0TsAjFOcoKEUcv\nacVZaxk8eDCZmZm0a9fO6XBEKszvRG+MaQrcBPQHGv/ZXGIzCxQYY+YBrwDT7Z/3nUj4UR99xFGi\nr5jCwkIGDhzIK6+8wqhRo5g7dy7nnHOO02GJVEi5id4YUwcYBtwMxAOrgHHAl8AmYDuQBNQFjgfO\nAVKBi4CfjTF3W2tnBT50Caq8PM9C5IcyxtOhK2FL527+Kygo4Oabb+bNN98EYO/evYwePVqJXsKO\nPxX9SiARGA2MsdYuKe8HjDE1gT54rgBMN8bcaa0dWaVIJbR27y7dplFbYU/DLvyTn59P//79effd\nd71tV155Ja+99pqDUYlUjj+J/l3gSWvtZn93aq3dDbwOvG6M6Q5Uq2R84hRd441IelnLl5eXR9++\nfYsNtOvXrx+jR4/WSnQSlspN9NbaO6pyAGvtR1X5eXGIMkJE0st6eAcPHuSKK65g0qRJ3rabbrqJ\nV155hRhdzZIwVeF3rjEmpejS/OG2qWGMSal8WOI4deZGJCX6suXm5tKrV69iSX7gwIG8+uqrSvIS\n1irz7v0dKK/Kv71oOwlXyggRSQva+Jafn0/37t2ZNm2at+3uu+9m5MiRGFPy5iKR8FKZRG8ofVud\nRBqN2opIOn/zLS4ujjPPPNP7+IEHHuDZZ59VkpeIEKyZ8RoB+8rdStxLGSEi6WUt2yOPPEJubi7J\nyckMHTpUSV4ihl+J3hjTt0TTaT7aAGLxzJZ3NfBdFWMTJ6mPPiIp0ZfNGMPTTz+tBC8Rx9+K/h08\ns95R9L1b0VdJf/6F7AceqVJk4ixlhIikPnqP7du388wzz/Doo4+SkJDgbVeSl0jkb6K/rui7Ad4C\nPgKm+NiuANgGfG6t9ZEpSjPGdAJexHM1YLS19mkf21yGZ3Y+C/zPWnuln3FLZamPPiLp/A22bt1K\nhw4d+Pbbb/nll18YP3488fHxToclEjR+JXpr7Zg//22MuRb4yFo7tqoHN8bEAqOADsA64EtjzFRr\n7fJDtmkJDAbOs9buMMY0qOpxxQ/KCBGnoKDsCQ+jxebNm8nIyOD7778HYPLkySxcuJCMjAyHIxMJ\nngoPxrPWXhjA458FrLTW/gZgjBmPp0tg+SHb3AiMstbuKDr+lgAeX8qiRB9xfCX5GjUgLkoWq96w\nYQPp6en89NNPgOcy/VtvvaUkLxGv0n/ixphkoCdwOlAb2AV8A0y21vo74r4JsPaQx+uAs0ts06ro\neJ/iubw/zFo7u7Jxi580GC/iRPNLunbtWtLS0li5ciUAMTExjB07lquuusrhyESCr7Lr0WcCY4A6\nFL+n3gLPG2Ous9ZOD0B84ImxJZ4V8ZoCi4wxJ5ccA2CMuQnPIjqkpGhSvipTH33EidaXdNWqVaSl\npfH77545vOLi4hg3bhy9e/d2ODKR0KjMFLhnAJPwVPHv41mfvnPR9/eL2icaY9r4sbv1QLNDHjct\najvUOmCqtTbPWvs7sAJP4i/GWvu6tbattbZt/fr1K/hbSSm6dB9xovEl/fXXX2nfvr03ycfHxzNx\n4kQleYkqlZkZ70E8lfsF1tq+1tp3rLVzir73Bc4vev4BP/b1JdDSGHOMMSYBz9K2U0ts8xGeah5j\nTD08l/J/q0Tc4q/CQt8dutFynTdCRVuiX7FiBe3atWPNmjUAJCYm8tFHH9Gtm687g0UiV2US/QXA\nh9baxb6etN
Z+AUws2u6wrLX5wEBgDvAj8IG19gdjzKPGmK5Fm80BthljlgPZwL3W2m2ViFv8tXs3\nWFu8rXr16Bm1FaGirY++WrVq3nvkq1WrxtSpU8nMzHQ4KpHQq8wndy2KD6DzZQ3g10071tqZwMwS\nbUMP+bcF7ir6klCI1s7cCBdtL2tKSgrz588nMzOTl19+mQsvDOQNQyLhozKJfgOe2+IOpy2wsRL7\nFjeItmu8USIaX9ZjjjmG7777jjhdjZIoVplL9zOBNGPM/UUT3ngZY2KMMXcDGZSo0iWMRGNGiAKR\n/rIuWbKEefPmlWpXkpdoV5m/gMeA7sATwM3GmP/DU703wjMQrzmwCXg8QDFKqEVbZ26UiORE/+mn\nn9K5c2fy8/OZOXMmqampTock4hqVmRlvkzHmPOA1PFPXHl1ik4+BW6y1unQfriI5I0SxSF3QZsGC\nBXTp0oV9+zzzdF133XX8/PPPxRarEYlmlbqmZa1dBVxkjGmCZ2a8WnhmxltqrS15H7yEm2gbtRUl\nIvH8bd68eXTt2pWcnBwAGjZsyLRp05TkRQ5Rpc6roqSuxB5pIjEjSMS9rDNnzqRnz57k5uYC0Lhx\nY+bPn8/xxx/vcGQi7lKZmfE+MMZ0NsZUZiCfhINIywgCRNbLOmXKFLp37+5N8s2aNWPRokVK8iI+\nVCZZ9wKmA+uNMc8aY04KcEziNA3Gi0iR0kc/ceJEevXqRV5eHgDNmzdn0aJFHHfccQ5HJuJOlUn0\nf8czEC8BuBv4nzHmK2PMbUVT1Eq4Ux99xLE2Ms7fxo0bR58+fcjPzwfg2GOPZdGiRTRv3tzZwERc\nrMKJ3lq7xFr7D6AxcBme++VPAV7EU+VPMsZ0N8bo5tVwFUnXeAWAvXs9SxgcKikJEhOdiaeyVq9e\nTUFBAQCtW7dm0aJFNGvWrJyfEolulU7G1tqDeOa0n2iMqQ9cDVyL5x77bsA2oEEggpQQU6KPOJHy\nkg4ePJjc3FwmTpxIVlYWDRs2dDokEdcLyIA6a+0f1trn8dxqdw+QD9QNxL7FAZFwjVeKiaSX9OGH\nH+aLL75QkhfxU0ASvTGmtTHmSWA18CwQD6wMxL7FAeqjjzjh+pL+97//5cCBA8XajDFUr17doYhE\nwk+lE70xprYxZoAxZjGwHLgfz4p1b+JZq751gGKUUIqUUVtSTDheun/66afp1asXvXr14uDBg06H\nIxK2KnMf/SXGmA/xzG//bzwr1c3D00ffyFp7k7X208CGKSGzbx8UDXbyqlbN8yVhK5wSvbWWRx99\nlMGDBwMwY8YMnnjiCYejEglflRmMN6Xo+wpgDDBW095GEFXzESlcEr21loceeognn3zS23bhhRcy\naNAgB6MSCW+VSfSvAWOstYsDHYy4QLh25sphhcNkOdZaBg0axIgRI7xtHTt2ZPLkySQnJzsYmUh4\nq8zqdQOCEYi4RLiUflIhbn9ZrbX885//ZOTIkd62iy++mIkTJ1JN3UYiVaJJbaQ4t2cEqRQ3v6yF\nhYX84x//4LXXXvO29ejRg/Hjx2sVOpEAKDfRG2PmAxa41lq7ruixP6y1Nr1K0UnouTkjSKW59WUt\nKCjgxhtv5O233/a2XXbZZbz33nvEx8c7GJlI5PCnok/Fk+iTD3nsD1uJeMRpGowXkdzaR79jxw7+\n7//+z/v46quv5u233yYuThcbRQKl3L8ma23M4R5LhNFgvIjk1oq+Xr16zJ8/n/bt25Oamsobb7xB\nbGys02GJRBSdNktxbs0IUiVuflmbNWvG4sWLqVevHjExqiNEAk1/VVKcmzOCVJpbXtYDBw7w+eef\nl2pv0KCBkrxIkFRlCtyrjDFZxpjtxpj8ou/zjDFXBTJACTH10Ucct8xqvH//frp160Zqaipz5swJ\n7cFFolhlpsCNN8ZMAcYCFwI1gD+KvqcBY40xU4wxGjIbjtRHH3EOHIC8vOJt8fGe9ehDZd++fXTp\n0oW5c+dy8OBBunfvzq+//hq6AESiWGUq+sHAJcAXeBJ9NWttY6AankS/BOgC3BeoICWE3HKNVwKm\nrJfUmNAcf8+ePXTq1Ins7Gxv2wMPPMCxxx4bmgBEolxlEn1fPEvQplprF1prCwCstQXW2gV4br/7\nDegXoBgllJToI46TL+nOnTvp2LEjn3zyibft6aefZsiQIaEJQEQqleibAlOstT7XjbTW5uJZ+KZJ\nVQIThyjRRxynXtLt27eTkZHB4sV/LYvx3HPPcd99utgnEkqVub1uA1Be/3t80XYSbtw6s4pUmhMv\n6R9//EGHDh343//+523797//za233hrcA4tIKZWp6McBvYwxNX09aYypDfQC3q9KYOKAAwcgN7d4\nW1wcaOWwsBbqin7Tpk1ceOGF3iRvjOH1119XkhdxSGUq+keBk4AlxphHgUXAZqAh0B4YgmdA3mOB\nClKqKCcH1q8vf7utW0u3hXLUVgSzFtauhYM+O7yC65dfSrcFM9HPnj2bH374AYCYmBjeeustrr32\n2uAdUEQOqzKJPqfouwHe9fG8AVoCB0zxBGGttZqJL5SshUcfhSefrHyGUf98lX3zDfTsCatXOx3J\nX4L5svbr148tW7bwwAMP8O6773LFFVcE72AiUq7KJN7/QwvWhIcvv4Rhw6q2D/XPV1n//u5K8hD8\nl3XQoEF07dqV448/PrgHEpFyVTjRW2tTgxCHBMO8eVXfR8OGVd9HFNu6FQ4Zj+YagXxZV61aRYMG\nDUguMZZDSV7EHTS5dCQLRBl56aVV30cUc1slD1CtGnTuHJh9/fTTT5x33nl069aNAwcOBGanIhJQ\n6jOPZKtWlW5r3Ni/UfR168KVV8J11wU8rGji6yWoXh0aNQp5KAC0agWDB0PTplXf1/fff09GRgab\nN29mw4YNXH311UycOLHqOxaRgCo30Rtj7gH+ba2t1Om6MeZ0oJG1dlZlfl6qwFc5OX06nHFG6GOJ\nUr5egquvhldfDX0sgfS///2PjIwMthbdqVG9enUGDhzocFQi4os/l+6fAH41xtxnjDnKn50aj4uM\nMZOBr4BTqxKkVIK1vrNM8+YhDyWa+arow/0l+Prrr7nwwgu9Sb5GjRrMmTOH1NRUZwMTEZ/8uXR/\nMvAc8BTwuDHmM+ATPAl8I7ADz4I2dYHjgb8D6UAjYBswEHgt4JHL4W3Z4pkA51BHHAFHHulMPFEq\n0s61Fi9eTKdOndhVNN1erVq1mDNnDmeffbbDkYlIWcpN9NbaFUAXY8y5wK3ApcAF+L7F7s8b538G\nhgNvW2v3BChWqYiySklNfhNSvl6Go48OeRgB8cknn5CZmcmePZ4/6Tp16jB37lzatGnjcGQicjh+\nD8az1n4GfGaMuQVoB5wPpOCp5HOALcAyYIG19ocgxCoV4auUDNcME8Yi5dL9ggUL6NKlC/v27QOg\nXr16zJs3j1NPVa+ciNtV5j76PcCMoi9xq0jJMGFs507Yvbt4W0JC+E1
NUFhYyL333utN8g0bNiQr\nK4sTTzzR4chExB+6jz5SRVrncBgq67J9TJj91cXExDB16lRatWrFUUcdxcKFC5XkRcKIXxW9MaYv\n8K21dlmQ45FAiaTO4TAVSb0njRs3Zv78+eTk5HDcccc5HY6IVIC/tcU7QPdDG4wx1xpj5gc8IgkM\nVfSOC+fek40bN5Zqa9KkiZK8SBiqykXE5niWpRW3sVYVvQuEa0X/3nvv0aJFC6ZNm+Z0KCISAGHW\nWyh+2b4digZOeSUlQf36zsQTpcKxon/77bfp27cvBw4coFevXsyfr4t2IuFOiT4SlVXN6x76kAq3\niv61116jf//+WOuZIqN169acdNJJDkclIlWlRB+J1D/vCuFU0b/00kvccsst3sennXYa8+fPp0GD\nBg5GJSKBUJFE72smPHEj9c87bs8eTw/KoeLi4Ci/VosIrX/961/cfvvt3sdnnnkm8+fPp169eg5G\nJSKBUpEJc4YZY4aVbDTGFJSxvbXWahlcJ6iid5yvl6BZM4iNDX0sh/Pkk0/y4IMPeh+fc845zJo1\ni1q1ajkYlYgEUkUqelPBL3ULOEUVvePcfq5lrWXYsGHFkny7du2YM2eOkrxIhPGr4rbWKmmHk3Dq\nHI5Qbj/X+vLLL3nkkUe8j9PS0pg6dSrVq1d3MCoRCQYl8EgUbsO9I5Dbz7XOOussRo4cCUCnTp2Y\nPn26krxIhFIfeqTZuROK1gr3SkiARo2ciSdKhcO51m233UbTpk3JzMwkMTHR6XBEJEgqnOiNMacC\nVwJnAfXxjMb/A/gCGGet/S6gEUrF+MowKSnht5JKmHNbRV9YWEhOTk6pqr1Hjx4ORSQioeJ3ojfG\nxAIvATfx14C7Q7UH7jXGvAzcYf+cdUNCy20ZJkq5qaIvKCjg+uuv59dff2X27Nm6RC8SZSpS0Y8A\nbgEOAh8AC4D1eBL+UUAa0Au4FTgADApkoOInN2WYKLV/P2zZUrwtJgaaNg19LPn5+Vx77bWMGzcO\ngEsuuYQZM2aQlJQU+mBExBH+LlN7HHAbsBroZK392cdmbxljHgdmA3caY1611v4WuFDFL6roHbdm\nTem2Jk0gPj60ceTl5XHllVcyceJEb1uLFi1ISEgIbSAi4ih/O26vwVO59ysjyQNgrf0JuBaIBa6u\nenhSYaroHeeGc63c3Fx69+5dLMkPGDCA119/nVi3zdojIkHlb6I/F/jRWruwvA2LtlkOnF+VwKSS\n3JBlopzTk+UcOHCAnj17MmXKFG/bHXfcwahRo4jRoEyRqOPvX/3xeEbV++uLop+RUFNF7zgnJ8vZ\nv38/l1xyCTNnzvS2DRo0iOeffx6j1QtFopK/ib42sKXcrf6yGTiy4uFIlezdC9u2FW9z60oqEcyp\nin7v3r1cfPHFzJs3z9s2ZMgQnn76aSV5kSjm76j76kBOBfabCyRXPBzxy4YNsGlT6XZfpWTTpp5k\nLyHjVEV/9913s2DBAu/jxx57jIceeij4BxYRV1MGCCc5OXDFFXBI32u51D8fck5V9I8//jiffvop\nP/zwA8888wz33ntv8A8qIq5XkUTf3RjT3M9tT694KFKusWMrluRB/fMhlpvrueBSUrNmwT92/fr1\nycrKYtasWfTr1y/4BxSRsFCRRH9a0Ze/NDNeoB1yWdZvxx0X8DCkbGvXlm5r3BiCMZV8fn4+cSW6\nZRo2bKgkLyLF+JvorwtqFOIfX52/hxMfD5ddFpRQxLdQ3d24adMmLrroIh566CF69+4d+AOISMTw\ndz36McEORPzgK4ucdJLvKdeOPhr++U9o1SroYclfQpHo169fT1paGitWrODKK68kPj6e7t27B/Yg\nIhIxHB+MZ4zpBLyIZza90dbap8vY7lJgInCmtfarEIboDgcOlB5pbwx8/bVnGVpxhWBPY7BmzRrS\n0tL49ddfAbDWcuDAgcAdQEQijt/TZBlj/mGMGWyMKXPGbmNMQtE2A/zcZywwCugMnABcYYw5wcd2\nNYA7qNikPZHF1wTqRx2lJO8ywazof//9d9q3b+9N8nFxcUyYMIE+ffoE5gAiEpH8SvTGmHPxLFGb\naK3NK2s7a+1BIAH4tzHmbD92fRaw0lr7W9HPjge6+djuMWA4nlXxopNmvAsLwbq17pdffqFdu3as\nKjqTSEhIYNKkSVx66aVV37mIRDR/K/prgb14lqotzwhgD9Dfj22bAIeOU15X1OZljDkDaGatneFf\nqBFKc9iHhWBMlvPjjz/Svn171q1bB0BiYiJTpkzhkksuqdqORSQq+NtHfwGQZa3dW96G1tp9xpis\nop+pEmNMDPAc0M+PbW8CbgJISUmp6qHdRxW96+Xlwfr1pdur8jJ9//33pKens6VogfukpCSmTZtG\nenp65XcqIlHF34o+BfilAvtdWfQz5VkPHDqVSNOitj/VAE4CFhhjVgF/B6YaY9qW3JG19nVrbVtr\nbdv69etXINQwoYre9datg8LC4m0NGkBSUuX2t2vXrmJJvnr16syaNUtJXkQqxN9EH0vFJsCxfu77\nS6ClMeYYY0wC0AeY6t2JtbustfWstc2ttc2BxUDXqBx1r4re9QLdP1+rVi0ef/xxAGrUqMHcuXNp\n37595XcoIlHJ30v3fwDHVmC/xwJby9vIWptvjBkIzMFzMvGWtfYHY8yjwFfW2qmH30MUUUXvesHo\nn7/xxhsBOPXUUznrrLOqtjMRiUr+JvovgQ7GmFrW2l2H29AYUwvoAMw73HZ/stbOBGaWaBtaxrap\nfkUbaQ4e9D2BeiSORQhjgajorbWllpT9M9mLiFSGv5fu/wPUxHPPe3n+jadv/T+VDUpK8NX527Bh\n5Tt/JSiqWtFnZWWRnp7O7t27AxaTiIi/if6/wGd4JrRZaIzJKOpTB7wT5WQYYxYAVwKfWmv/G/hw\no5RT655KhVSld2XOnDl06dKF7OxsOnfuzN695d7gIiLiF3/nurdFU9DOwXPb3Bwg3xizrWiTukX7\nMsD/gF5BiDV6BaPzVwKusuMlp0+fzqWXXsrBgweL9rOaLVu2cMQRRwQ4QhGJRn5PgWut3QycAwzB\nM8lNPNCo6Cu+qO0h4Fxr7ZbAhxrFNBDP9QoKfC9RW16inzx5Mj179vQm+ZSUFBYuXEiLFi2CEKWI\nRKMKLWpjrc0BngCeMMY0BRoXPbXRWrsu0MFJEd1a53obNkB+fvG2OnWgRo2yf2bChAlcddVVFBQU\nANCiRQvmz5/P0XptRSSAKr16XVFiV3IPBVX0rlfRl+jdd9+lX79+FBYNsmzVqhVZWVk0bdo0KPGJ\nSPTyd1GbdsYYv+/lMsacYozpW/mwpBhV9K5XkZforbfe4tprr/Um+RNOOIEFCxYoyYtIUPjbR59N\nifnmjTH3HTIYr6QewNtViEv+lJ9fuc5fCSl/K/qJEydy/fXXY61nosmTTz6Z7OxsGjduXHpjEZEA\n8DfRGx9t1YDaAYxFfNmwwT
PS61B164JGZLuKv3dApqenc/rppwNwxhlnkJ2dTYMGDYIbnIhENb9H\n3YtD1D8fFvy9A/LII49k7ty59OvXj6ysLOrWrRv02EQkulV6MJ6EiCbLCQsVeZnq1avH22+rZ0tE\nQkMVvdtpshzXKyz0nehTUiwPP/wwY8eODX1QIiJFVNG7nSp619u0ybPu0KFq1rQMHz6Y4cOHExMT\nQ0JCAn369HEmQBGJahWp6CuyHr0Eiip61yt9LmZJSLiL4cOHA1BYWMi4ceO8I+1FREKpIhX9MGPM\nsJKNxpgCH9tKoKiid73i52KFwG1s3fqyt6Vr16588MEHpZafFREJhYok+op+Sql8qaqyOn9V0bvK\nXy9RIXAzMNr73KWXXsq4ceNISEjw8ZMiIsHn7+p1GrTnhI0bIS+veFvt2lCrljPxiE+eir4A6A/8\nNfDuiiuuYOzYscTFaSiMiDhHn0Bupmo+LPz+ez7QF/iPty019VreffdNYmNjHYtLRASU6N0tTCfL\n2b8fvv229Ej0SGSt5dNPrwYmHNJ6A8888xqxsboQJiLOU6J3szCs6LOyoEcP2LPH6UhCxQBdgA/w\nDEv5B/ASxxyjJC8i7qBE72ZhVtFbC7feGk1J/k9XA3nA98AIqlc3aGZbEXELJXo3C7OKfutW+Pln\np6NwynXef7VqBbqTTkTcQtcX3SzMKnpf4UaevcCtwPYyt7juujKfEhEJOVX0bmVt2E2W4yvcevXg\nxBNDH0sw5Ofv5rvvMtm9+1Nq1FjCKafMIy7ur1sda9SA7t2hf38HgxQRKUGJ3q22bIEDB4q3HXEE\nHHmkM/H4wVdF36sXvPJKyEMJuB07dtCpUyd2714CwJ49XzFgwHSuuuoqhyMTETk8JXq3Kquad3Hn\nb5gNKfDbtm3b6NChA0uXLvW2vfjii0ryIhIWlOjdKgwXswmzIQV+2bJlCxkZGXz33XfetldffZWb\nb77ZwahERPynRO9WYdY/D2EZ8mFt3LiR9PR0fvzxRwCMMYwePZr+6oQXkTCiRO9WYVbRWxt2IR/W\n+vXrSUtLY8WKFQDExMQwZswYrr76aocjExGpGCV6twqz6+A7d5aeKCcxERo2dCaeqli7di2pqan8\n9ttvAMTGxvL+++9z+eWXOxyZiEjF6T56twqzkW2+zktSUiAmDN9htWrVon79+gDEx8fz4YcfKsmL\nSNgKw4/hKFDWdXAXV/SR1D9fs2ZNZs+ezbnnnsukSZPo0aOH0yGJiFSaLt270fbtsG9f8bakJCiq\nMt0okvrnAWrXrs0nbxysxwAAHRNJREFUn3yCcfHtjCIi/lBF70ZlZU0XJ51wrui/++473nnnnVLt\nSvIiEglU0btRmPXPQ/hW9EuXLqVDhw5s27YNgH79+jkbkIhIgKmid6Mw65+H8KzolyxZQlpamjfJ\n33XXXezYscPhqEREAkuJ3o3CMGuGW0X/2WefkZGRwc6dOwFPn/zcuXM50sVrCYiIVIYSvRuFWdbc\nvRtKFsJxcXDUUc7EU55FixbRsWNH9hTd+F+3bl2ys7Np27atw5GJiASeEr0bhVlF7yvcZs0gNjb0\nsZQnKyuLTp06sa/oroYGDRqwYMECTjvtNIcjExEJDiV6Nwqzij5czktmz55Nly5dyMnJAaBx48Ys\nXLiQk046yeHIRESCR4nebXbuhF27irclJECjRs7E44dwGDs4bdo0unXrxoEDBwBo2rQpCxcu5Pjj\nj3c4MhGR4FKidxtf5bHL55INh7sBc3NzKSgoAKB58+YsWrSIli1bOhyViEjwuTd7RKtwKI9LCIeQ\ne/XqxdixY2nVqhULFy7kmGOOcTokEZGQUKJ3m3Aoj0sIl5CvvPJKli1bRkpKitOhiIiEjBK924RD\neVyCG0P+4IMP+OOPP0q1JyYmOhCNiIhzlOjdJlzK4yL79kHJfBoTA02aOBMPwKhRo7j88svJyMjw\nznonIhKtlOjdxo3l8WGsWVO6rWlTiI8PfSwAzz//PAMHDgRg2bJlDBo0yJlARERcQonebcKsonfT\nLf/Dhw/nrrvu8j7++9//zr/+9S9nghERcQklejfZuxdKXmp281yyuGeynMcee4z777/f+/j8889n\nzpw51K5dO/TBiIi4iBK9m/jKmk2bepK9Szld0VtrGTJkCEOHDvW2XXjhhcyaNYuaNWuGLhAREZdy\nbwaJRmHWPw/OVvTWWu677z6effZZb1uHDh346KOPSE5ODk0QIiIup0Qfavv2wZIlULRyWjFz5pRu\nc2Gi37gRvvkGCgpg2bLSz4ciZGstd955Jy+++KK3LTMzk//+979Uq1Yt+AGIiIQJJfpQ+v576NAB\nNm3y/2dcNhDvzTfhppugsLDsbUIRcm5uLkuXLvU+7tatGxMmTNB98iIiJSjRh9LDD1csyYOrKvr9\n++Guuw6f5I3xLFEbbNWqVWP69OlcdNFFNG3alPfff594p+7pExFxMSX6UPrss4r/TOvWgY+jkpYv\nh927D7/NMcdAqIrqGjVqMGfOHJKSkohz8YBFEREnadR9qBw4UPFq/tRT4eyzgxNPJfgaeFfSzTcH\n59h5eXnMmjWrVHuNGjWU5EVEDkOfkKHiawq55GRITy/dbgycfLLnOrmLlqf1dVNAixZw4omeX6Vz\nZ+jbN/DHPXjwIFdccQWTJk3ilVde4ZZbbgn8QUREIpQSfaj4ypKnnw5Tp4Y8lMry9SvcdBPcd1/w\njpmbm0vv3r2ZNm0aAAMGDOCUU07h3HPPDd5BRUQiiBJ9qITZ1La+hPpXyMnJoUePHsw55LbDu+66\ni3POOSd4BxURiTDuuS4c6cJwMpySQvkr7Nu3jy5duhRL8oMHD2bEiBEYY4JzUBGRCKREHyphXtFb\nG7pfYc+ePXTu3Jn58+d724YNG8YTTzyhJC8iUkG6dB8qYV7R79xZ+ta6xERo2DCwx9m1axedO3fm\n888/97Y9+eSTDB48OLAHEhGJEkr0oRLmFb2v8FNSAntTwI4dO7jooov48ssvvW0jRozg7rvvDtxB\nRESijBJ9KBw8COvXl25PSQl9LJUUigsS33zzTbFpbUeOHMltt90W2IOIiEQZ9dGHwrp1nk7uQzVs\nCElJzsRTCaG4IJGens748eOJj4/ntddeU5IXEQkAVfShEOb98xC6X+HSSy/ll19+4egw6tYQEXEz\nVfSh4OSi7QESjF9h7dq1bNy4sVS7kryISOAo0YeCr3I4zJJZoH+FVatW0b59e9LT09myZUvldyQi\nIoelRB8KquiL+fXXX2nfvj2///47P/74I5mZmRQebu1bERGpNCX6UAjzin73bti+vXhbXBw0blzx\nff3888+0a9eONUWL/CQmJvLoo48S46LFe0REIonjn67GmE7GmJ+NMSuNMff7eP4uY8xyY8wyY0yW\nMSZ8MuSfwnwwXln30MfGVmw/y5cvp3379mzYsAGAatWqMXXqVDIzMwMQpYiI+OJoojf
GxAKjgM7A\nCcAVxpgTSmy2FGhrrT0FmAg8E9ooqyg/33N7XUlhVNEH4ta6ZcuWkZqayubNmwFITk5m5syZdOzY\nMQARiohIWZyu6M8CVlprf7PWHgTGA90O3cBam22t3V/0cDHQNMQxVs369VBQULytXj34//buPjqq\n8trj+HdDSECpIIooYIBapNRai7KsWr3ACAURoSrVaFGkXPXSRRdq0bay2nr1Vmu5otel2MqtglhR\nUo3FokUUAr0WWCpUW99TUQKUSoFAFRFC9v1jDmGSTMIkc+b991krKzPPeebMnsfgnv2c55xz+OGZ\niacNkp2QWLt2LcOGDWPr1q0AdO7cmSVLljBs2LBQ4hMRkeZlOtH3Aqpjnm8M2pozGXgupRGFLccv\nfQvJfYQ1a9YQiUTYHhzk79KlC0uXLuXss88OMUIREWlOzlwwx8wmAIOBIc1svwa4BqA0my4tm+PH\n5yG5jzBz5kx27twJwJFHHsnSpUs57bTTQotNRERalumKfhNwfMzz3kFbA2Y2HJgBjHX3z+LtyN0f\ndPfB7j64e/fuKQm2TQq8on/kkUcYOnQoRx99NMuXL1eSFxFJs0xX9C8D/c2sH9EEXwZcHtvBzAYB\nvwJGuXvuXVmlwCv6ww47jGeeeYZNmzYxYMCAMMMSEZEEZLSid/daYCqwBHgLWOjub5jZrWY2Nug2\nE+gMlJvZn81sUYbCbZscr+h374ZgDV29du2gVzMrKd5///0mbZ07d1aSFxHJkExP3ePuz7r7ie5+\ngrv/LGj7ibsvCh4Pd/ce7v7V4Gdsy3vMMjle0cf7ntK7N3To0LT9d7/7HQMHDuTuu+9OfWAiIpKQ\njCf6vFZXB8EV4BrIoYo+0UvflpeXM378ePbu3csNN9zA3LlzUx2aiIgkQIk+lf7+d9i3r2Fb167Q\npUtm4mmDRK7e+9hjj1FWVkZtbS0AJ5xwApFIJPXBiYjIISnRp1IB3Mxm3rx5TJgwof6mNAMGDGDl\nypXZdYqjiEgBU6JPpRy/mQ20/BHmzJnDpEmTcHcATjrpJFasWEHPnj3TF6CIiLRIiT6VcnwhHjT/\nEe6//36uueaa+iR/yimnsHz5cnr06JHW+EREpGVK9KmU46fWQfyP8OKLs5g6dWr988GDB7Ns2TKy\n6kJFIiICKNGnVo5X9Hv2RNcTNvQBs2bdXP/sjDPO4IUXXqBbt25pjU1ERBKjRJ9KOV7RV1c3bevZ\nsy9PPfUUHTp04JxzzuH555+nSw6dRSAiUmgyfQnc/OWe84m+uYV4o0ePZunSpQwePJjDc+h2uyIi\nhUiJPlU++ig69x2rc2fIoSnu6PcUB3YA0bgPHHkYMiTuTQRFRCTLaOo+VZo7Pm+W7kjabP16B64D\nTgc2Azm1xEBERFCiT50cn7avq6ujvHwKcC/wNyAC/DOXPoKIiKCp+9TJ4RX3+/fv5+qrr+a99x6O\naT0F6JIrH0FERAJK9KmSoxV9bW0tkyZN4tFHH41pnQA8DBQp0YuI5Bgl+lTJwYp+3759TJgwgYUL\nF8a0TgLmAO0B0CXsRURyixJ9quRYRb93717KysqoqKiIab0WmM2BpRw9ekCnTpmITkRE2kqL8VLB\nPacq+j179nDRRRc1SPIXXvg94AFi/0Sy+HuKiIg0Q4k+FbZtg08+adjWqRNk6bXgZ86cyeLFi+uf\nT58+nTFj/gdoeCpgln5PERGRFijRp0Jz0/ZZeg79jTfeyMiRIwG4+eab+cUvfsGGDU1jVUUvIpJ7\ndIw+FXLsPvQdO3akoqKC8vJyrrjiCswsl448iIhIC1TRp0K8ij6LsuSexpfmBTp16sSVV16JBbMO\nObaWUEREmqFEnwpZXNFv376ds88+m9tvv73FfqroRUTyg6buUyFLK/qtW7cyYsQIXnvtNV599VWK\ni4uZPn16k361tbBxY9PXZ8l3FRERaQUl+lTIwop+y5YtDB8+nDfeeAMAM6Nr165x+27eHE32sY46\nKnrzPRERyS1K9KmQZRX95s2biUQivPPOOwC0a9eOhx56iIkTJ8btr+PzIiL5Q4k+bDU1sHNnw7bi\nYjj22IyEU11dTSQSoaqqCoD27dszf/58LrvssmZfo+PzIiL5Q4k+bPHK4dJSaJf+dY/r168nEonw\nQZC5i4qKWLBgAePHj2/xdVk2ISEiIklQog9blhyfr6qqIhKJUF1dDUCHDh0oLy9n3Lhxh3xtlnwE\nEREJgRJ92LKgHK6treW8886rT/IlJSVUVFRw3nnnJfR6Td2LiOQPnUcftiwoh4uKinjggQcoKSmh\nU6dO/P73v084yYMW44mI5BNV9GHLgooeYPjw4Tz99NN07NiRoUOHJvy6ujrYsKFpuxK9iEhuUqIP\nW4bmvevq6mjXaMHfqFGjWr2fLVtg796GbV26QDOn3IuISJbT1H3YMjB1v3r1agYNGsSH8WYTWknH\n50VE8osSfZj+9S/Yvr1hW1ER9OyZsrf84x//yIgRI3j99deJRCJsjHft2lbQ8XkRkfyiRB+meFmy\nd+9osk+BZcuWMWrUKD7++GMAdu3axY4dO5Lapyp6EZH8okQfpjQuxHv++ec5//zz2b17NwA9evSg\nsrKSk08+Oan9qqIXEckvSvRhStPx+cWLF3PBBRfU31e+Z8+erFixgpNOOinpfauiFxHJL0r0YUpD\nRV9RUcGFF17I3mBpfGlpKStXrmTAgAGh7F8VvYhIftHpdWFKcUW/cOFCLr/8cvbv3w9Av379WL58\nOX1a8R61tVBZCe++G3+7KnoRkfyiRB+mFFb0lZWVXHbZZdTV1QHQv39/li1bRu/evRPeR10dfPOb\nsHhx4u/buTN069baaEVEJFto6j5MKazozzrrLMaMGQPAwIEDWbFiRauSPMDKla1L8hAN36x1rxER\nkeyhRB+WTz+Fjz5q2GYWPb0uBMXFxSxcuJBp06ZRWVnJcccd1+p9vPpq6983hPV9IiKSQZq6D0u8\naftevaC4OLS3KCkp4Z577mnz6+NNOLSkuBimTm3z24mISBZQog9LyMfn77rrLmpqarjtttvaHlMj\n8UIcNQr69Wva3q0bXHwxDBoU2tuLiEgGKNGHJcTj87fffjszZswAolP2P/7xj5MI7KB4If70p3DG\nGaHsXkREspCO0YclhPPS3J1bbrmlPskDvPDCC/XnzCfDXafOiYgUIlX0YUnySjPuzowZM7jjjjvq\n2yKRCIsWLaI4hOP8NTXRe+7EKimBY45JetciIpLFlOjDkkS57O5Mnz6dWbNm1beNHDmSiooKOnXq\nlLLw+vSBdprTERHJa0r0YWljRV9XV8e0adO477776tvGjBlDeXk5HTt2zHR4IiKS45Tow/DZZ7B5\nc9P20tIWX1ZXV8eUKVN48MEH69suuugiFixYEMp0fSwdnxcRKUyauA1DdXXTtmOPhUNU5NOmTWuQ\n5MvKynj88cdDT/Kgil5EpFAp0YehjeXyJZ
dcwmGHHQbAFVdcwfz58+nQoUO4sQVU0YuIFCZN3Yeh\njRfLOeecc3jmmWd48sknuffee2nfvn34sQVU0YuIFCYl+jAkcbGcSCRCJBIJN544VNGLiBQmTd2H\nIYGKfs+ePXznO9+hqqoqPTHF2LULduxo2FZUBG24L46IiOQYJfowHKKi3717N2PHjuXhhx8mEonw\nQWvvLpOkeN9DSkshhUcKREQkSyjRh6GFefGPP/6Y888/n6VLlwJQXV1NRUVF+mIj1Mvwi4hIjtEx\n+mTt2webNjVtLy1l165djB49mpdeeqm++dZbb+X6669PY4Ch31hPRERyiBJ9sjZuhLq6hm3du1Oz\nbx+jRo1izZo19c133nknN910U5oDVEUvIlLIlOiTFadc3tarF98491zWrl1b33b33Xdz3XXXpTOy\neqroRUQKlxJ9shqVyx8BI9av5/WdO+vbZs+ezZQpU9IbVwxV9CIihUuJPlkx5fI2YBjwZpDkzYw5\nc+YwefLkzMQWUEUvIlK4tOo+WTHlchfgpOBxu3btmDdvXsaT/CefwNatDdvatYNevTITj4iIpJcq\n+mTFlMtFwG8Avv51Lpo6lbKyskxFVW/DhqZtvXtDii6pLyIiWUaJPlmNDoB3AJ6YPRv7ylcyEk5j\nOj4vIlLYNHWfhPfefpufffgh3qjdsugAuI7Pi4gUNlX0bfTWW28RGTKELXV1fAL8DDCAI4+EI47I\nbHAxVNGLiBQ2VfRt8Je//IUhQ4awJVjldg+w/sDGLCuXddc6EZHCpkTfSuvWrWPYsGFsDZL84cBz\nwOcPdMiycln3oRcRKWxK9K3w8ssvE4lE2LZtGwBHlJTwPDAktlOWlcuq6EVEClvGE72ZjTKzd8ys\nysx+GGd7iZk9EWxfY2Z90x8lrFq1iuHDh1NTUwNA165dWTpyJGc17phF5fKePbBlS9P2449Pfywi\nIpIZGV2MZ2btgfuBEcBG4GUzW+Tub8Z0mwzscPcvmFkZcCdwaapj2/lhDeuu/SUAr21fz4/WzuXT\n/XsBOKJDJ+4cOIETVi1t8ronX+3Lez9PdXSJCb6TNNCzJ5SUpD8WERHJjEyvuj8dqHL39wHM7HFg\nHBCb6McBtwSPfwvcZ2bm7o3PagvVzvXbGbrkRywDbgY+Ddq7Ay/u+5STV90X93X/9Wgf/pzKwJKU\nRRMOIiKSBpmeuu8FVMc83xi0xe3j7rXATuCotEQHdAbaB4+PBSqBk1vo/wF9UxxRcnR8XkSksGQ6\n0YfGzK4xs1fM7JWtjS/unoTTgT8AXwRWAF9qoe9GelFD19DeOxVObulbioiI5J1MJ/pNQOzSsN5B\nW9w+ZlZE9N4x2xrvyN0fdPfB7j64e/fuoQZ5FvBX4MRD9Ludmwkum5OVevaEiRMzHYWIiKRTpo/R\nvwz0N7N+RBN6GXB5oz6LgInAKmA8sCzVx+cBOvfqQuXXfpBQ39qijvytdBhHlA4hsVekX69eMH48\nHHdcpiMREZF0ymiid/daM5sKLCF6KPwhd3/DzG4FXnH3RcCvgflmVgVsJ/plIOW69T+KoasTXz4/\nPIWxiIiItFWmK3rc/Vng2UZtP4l5vAf4VrrjEhERyQeZPkYvIiIiKaRELyIikseU6EVERPKYEr2I\niEgeU6IXERHJY0r0IiIieUyJXkREJI8p0YuIiOQxJXoREZE8pkQvIiKSx5ToRURE8pgSvYiISB5T\nohcREcljSvQiIiJ5TIleREQkj5m7ZzqG0JnZVuDDEHd5NPDPEPdXqDSOydMYJk9jmDyNYfLCHsM+\n7t493oa8TPRhM7NX3H1wpuPIdRrH5GkMk6cxTJ7GMHnpHENN3YuIiOQxJXoREZE8pkSfmAczHUCe\n0DgmT2OYPI1h8jSGyUvbGOoYvYiISB5TRS8iIpLHlOhjmNkoM3vHzKrM7IdxtpeY2RPB9jVm1jf9\nUWa3BMbwBjN708xeN7MXzaxPJuLMZocaw5h+F5uZm5lWP8eRyDia2SXB3+MbZvZYumPMdgn8ey41\ns+Vmti74Nz06E3FmKzN7yMw+MrO/NrPdzOzeYHxfN7NTUxKIu+sneviiPfA34PNAMfAa8KVGfb4L\n/DJ4XAY8kem4s+knwTEcBhwWPJ6iMWz9GAb9PgesBFYDgzMdd7b9JPi32B9YBxwZPD8m03Fn00+C\nY/ggMCV4/CXgg0zHnU0/wL8BpwJ/bWb7aOA5wIAzgDWpiEMV/UGnA1Xu/r677wUeB8Y16jMOmBc8\n/i1wrplZGmPMdoccQ3df7u67g6ergd5pjjHbJfJ3CHAbcCewJ53B5ZBExvFq4H533wHg7h+lOcZs\nl8gYOnBE8LgLsDmN8WU9d18JbG+hyzjgEY9aDXQ1s+PCjkOJ/qBeQHXM841BW9w+7l4L7ASOSkt0\nuSGRMYw1mei3WTnokGMYTO8d7+6L0xlYjknkb/FE4EQze8nMVpvZqLRFlxsSGcNbgAlmthF4Fvhe\nekLLG639f2abFIW9Q5FEmNkEYDAwJNOx5BIzawfMAq7KcCj5oIjo9P1QojNLK83sZHevyWhUueUy\nYK6732VmZwLzzezL7l6X6cDkIFX0B20Cjo953jtoi9vHzIqITlVtS0t0uSGRMcTMhgMzgLHu/lma\nYssVhxrDzwFfBirN7AOix/UWaUFeE4n8LW4EFrn7PndfD7xLNPFLVCJjOBlYCODuq4CORK/hLolJ\n6P+ZyVKiP+hloL+Z9TOzYqKL7RY16rMImBg8Hg8s82BFhQAJjKGZDQJ+RTTJ65hoUy2OobvvdPej\n3b2vu/clus5hrLu/kplws1Yi/56fJlrNY2ZHE53Kfz+dQWa5RMZwA3AugJkNJJrot6Y1yty2CLgy\nWH1/BrDT3f8e9pto6j7g7rVmNhVYQnS16UPu/oaZ3Qq84u6LgF8TnZqqIrrAoixzEWefBMdwJtAZ\nKA/WMW5w97EZCzrLJDiGcggJjuMS4Btm9iawH7jR3TVDF0hwDL8PzDGz64kuzLtKxc9BZraA6JfJ\no4N1DD8FOgC4+y+JrmsYDVQBu4FJKYlD/01ERETyl6buRURE8pgSvYiISB5TohcREcljSvQiIiJ5\nTIleREQkjynRi0iLzOyq4C55V2U6FhFpPSV6EclKZlZpZjr/VyRJumCOiBxKBdEr8IV+xS4RST0l\nehFpkbvvJHqnRhHJQZq6FykwZtY3OOY+18y+aGZPm9l2M/vEzP7PzL7RqH+DY/Rm1tHMaszso+Dm\nTvHe44HgNWMatZ9rZn8I3u8zM3vXzH5uZl0ax0dwZ8NgPwd+KkMeDpG8p0QvUrj6AauAbkRvNFQO\nnAY8Z2aXNvcid98DPAF0B85rvN3MSoBLgX8Af4hpvxZYCnyd6A1l7iZ6z4gfAH8ys65B1xrgP4EP\ng+f/GfMzt02fVKSA6Vr3IgXGzPoC64On/+3uN8ZsG0w0+X8M9HH3XUEl/zAwyd3nBv3OBP4EPOnu\n4
xvt/1tEb106y92/H7T1IXob2M+A09397Zj+s4EpwBx3vyamvRIY4u4W1mcXKUSq6EUK107g1tiG\n4Ha3vwG6Ahc298Lg3uPvAheYWbdGmw/cynleTNsEoBi4LzbJB2YA/wKuCGYDRCRESvQihWutu/8r\nTntl8HvQIV4/j2jyrr9ds5n1AEYC69z99Zi+pwa/lzXeibvvANYRvZf5FxOKXEQSpkQvUrj+0Uz7\nluB3l2a2H/AIUMfBCh7g20TP5pnXqO+BfTV3it6B9q7NbBeRNlKiFylcPZppPzb43eIpde6+kWiF\nfrqZHajEJwL7gMcadT+wr2OJ77hE3lNEWk+JXqRwnWpmn4vTPjT4vS6BfcwNfk80s68CXwGec/et\njfod2NfQRu0Eq+2/CuwB3orZtD/Y3j6BOESkGUr0IoWrC/CT2IZg1f23iVbWFQns4ylgF9HFdlcF\nbXPj9HuUaKX/PTP7QqNttwFHAI+6+2cx7duC36UJxCEizdCV8UQK10rg383sa8BLRKfPLyVaAFzr\n7rsOtQN3/9TMyoHJwHeJJufFcfp9YGbXAfcDa81sIbCV6EVxzgTeJno+fawXgW8BT5nZs8CnwIfu\nPr8tH1akUKmiFylc64GzgB3AfwCXAGuB0e7+RCv2Mzf43QFY4O5743Vy99lEV+SvBi4GbgCOAWYC\nZ7r79kYv+V/gDqIzDzcRrfwntyIuEUEXzBEpODEXzJnn7ldlNBgRSTlV9CIiInlMiV5ERCSPKdGL\niIjkMR2jFxERyWOq6EVERPKYEr2IiEgeU6IXERHJY0r0IiIieUyJXkREJI8p0YuIiOSx/wdSwEuL\nP6kZDwAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig = plt.figure(figsize=(8, 8))\n", + "results = pd.concat(dfs)\n", + "pivot_plot(results, fig=fig);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "all,-slideshow", + "formats": "ipynb,Rmd" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/doc/source/learning/Learning1.Rmd b/doc/source/learning/Learning1.Rmd deleted file mode 100644 index 359cbe982..000000000 --- a/doc/source/learning/Learning1.Rmd +++ /dev/null @@ -1,26 +0,0 @@ ---- -jupyter: - jupytext: - cell_metadata_filter: all,-slideshow - formats: ipynb,Rmd - text_representation: - extension: .Rmd - format_name: rmarkdown - format_version: '1.1' - jupytext_version: 1.1.1 - kernelspec: - display_name: Python 3 - language: python - name: python3 ---- - -# Learning 1 - -```{python} -import numpy as np -print('notebook 1') -``` - -```{python collapsed=TRUE} - -``` diff --git a/doc/source/learning/Learning1.ipynb b/doc/source/learning/Learning1.ipynb deleted file mode 100644 index 6ead7af9e..000000000 --- a/doc/source/learning/Learning1.ipynb +++ /dev/null @@ -1,63 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Learning 1" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "notebook 1\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "print('notebook 1')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "jupytext": { - "cell_metadata_filter": "all,-slideshow", - "formats": "ipynb,Rmd" - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.2" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/doc/source/learning/Learning2.Rmd b/doc/source/learning/Learning2.Rmd deleted file mode 100644 index aca1f8f4d..000000000 --- a/doc/source/learning/Learning2.Rmd +++ /dev/null @@ -1,26 +0,0 @@ ---- -jupyter: - jupytext: - cell_metadata_filter: all,-slideshow - formats: ipynb,Rmd - text_representation: - extension: .Rmd - format_name: rmarkdown - format_version: '1.1' - jupytext_version: 1.1.1 - kernelspec: - display_name: Python 3 - language: python - name: python3 ---- - -# Learning 2 - -```{python} -import numpy as np -print('notebook 2') -``` - -```{python collapsed=TRUE} - -``` diff --git a/doc/source/learning/Learning2.ipynb b/doc/source/learning/Learning2.ipynb deleted file mode 100644 index 66c0d95dc..000000000 --- a/doc/source/learning/Learning2.ipynb +++ /dev/null @@ -1,63 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Learning 
2" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "notebook 2\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "print('notebook 2')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "jupytext": { - "cell_metadata_filter": "all,-slideshow", - "formats": "ipynb,Rmd" - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.2" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/doc/source/learning/index.rst b/doc/source/learning/index.rst index 8a74213aa..7ee107c8a 100644 --- a/doc/source/learning/index.rst +++ b/doc/source/learning/index.rst @@ -8,5 +8,5 @@ case considered above. .. toctree:: :maxdepth: 2 - Learning1.ipynb - Learning2.ipynb \ No newline at end of file + Basic_example.ipynb + Full_model_LASSO.ipynb \ No newline at end of file diff --git a/selectinf/learning/core.py b/selectinf/learning/core.py index 7ad6b4b68..a293dc9fd 100644 --- a/selectinf/learning/core.py +++ b/selectinf/learning/core.py @@ -364,22 +364,8 @@ def _inference(observed_target, else: weight_val = np.squeeze(weight_fn(target_val)) - if DEBUG: - import matplotlib.pyplot as plt, uuid - plt.plot(target_val, weight_val) - id_ = 'inference_' + str(uuid.uuid1()) - plt.savefig(id_+'_prob.png') - plt.clf() - weight_val *= ndist.pdf((target_val - observed_target) / target_sd) - plt.plot(target_val, weight_val) - plt.plot(target_val, ndist.pdf((target_val - observed_target) / target_sd), label='gaussian') - plt.plot([hypothesis], [0], '+', color='orange') - plt.legend() - plt.savefig(id_+'_dens.png') - plt.clf() - exp_family = discrete_family(target_val, weight_val) pivot = exp_family.cdf((hypothesis - observed_target) @@ -474,7 +460,13 @@ def repeat_selection(base_algorithm, sampler, min_success, num_tries): return set(final_value) -def cross_inference(learning_data, nuisance, direction, fit_probability, nref=200, fit_args={}): +def cross_inference(learning_data, + nuisance, + direction, + fit_probability, + nref=200, + fit_args={}, + verbose=False): T, Y = learning_data @@ -514,7 +506,8 @@ def new_weight_fn(nuisance, direction, weight_fn, target_val): weight_val = new_weight_fn(d_T) exp_family = discrete_family(d_T, weight_val) - print(ref_Y) + if verbose: + print(ref_Y) pval = [exp_family.cdf(0, x=t) for t, y in zip(ref_T, ref_Y) if y == 1] pvalues.append(pval) diff --git a/selectinf/learning/learners.py b/selectinf/learning/learners.py index c34a80d5a..717ab1e08 100644 --- a/selectinf/learning/learners.py +++ b/selectinf/learning/learners.py @@ -191,7 +191,8 @@ def learn(self, fit_probability, fit_args = {}, B=500, - check_selection=None): + check_selection=None, + verbose=False): """ fit_probability : callable @@ -206,11 +207,14 @@ def learn(self, check_selection : callable (optional) Callable that determines selection variable. + verbose : bool + Print out probability of selection? 
""" learning_selection, learning_T, random_algorithm = self.generate_data(B=B, check_selection=check_selection) - print('prob(select): ', np.mean(learning_selection, 0)) + if verbose: + print('prob(select): ', np.mean(learning_selection, 0)) conditional_laws = fit_probability(learning_T, learning_selection, **fit_args) return conditional_laws, (learning_T, learning_selection) diff --git a/selectinf/learning/utils.py b/selectinf/learning/utils.py index a590a418b..4eeb77b77 100644 --- a/selectinf/learning/utils.py +++ b/selectinf/learning/utils.py @@ -441,14 +441,16 @@ def lee_inference(X, try: import matplotlib.pyplot as plt - def pivot_plot(df, - outbase, - figsize=(8,8)): + def pivot_plot_old(df, + outbase=None, + figsize=(8,8), + verbose=False): - print("selective:", np.mean(df['pivot']), np.std(df['pivot']), np.mean(df['length']), np.std(df['length']), np.mean(df['coverage'])) - print("naive:", np.mean(df['naive_pivot']), np.std(df['naive_pivot']), np.mean(df['naive_length']), np.std(df['naive_length']), np.mean(df['naive_coverage'])) + if verbose: + print("selective:", np.mean(df['pivot']), np.std(df['pivot']), np.mean(df['length']), np.std(df['length']), np.mean(df['coverage'])) + print("naive:", np.mean(df['naive_pivot']), np.std(df['naive_pivot']), np.mean(df['naive_length']), np.std(df['naive_length']), np.mean(df['naive_coverage'])) - print("len ratio selective divided by naive:", np.mean(np.array(df['length']) / np.array(df['naive_length']))) + print("len ratio selective divided by naive:", np.mean(np.array(df['length']) / np.array(df['naive_length']))) f = plt.figure(num=1, figsize=figsize) plt.clf() @@ -457,7 +459,8 @@ def pivot_plot(df, plt.plot(U, sm.distributions.ECDF(df['naive_pivot'])(U), 'r', label='Naive', linewidth=3) plt.legend(fontsize=15) plt.plot([0,1], [0,1], 'k--', linewidth=2) - plt.savefig(outbase + '.pdf') + if outbase is not None: + plt.savefig(outbase + '.pdf') pivot_ax = plt.gca() pivot_ax.set_ylabel(r'P(pivot < t)') pivot_ax.set_xlabel(r't') @@ -514,25 +517,27 @@ def liu_inference(X, import statsmodels.api as sm def pvalue_plot(df, - outbase, + outbase=None, figsize=(8, 8), naive=True, split=False, - bonferroni=False): + bonferroni=False, + verbose=False): - print("selective:", np.mean(df['pvalue']), np.std(df['pvalue']), np.mean(df['length']), np.std(df['length']), np.mean(df['coverage'])) + if verbose: + print("selective:", np.mean(df['pvalue']), np.std(df['pvalue']), np.mean(df['length']), np.std(df['length']), np.mean(df['coverage'])) - if naive: - print("naive:", np.mean(df['naive_length']), np.std(df['naive_length']), np.mean(df['naive_coverage'])) - print("len ratio selective divided by naive:", np.mean(np.array(df['length']) / np.array(df['naive_length']))) + if naive: + print("naive:", np.mean(df['naive_length']), np.std(df['naive_length']), np.mean(df['naive_coverage'])) + print("len ratio selective divided by naive:", np.mean(np.array(df['length']) / np.array(df['naive_length']))) - if split: - print("split:", np.mean(df['split_length']), np.std(df['split_length']), np.mean(df['split_coverage'])) - print("len ratio selective divided by split:", np.mean(np.array(df['length']) / np.array(df['split_length']))) + if split: + print("split:", np.mean(df['split_length']), np.std(df['split_length']), np.mean(df['split_coverage'])) + print("len ratio selective divided by split:", np.mean(np.array(df['length']) / np.array(df['split_length']))) - if bonferroni: - print("bonferroni:", np.mean(df['bonferroni_length']), np.std(df['bonferroni_length']), 
np.mean(df['bonferroni_coverage'])) - print("len ratio selective divided by bonferroni:", np.mean(np.array(df['length']) / np.array(df['bonferroni_length']))) + if bonferroni: + print("bonferroni:", np.mean(df['bonferroni_length']), np.std(df['bonferroni_length']), np.mean(df['bonferroni_coverage'])) + print("len ratio selective divided by bonferroni:", np.mean(np.array(df['length']) / np.array(df['bonferroni_length']))) f = plt.figure(figsize=figsize) plt.clf() @@ -569,21 +574,29 @@ def pvalue_plot(df, pvalue_ax.set_ylabel(r'ECDF(pvalue)', fontsize=20) pvalue_ax.set_xlabel(r'pvalue', fontsize=20) - plt.savefig(outbase + '_pvalues.pdf') - plt.savefig(outbase + '_pvalues.png', dpi=300) + if outbase is not None: + plt.savefig(outbase + '_pvalues.pdf') + plt.savefig(outbase + '_pvalues.png', dpi=300) return pvalue_ax - def pivot_plot_new(df, - outbase, - palette = {'Learned': 'b', - 'Naive': 'r', - 'Bonferroni': 'gray', - 'Lee':'gray', - 'Strawman':'gray'}, - figsize=(8, 8), straw=False): - - f = plt.figure(figsize=figsize) + def pivot_plot(df, + outbase=None, + palette = {'Learned': 'b', + 'Naive': 'r', + 'Bonferroni': 'gray', + 'Lee':'gray', + 'Strawman':'gray'}, + fig=None, + figsize=(8, 8), + straw=False, + verbose=False): + + if fig is None: + f = plt.figure(figsize=figsize) + else: + f = fig + f.clf() new_df = pd.DataFrame({'Learned': df['pivot'], 'Naive': df['naive_pivot']}) if straw: @@ -598,8 +611,11 @@ def pivot_plot_new(df, ax.set_ylabel('ECDF(pivot)', fontsize=20) ax.legend(fontsize=15) - pngfile = outbase + '_pivot.png' - plt.savefig(pngfile, dpi=300) + if outbase is not None: + pngfile = outbase + '_pivot.png' + plt.savefig(pngfile, dpi=300) + else: + pngfile = None return ax, f, pngfile, df, new_df From 7ce3837c7ff1abd1d3cd63bafb80ec66cba119a8 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Tue, 24 Sep 2019 23:43:00 -0700 Subject: [PATCH 002/187] trying to fix .travis.yml switching when we change directory fixing path needed rtd theme need pandoc removing print statement --- .travis.yml | 27 +++++++++++++++------------ doc-requirements.txt | 1 + selectinf/learning/fitters.py | 2 -- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/.travis.yml b/.travis.yml index 881190701..7b9c78817 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,6 +3,7 @@ dist: trusty python: - 2.7 - 3.5 + - 3.6 notifications: email: false addons: @@ -71,7 +72,6 @@ matrix: - DEPENDS= - python: 3.6 sudo: true - dist: trusty env: - DOC_BUILD=1 @@ -90,14 +90,6 @@ before_install: install: # Install selectinf - - | - echo "backend : agg" > matplotlibrc - if [ "$DOC_BUILD" ]; then # doc build - pip install -r doc-requirements.txt - cd doc - jupytext --sync source/*/*.ipynb - # Build without the API documentation, for the doctests - make html - if [ "$RUN_R_TESTS" ]; then sudo apt-get install -y r-base r-base-dev r-cran-devtools r-cran-rcpp; pip install rpy2 statsmodels -c constraints.txt ; @@ -121,11 +113,22 @@ script: # No figure windows for mpl; quote to hide : from travis-ci yaml parsing - pip install -r requirements.txt -c constraints.txt; # older rpy2 # Change into an innocuous directory and find tests from installation - - mkdir for_testing - - cd for_testing - 'echo "backend : agg" > matplotlibrc' - + - | + if [ "$DOC_BUILD" ]; then + pip install -r doc-requirements.txt; + cd doc; + jupytext --sync source/*/*.ipynb; + sudo apt-get install pandoc; + make html; + fi + # + # # Build the htmlwithout the API documentation, for the doctests + # + # fi # Doctests only on platforms that have 
compatible fp output + - mkdir for_testing + - cd for_testing - if [ `uname` == "Darwin" ] || [ "${TRAVIS_PYTHON_VERSION:0:1}" == "3" ]; then DOCTEST_ARGS="--with-doctest"; diff --git a/doc-requirements.txt b/doc-requirements.txt index 37dc7d0d8..ab7ed399c 100644 --- a/doc-requirements.txt +++ b/doc-requirements.txt @@ -12,3 +12,4 @@ tensorflow keras nbsphinx jupytext +sphinx_rtd_theme diff --git a/selectinf/learning/fitters.py b/selectinf/learning/fitters.py index 525179102..c6edb396c 100644 --- a/selectinf/learning/fitters.py +++ b/selectinf/learning/fitters.py @@ -8,7 +8,6 @@ def gbm_fit_sk(T, Y, **params): fitfns = [] for j in range(Y.shape[1]): - print('variable %d' % (j+1,)) y = Y[:,j].astype(np.int) clf = ensemble.GradientBoostingClassifier(**params) clf.fit(T, y) @@ -24,7 +23,6 @@ def random_forest_fit_sk(T, Y, **params): fitfns = [] for j in range(Y.shape[1]): - print('variable %d' % (j+1,)) y = Y[:,j].astype(np.int) clf = ensemble.RandomForestClassifier(**params) clf.fit(T, y) From 3ea185bb728675a6d8ec653684782a66c373747c Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 25 Sep 2019 01:03:21 -0700 Subject: [PATCH 003/187] full model simulation --- doc/source/learning/Full_model_LASSO.Rmd | 32 ++++------ doc/source/learning/Full_model_LASSO.ipynb | 74 ++++++---------------- 2 files changed, 35 insertions(+), 71 deletions(-) diff --git a/doc/source/learning/Full_model_LASSO.Rmd b/doc/source/learning/Full_model_LASSO.Rmd index 31c9d66a2..bbbe6bf63 100644 --- a/doc/source/learning/Full_model_LASSO.Rmd +++ b/doc/source/learning/Full_model_LASSO.Rmd @@ -36,7 +36,7 @@ from selectinf.tests.instance import gaussian_instance # to generate the data from selectinf.learning.core import normal_sampler # our representation of the (limiting) Gaussian data from selectinf.learning.utils import full_model_inference, pivot_plot -from selectinf.learning.Rfitters import logit_fit +from selectinf.learning.fitters import gbm_fit_sk ``` We will know generate some data from an OLS regression model and fit the LASSO @@ -45,16 +45,16 @@ true parameters, hence we can then return pivots for each variable selected by the LASSO. These pivots should look (marginally) like a draw from `np.random.sample`. This is the plot below. 
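As a small numerical companion to the `pivot_plot` ECDF figure produced at the end of this notebook, uniformity of the pivots can also be summarized by how far their empirical CDF strays from the diagonal. This is a sketch only, not part of `selectinf`: it assumes only `numpy`, and `ecdf_deviation` is a hypothetical helper that could be applied to the `pivot` column of the data frame built below.

```python
import numpy as np

def ecdf_deviation(pivots):
    """Maximum gap between the empirical CDF of `pivots` and the Uniform(0,1) CDF."""
    pivots = np.sort(np.asarray(pivots))
    grid = np.arange(1, pivots.shape[0] + 1) / pivots.shape[0]
    return np.abs(pivots - grid).max()

# Reference value for truly uniform draws; well-calibrated pivots should be comparable.
print(ecdf_deviation(np.random.sample(2000)))
```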
-```{python} +```{python collapsed=TRUE} np.random.seed(0) # for replicability -def simulate(n=100, +def simulate(n=200, p=20, s=5, signal=(0.5, 1), sigma=2, alpha=0.1, - B=4000, + B=6000, verbose=False): # description of statistical problem @@ -69,7 +69,11 @@ def simulate(n=100, random_signs=True, scale=False)[:3] - dispersion = sigma**2 + + XTX = X.T.dot(X) + XTXi = np.linalg.inv(XTX) + resid = y - X.dot(XTXi.dot(X.T.dot(y))) + dispersion = np.linalg.norm(resid)**2 / (n-p) S = X.T.dot(y) covS = dispersion * X.T.dot(X) @@ -89,16 +93,12 @@ def simulate(n=100, noisy_S = sampler(scale=scale) loss.quadratic = rr.identity_quadratic(0, 0, -noisy_S, 0) problem = rr.simple_problem(loss, pen) - soln = problem.solve(max_its=50, tol=1.e-6) + soln = problem.solve(max_its=100, tol=1.e-10) success += soln != 0 return set(np.nonzero(success)[0]) - XTX = X.T.dot(X) - XTXi = np.linalg.inv(XTX) - resid = y - X.dot(XTXi.dot(X.T.dot(y))) - dispersion = np.linalg.norm(resid)**2 / (n-p) - + lam = 3.5 * np.sqrt(n) selection_algorithm = functools.partial(base_algorithm, XTX, lam) if verbose: @@ -112,8 +112,8 @@ def simulate(n=100, sampler, success_params=(1, 1), B=B, - fit_probability=logit_fit, - fit_args={'df':20}) + fit_probability=gbm_fit_sk, + fit_args={'n_estimators':500}) ``` Let's take a look at what we get as a return value: @@ -128,7 +128,7 @@ df.columns ```{python} dfs = [] -for i in range(10): +for i in range(30): df = simulate() if df is not None: dfs.append(df) @@ -139,7 +139,3 @@ fig = plt.figure(figsize=(8, 8)) results = pd.concat(dfs) pivot_plot(results, fig=fig); ``` - -```{python collapsed=TRUE} - -``` diff --git a/doc/source/learning/Full_model_LASSO.ipynb b/doc/source/learning/Full_model_LASSO.ipynb index 49845025b..fbceea950 100644 --- a/doc/source/learning/Full_model_LASSO.ipynb +++ b/doc/source/learning/Full_model_LASSO.ipynb @@ -23,29 +23,7 @@ { "name": "stderr", "output_type": "stream", - "text": [ - "/Users/jonathantaylor/anaconda/envs/py36/lib/python3.6/site-packages/sklearn/ensemble/weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. 
It will be removed in a future NumPy release.\n", - " from numpy.core.umath_tests import inner1d\n", - "Using TensorFlow backend.\n", - "/Users/jonathantaylor/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:455: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", - " _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n", - "/Users/jonathantaylor/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:456: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", - " _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n", - "/Users/jonathantaylor/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:457: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", - " _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n", - "/Users/jonathantaylor/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:458: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", - " _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n", - "/Users/jonathantaylor/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:459: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", - " _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n", - "/Users/jonathantaylor/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:462: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", - " np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n", - "R[write to console]: Loaded gbm 2.1.5\n", - "\n", - "R[write to console]: randomForest 4.6-14\n", - "\n", - "R[write to console]: Type rfNews() to see new features/changes/bug fixes.\n", - "\n" - ] + "text": [] } ], "source": [ @@ -60,7 +38,7 @@ "from selectinf.learning.core import normal_sampler # our representation of the (limiting) Gaussian data\n", "\n", "from selectinf.learning.utils import full_model_inference, pivot_plot\n", - "from selectinf.learning.Rfitters import logit_fit" + "from selectinf.learning.fitters import gbm_fit_sk" ] }, { @@ -77,18 +55,20 @@ { "cell_type": "code", "execution_count": 2, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "np.random.seed(0) # for replicability\n", "\n", - "def simulate(n=100, \n", + "def simulate(n=200, \n", " p=20, \n", " s=5, \n", " signal=(0.5, 1), \n", " sigma=2, \n", " alpha=0.1, \n", - " B=4000,\n", + " B=6000,\n", " verbose=False):\n", "\n", " # description of statistical problem\n", @@ -103,7 +83,11 @@ " random_signs=True,\n", " scale=False)[:3]\n", "\n", - " dispersion = sigma**2\n", + "\n", + " XTX = X.T.dot(X)\n", + " XTXi = np.linalg.inv(XTX)\n", + " resid = y - X.dot(XTXi.dot(X.T.dot(y)))\n", + " dispersion = np.linalg.norm(resid)**2 / (n-p)\n", "\n", " S = X.T.dot(y)\n", " covS = dispersion * X.T.dot(X)\n", @@ -123,16 +107,12 @@ " noisy_S = 
sampler(scale=scale)\n", " loss.quadratic = rr.identity_quadratic(0, 0, -noisy_S, 0)\n", " problem = rr.simple_problem(loss, pen)\n", - " soln = problem.solve(max_its=50, tol=1.e-6)\n", + " soln = problem.solve(max_its=100, tol=1.e-10)\n", " success += soln != 0\n", " \n", " return set(np.nonzero(success)[0])\n", "\n", - " XTX = X.T.dot(X)\n", - " XTXi = np.linalg.inv(XTX)\n", - " resid = y - X.dot(XTXi.dot(X.T.dot(y)))\n", - " dispersion = np.linalg.norm(resid)**2 / (n-p)\n", - " \n", + " \n", " lam = 3.5 * np.sqrt(n)\n", " selection_algorithm = functools.partial(base_algorithm, XTX, lam)\n", " if verbose:\n", @@ -146,8 +126,8 @@ " sampler,\n", " success_params=(1, 1),\n", " B=B,\n", - " fit_probability=logit_fit,\n", - " fit_args={'df':20})" + " fit_probability=gbm_fit_sk,\n", + " fit_args={'n_estimators':500})" ] }, { @@ -166,7 +146,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "{18, 13, 14}\n" + "{19}\n" ] }, { @@ -200,17 +180,14 @@ "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", - "text": [ - "/Users/jonathantaylor/git-repos/selectinf/selectinf/distributions/discrete_family.py:86: RuntimeWarning: divide by zero encountered in log\n", - " self._lw = np.array([np.log(v) for v in xw[:,1]])\n" - ] + "text": [] } ], "source": [ "dfs = []\n", - "for i in range(10):\n", + "for i in range(30):\n", " df = simulate()\n", " if df is not None:\n", " dfs.append(df)" @@ -223,7 +200,7 @@ "outputs": [ { "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAfoAAAHpCAYAAABqV/58AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0\ndHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nOzdd3xUVfrH8c9JJUGKdAQiooBrV1DX\nBjEJCBFpgmJDxMqKujYUFcQuylpwsWIBlQVlQXqREOBnQSy4qKiISm/SWwgp5/fHxJEkEzJJZube\nmfm+X6+8wpy5ufcJM5nnPvece46x1iIiIiKRKcbpAERERCR4lOhFREQimBK9iIhIBFOiFxERiWBK\n9CIiIhFMiV5ERCSCxTkdQDDUq1fPNm/e3OkwREREQuLrr7/eaq2t7+u5iEz0zZs356uvvnI6DBER\nkZAwxqwu6zlduhcREYlgSvQiIiIRTIleREQkginRi4iIRDAlehERkQgWkaPu/bF79262bNlCXl6e\n06FIJcTHx9OgQQNq1qzpdCgiIq4WlYl+9+7dbN68mSZNmpCUlIQxxumQpAKsteTk5LB+/XoAJXsR\nkcOIykv3W7ZsoUmTJiQnJyvJhyFjDMnJyTRp0oQtW7Y4HY6IiKtFZaLPy8sjKSnJ6TCkipKSktT1\nIiJSjqhM9IAq+Qig11BEpHxRm+hFRESigRK9iIhIBFOiD1PDhg2jXr16TocRNN9//z3GGBYsWOB0\nKCIiYU2JXkREJII5muiNMW8ZY7YYY74v43ljjBlpjFlpjFlmjDkj1DFKcXl5eRQUFDgdhoiI+Mnp\nCXPeAf4NjC3j+c5Ay6Kvs4FXir5LObZv387999/PlClT2LVrF2eccQbPP/88Z5/913/fv/71L8aP\nH8+KFSuoVq0aZ511Fs8//zzHHXecd5vU1FTq1atHx44dGT58OKtWrWLVqlW8+eab/Pvf/+bjjz9m\nwIABLFu2jNatWzNy5EguuOCCYrGMHj2a559/npUrV9KoUSNuvfVWBg0aVGybl19+maeeeort27eT\nlpbG7bffHtz/IAlv1sKLL8KkSbB7t9PRiJQp5wBs3gS5ub6f3/S3C2n/zfNBjcHRRG+tXWSMaX6Y\nTboBY621FlhsjKltjGlsrd0YyDjcdJeWtVXfR25uLhkZGezcuZNnn32WBg0a8Morr5CRkcEvv/xC\no0aNAFi3bh0DBw7k6KOPZvfu3bz66quce+65/PLLL9SqVcu7v08//ZRff/2V4cOHk5yc7H1u//79\nXHvttdx55500atSIRx55hJ49e7J69WqSk5MBePbZZ3nggQcYNGgQqampfP311wwZMoTk5GQGDhwI\nwJQpU7j11lu55ZZb6N69OwsXLqR///5V/4+QyPXCC3DXXU5HIVKuJKD5IY9zgD+AlKLHOzYdE/wg\nrLWOfhX9H3xfxnPTgfMPeZwFtC1vn23atLGHs3z58mKPPenVHV/+evjhh23dunV9Pjd69GgbHx9v\nV6xY4W3Ly8uzLVq0sPfcc4/Pn8nPz7f79++3RxxxhB0zZoy3vX379rZatWp206ZNpY4P2KysLG/b\n0qVLLWBnzZplrbV2165dtnr16nbYsGHFfnbIkCG2YcOGNj8/31pr7Zlnnmk7depUbJsbbrjBAjY7\nO/uw/w8lX0uJEmec4fwfq770VcGvvWDTwKaAXVXU9nnj7gH5kwC+stZ3ToyYwXjGmJuMMV8ZY776\n448/nA7HUfPmzaNNmzYcc8wx5Ofnk5+fD0D79u356quvvNstXryYDh06ULduXeLi4khOTmbv3r2s\nWLGi2P7atGlDw4YNSx0nISGB1NRU7+MTTjgB8FwpAPj888/Zt28fvXv39saRn59PWloamzdvZt26\ndeTn5/PNN9/QrVu3Yvvu2bNnQP4vJEKtXOl0BCIVsgdPX/R8YA1wIRCqTien++jLsx5odsjjpkVt\npVhrXwdeB2jbtq0NfmjutXX
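Alongside the dispersion change, the selection probability is now learned with scikit-learn gradient boosting (`gbm_fit_sk`, `n_estimators=500`) rather than the R-backed `logit_fit` with 20 df, and the number of learning simulations `B` rises from 4000 to 6000. The real fitter lives in `selectinf.learning.fitters`; the sketch below only illustrates the idea, and the wrapper `sk_prob_fit` and its `(T, Y)` interface are assumptions for exposition, not the package's API:

```python
# Hedged sketch of a gradient-boosting "fit_probability": the helper
# sk_prob_fit and its signature are illustrative assumptions; the actual
# gbm_fit_sk in selectinf.learning.fitters may differ.
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier

def sk_prob_fit(T, Y, n_estimators=500):
    """T: (B, k) perturbed statistics; Y: (B,) 0/1 selection indicators."""
    clf = GradientBoostingClassifier(n_estimators=n_estimators)
    clf.fit(np.asarray(T), np.asarray(Y))
    # return a callable giving the estimated selection probability at t
    return lambda t: clf.predict_proba(np.atleast_2d(t))[:, 1]
```

Compared with a fixed-degrees-of-freedom logistic fit, the boosted model lets the estimated selection probability vary more flexibly with the statistic, consistent with the patch also raising the number of simulated selections used to train it.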
... (remainder of the replaced pivot-plot PNG, base64, omitted) ...\n",
+      "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAfoAAAHpCAYAAABqV/58AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0\ndHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nOzdd3hURRfA4d+kFyB0kCC9qIiKRCwI\nhN4hNAVURJSmIPKBKCBVFCmKonQUQVEQpHcIRUEQUcSCSpEivfdA2nx/3CUku5tkk2zf8z5PHtiz\nd+8eIOTsnTtzRmmtEUIIIYR38nN1AkIIIYRwHCn0QgghhBeTQi+EEEJ4MSn0QgghhBeTQi+EEEJ4\nMSn0QgghhBcLcHUCjlCwYEFdqlQpV6chhBBCOMXPP/98TmtdyNpzXlnoS5Uqxa5du1ydhhBCCOEU\nSqkj6T0nQ/dCCCGEF5NCL4QQQngxKfRCCCGEF5NCL4QQQngxKfRCCCGEF/PKWfe2uHLlCmfOnCEh\nIcHVqYhsCAwMpHDhwuTJk8fVqQghhFvzyUJ/5coVTp8+TWRkJKGhoSilXJ2SyAKtNXFxcRw/fhxA\nir0QQmTAJ4fuz5w5Q2RkJGFhYVLkPZBSirCwMCIjIzlz5oyr0xFCCLfmk4U+ISGB0NBQV6chcig0\nNFRuvQghRCZ8stADciXvBeTfUAghMuezhV4IIYTwBVLohRBCCC8mhd5DDR8+nIIFC7o6DYf5448/\nUEqxefNmV6cihBAeTQq9EEII4cVcWuiVUp8ppc4opf5I53mllJqolDqglPpNKfWws3MUaSUkJJCU\nlOTqNIQQQtjI1Q1zPgc+Aeak83xjoLzp61FgiulXkYkLFy7w5ptvsnTpUi5fvszDDz/MhAkTePTR\nO39977//PvPmzWPfvn2EhIRQrVo1JkyYQLly5VKOiY6OpmDBgjRo0IAxY8Zw+PBhDh8+zKeffson\nn3zC+vXr6dmzJ7/99hsVK1Zk4sSJ1KhRI00uM2fOZMKECRw4cICiRYvyyiuvMGDAgDTHTJ48mdGj\nR3PhwgXq1KnDq6++6ti/ICGE650+DUOGwM8/g5deQMTdNP6Yt25af/7UvbWp9csEh+bg0kKvtf5O\nKVUqg0NaAnO01hrYoZTKq5S6S2t90p55uNMqLa1zfo5bt25Rr149Ll26xLhx4yhcuDBTpkyhXr16\n7N+/n6JFiwJw7NgxevXqRcmSJbly5QpTp07liSeeYP/+/URERKScb9u2bRw8eJAxY8YQFhaW8tyN\nGzd4/vnn6du3L0WLFmXEiBG0bt2aI0eOEBYWBsC4ceMYNGgQAwYMIDo6mp9//pkhQ4YQFhZGr169\nAFi6dCmvvPIKPXr0ICYmhi1bttClS5ec/0UIIdzXjRtQuzb89ZerM3GoUKBUqsdxwFmghOnxxVOl\nHZ+E1tqlX6a/gz/SeW4F8GSqx7FAVGbnrFq1qs7I3r170zw2yqt7fNlq2LBhukCBAlafmzlzpg4M\nDNT79u1LiSUkJOgyZcro/v37W31NYmKivnHjhs6VK5eePXt2SrxWrVo6JCREnzp1yuL9AR0bG5sS\n2717twb06tWrtdZaX758WYeHh+vhw4enee2QIUN0kSJFdGJiotZa60ceeUQ3atQozTEvvfSSBvSm\nTZsy/Hsw/7cUQniIN990/Q9cJ39dA10HdAnQh02x7XfF2OWvE9il06mJXjMZTynVTSm1Sym16+zZ\ns65Ox6U2bNhA1apVKV26NImJiSQmJgJQq1Ytdu3alXLcjh07qF+/PgUKFCAgIICwsDCuXbvGvn37\n0pyvatWqFClSxOJ9goKCiI6OTnl83333AcZIAcD27du5fv067dq1S8kjMTGROnXqcPr0aY4dO0Zi\nYiK//PILLVu2THPu1q1b2+XvQgjhhn7/HcaPd3UWTnUV4170RuAoUBu44qT3dvU9+swcB+5O9bi4\nKWZBaz0dmA4QFRVlhwFwz3Xu3Dl27NhBYGCgxXNly5YF4OjRozRo0IBq1aoxbdo0ihUrRlBQEE2b\nNuXmzbQ3k6wVeYDcuXPj53fns2JQUBBAyuvPnTsHQKVKlay+/r///iM4OJikpCQKFy6c5jnzx0II\nL5GcDN27g+kCxBdcxijy21PFugLO2o7L3Qv9MqCXUmoexiS8y9rO9+e9Uf78+YmKimLKlCkWzwUH\nBwOwZs0abty4wdKlSwkPDwcgMTGRCxcuWLwmu61m8+fPD8CKFSusflioWLEioaGh+Pv7W2xOI5vV\nCOGlZsyA7dst4/36wbPPOj8fB3j3XfhmgfH7RC7zLy8Tx96U59947n+0bvoc/wCRd0VYP4kdubTQ\nK6W+BqKBgkqpY8AwIBBAaz0VWAU0AQ4AN4AXHJGH9rLr/7p167Ju3TpKlCiR7pVxXFwcfn5+BATc\n+Rb45ptvUob57eHxxx8nNDSUEydO0LRp03SPq1KlCkuXLqVHjx4psUWLFtktDyG81uXL8MMPcO2a\nqzOxTWIivPGGZbxCBXjnHTBdiLhSXJzxV2rlmscmZ8/CWwvBKCvnMMrWnSL/8ccfp0xEdhZXz7rv\nkMnzGnjFSel4nPj4eBYuXGgRb9y4MVOnTiU6Opr+/ftTpkwZzp8/z86dOylatCh9+/alTp06JCUl\n8cILL/Diiy/y559/Mn78ePLmzWu3/PLmzcvw4cPp06cPR44coWbNmiQnJ7Nv3z42bdrE4sWLARg0\naBCtW7emZ8+etGrVii1btrBmzRq75SGEV9q5Exo2hEuXXJ1Jzk2b5hZF/vRpqFUL/vnHLmcD6gF3\n2sRMmzaNbt262ePkWeLuQ/ciA1evXqVdu3YW8U2bNrFp0yaGDh3KsGHDOH36NIULF6ZatWq0aNEC\ngMqVK/P5558zfPhwFi9ezIMPPsiCBQt4+umn7ZrjgAEDKFasGBMmTOD9998nJCSEChUqpHmfVq1a\n8fHHH/Pee+8xe/ZsoqOj+fTTT2nYsKFdcxHCa1y/Du3aeUeR79wZUk3qdaUxY+xV5JOBZtwu8kop\nPv30U154wSGD0plS2tvGrTEm46WeXW7ur7/+4t5773ViRsJR5N9S+KT+/eH9912dRc4VLAh//w0F\nCrg6E7SGu++G41ane2fHRqApEM+cOXN47rln7HViq5RSP2uto6w9J1f0QgjhSXbvhg8/dHUWOefn\nB1OmuEWRB6M5n/2KPEAdQkOXMmLEZZ57znLk1Zmk0AshhKdISjKWppm3iw0JgWbNXJNTduTPD089\nBXXrujqTFEuWWMZKloRHHrHt9VrrNCuUiheHLl0aULmynRLMASn0QgjhKSZPhp9+sowPHQoDBzo/\nHy9irdC/8Qb07Jn5a//55x+6dOnC3LlzKVWqlN1zyykp9EII95eUBCNHwvLlnrOUzBGOHrWMVapk\nrEEX2bZ/P/z5p2XcNHc5Q3v37k3p9lmnTh22bNnC3Xff
nfkLnUgKvRDC/fXubdzPFZamTQNTV0qR\nPdau5qtVg8jIjF/322+/Ua9ePW63XT99+jSHDx92u0LvNb3uhRBe6rvvpMinp1s3qF7d1Vl4PGuF\nPiYm49f88ssv1K5dO6XI58qVizVr1lhs0+0OpNALIdzXrVvG5DNhqXBheO89V2fh8U6dst6RN6NC\nv3PnTurWrZvSMjwiIoL169e7ZZEHKfRCCHc2bpyxzlqkVbw4rFoF+fK5OhOPt3y5ZRv0ihUhvfYc\n27Zto169elwyNSvKly8fsbGxPPbYYw7ONPvkHr0Qwj3t2wejRlnGH30UZs+GbG625PGCgox1X776\n57ezrAzbb968mWbNmnH9+nUAChYsyIYNG3jwwQcdmGHOSaH3UMOHD2fEiBE0aNCAtWvXpnmubdu2\nnDt3js2bN9t0rsOHD1O6dGmWL19OM09aiys8y8GD8OOPkJBg2/EzZxpD96n5+8P06cYllxA5dPUq\nbNhgGbdW6Hfv3k2TJk2Ii4sDjO27Y2Nj092G251Iofdw69at46effuIRW7s6WHHXXXexfft27rnn\nHjtmJkQqU6fatiA5M/36wQMP5Pw8QgBr1kB8fNpY0aLGjHtz999/Pw0bNmTJkiUUK1aMjRs3UtFD\nPnDKPXoPlj9/fipXrsw777yTo/MEBwfz2GOP2XXnOiFS7NkD9tiWs1QpGDYs5+cRAkhOhgkTLOMt\nWxrdec0FBgYyb948unbtypYtWzymyIMUeo+mlGLw4MEsW7aM33//3eoxJ0+epEuXLpQpU4bQ0FAq\nVKjAW2+9RXyqj7GHDx9GKcWKFSsA6Ny5s9URgkmTJhEWFsbVq1cBSE5O5r333qNcuXIEBwdToUIF\nZs+e7YA/qfBY6bVszY7JkyEsLOfnEQLjDlBWZ9sHBwczffp0ypUr57jEHEAKvYdr164d5cuXT/eq\n/ty5c+TPn58PPviANWvW8PrrrzNr1ix69+6d7jmffvppdu3axaFDh9LE58+fT5MmTcidOzcAvXv3\nZtSoUXTr1o2VK1fSqlUrunTpkvKBQQimTjXuy+dU9+7QuHHOzyMEcPIkvPmmZfyee6BePeP3X331\nFUOGDMEbdniVe/TgXrNXs/hN5efnx8CBA3nxxRcZOXIkFSpUSPN85cqVGT9+fMrj6tWrEx4eTpcu\nXfj4448JstJRq379+hQoUID58+fzpul/w/Hjx9m6dSvffPMNAAcOHGDKlCnMmjWL559/HoB69epx\n8uRJRowYIZP6BJw4Yb3/esmSUKuWbecICoInn4TnnrNvbsKn9e0Lly9bxqdNg4AAmD17Ni+88AJa\nawIDAxk6dKjzk7QjKfRe4Nlnn2XEiBGMHj2aWbNmpXlOa81HH33E9OnTOXToEDdv3kx57ujRo1aH\noAICAmjdunWaQr9gwQLCw8Np2rQpALGxsfj5+dGqVSsSExNTXlu3bl2+/vprkpKS8Pf3d8QfV3iK\nPn2Mac3mvvzSKN5CuMDq1TB/vmW8SxeoWRNmzJhB9+7dU67kFyxYQL9+/QgPD3dypvYjQ/deICAg\ngAEDBvDll19y5MiRNM99+OGH9O/fn1atWrF06VJ27tzJpEmTANIUfXPt27fn119/Zd++fYAxbN+i\nRQtCQ0MB45ZAUlISERERBAYGpnx17tyZxMRETp486aA/rfAIK1bAwoWW8a5dpcgLh7l+3fh8WaEC\n3H239a82bSxfV7AgjB1rzEPq1q1bSpF/6KGH2LRpk0cXeZAreq/RpUsXRo0axZgxY9LEFyxYQNu2\nbdPcw9+7d2+m56tVqxZFihRh/vz5dOrUiR07djAw1TBs/vz5CQgIYNu2bfhZmaJauHDhHPxphEdL\nTIRXX7WMS8tW4WBvvAGm65gsmTABZs/+gH6pdgGMiopi7dq15M+f344ZuoYUei8RHBxM//79GThw\nIFWrViUwMBCAuLg4goOD0xw7d+7cTM/n7+9Pu3btmD9/PiEhIeTNm5dGjRqlPF+nTh2SkpK4fPky\n9evXt+8fRni2778Hs4mcgPHT1At+aAr3FBcHZncubVK3Lvz333sMGnTnQuaxxx5jzZo1RERE2DFD\n15GhezAmwLnLVw50796d3Llz88MPP6TE6tevz/z585k8eTJr166lU6dOHDhwwKbzPf300/z5559M\nmDCBmJiYNBP3KlasSI8ePWjfvj1jxowhNjaWlStXMnbsWF566aUc/TmEh7PWU7R2bejQwfm5CJ+x\nYQPcuJG11wQFaSpVGpmmyNeoUYN169Z5TZEHKfReJSwsjL59+6aJDR06lA4dOvDWW2/RoUMHgoKC\nmDhxok3nq169OnfffTcnT56kffv2Fs9PmjSJIUOGMGfOHJo0aULnzp1ZuXIlNWvWtMufR3ggra0X\n+i5d3Gt1i/A61r7tMlKkCLRv/w4TJ95pwlS7dm1Wr16dsoTYWyhvWCNoLioqSu/atSvd5//66y/u\nTW9rIuFR5N/SzezeDQ8/nDYWEABnzshOa8JhkpKM1rXnzqWNf/bZnXXxqfn5QbFisG3bVho1asT1\n69dp0KABixcvJsxDmzIppX7WWkdZe07u0Qsh7MfaZVV0tBR54VA//GBZ5END4emnM26m+OSTT7Ji\nxQomT57MnDlzCAkJcWyiLiKFXghhP4sXW8Yy6ikqhB1Y+3zZoIFtHZOjo6OJjo62e07uRO7RCyHs\n4+BBsLbnQosWzs9F+Iz0poWYf75MSkpiwIAB7N+/3zmJuREp9EII+1i61DIWFWV0KRHCQX7/Hf79\nN23Mzw9Sd+FOTEykc+fOjBs3jjp16ljs4+HtpNALIezDlssqIezM2rddjRpGtzuAhIQEnnnmGb78\n8ksAjh07xvTp052Yoev57D16rTVKlvt4NG9cMeKxzpyBbdss41LohYNl9PkyPj6e9u3bszjV3JHu\n3bunu9unt/LJK/rAwEDi4uJcnYbIobi4uJQOgMLFli+H5OS0sXLl4L77XJOP8AlHjhgrOs3FxBh7\nebRu3TpNke/duzdTpkyx2rbbm/nkFX3hwoU5fvw4kZGRhIaGypW9h9FaExcXx/HjxylSpIir0/FM\nX38N48YZPynt4fp1y1hMjDTJEQ5lbVrIQw9B4cI3aNmyFevWrUuJ9+/fn7Fjx/rkz3ufLPR58uQB\n4MSJEyQkJLg4G5EdgYGBFClSJOXfUmTBypXQsaPj36dVK8e/h/Bp1obtmzS5TrNmzdm0aVNKbPDg\nwbz99ts+WeTBRws9GMVeioTwOdeuwcsvO/59ihSBRx91/PsIn3X+PHz3nXn0GqtXN2b37q0pkREj\nRjB06FCn5uZufOtGhRC+bvhwOHrU8e/z7LPg7+/49xE+a8UKo/VtaiVLhlChQmTK49GjR/t8kQcf\nvqIXwufs3g0ffujY9/Dzg+bNYcgQx76P8HnWhu1btQpg7NgvSEhI4Mknn7TY5MtXSaEXwhckJUG3\nbpaXQCEhsGM
HFC9un/cJCzOajAvhQDduwNq1lvGYGGP+zoIFC3xuZn1GpNAL4YlOnIB16+DCBduO\n/+svsLaj49Ch8OCD9s1NCAdbvx6MFdKngS+AfhQooKhe3XheinxaUuiF8DS7dxt7b9pa5NNTqRL0\n62efnIRwImPY/gRQF/gbOEezZqMJCPDNWfWZkY89QniS+HhjoltOizzA9OkQFJTz8wjhRImJsGTJ\nf0AtjCIPMJ6HH7ayoZIApNAL4VnGjYO9e3N+nu7d4Ykncn4eIZxs4cJDXLpUEzhgigQQFDSPrl0f\ncGVabk2G7oXwFAcOwNtv5/w8998Po0fn/DxCONmBAwfo2rUO8J8pEggsoGnTljIHNANS6IXwBFpD\njx5w61bauL+/0QAnwMb/yhUqwNNPQ7589s9RCAf6+++/qVu3LteunTBFgoHFQGPZOykTUuiF8ARz\n50JsrGW8b19jOF8IL/bHH39Qr149Tp8+bYqEAsuAevj7p917XliSQi+Eu7lyBXr3Nlp/3bhhxMyv\n5AFKlDA63Qnhxfbs2UO9evU4d+6cKRIOrACiAahVC/Lnd1FyHkIKvRDu5s03Yc6czI+bPBnCwx2f\njxAuFB8fT3x8PAB+frlJTl4NVE95XobtMyez7oVwJzdvwhdfZH5cu3bQtKnj8xHCxR555BFWr15N\nRERxkpPXk7rIA7Rs6Zq8PIlc0QvhTjZuNHaYy0hEhON71gvhRiIjnyA+/gDGBLw7qlc37mCJjMkV\nvRDuxNpOHandf7/R+rZYMefkI4STbd68mb2pekVoDb16QVxc2iKvFLz/vrOz80xyRS+Eu0hKgqVL\nLeNffWXciPTzg+Bgy+eF8BLr1q2jZcuW5M2bly1btlChQgUWLTLmpZp7+WV49FHn5+iJ5IpeCHfx\n449w5kzaWHCwse1raKgUeeHVVq5cSfPmzbl58yanTp2iU6dOXLqk6d3b8ti77oJ33nF+jp5KCr0Q\n7sLasH39+pArl/NzEcKJlixZQqtWrVJm15coUYK5c+cyZIji5EnL4ydONKaqCNtIoRfCHWgNixdb\nxmXtkPByCxYsoF27diQkJABQunRpvvvuO86fL8ukSZbHN2sGbdo4OUkPJ/fohXAHe/cavexT8/Mz\nhu2F8FJz586lU6dOJCcnA1C+fHk2btxIkSLFadnS+PybWlgYfPKJMRFP2E6u6IVwB9aG7atXh8KF\nnZ+LEE4wa9YsnnvuuZQif++997JlyxaKFy/ORx/Bnj2Wrxk5EkqWdHKiXkAKvRDuwFqhl2F74aWm\nTZtGly5d0KZL9sqVK7N582buuusuDh+GYcMsX/PQQ9Cnj3Pz9BZS6IVwtf/+g127LOPS8kt4Ia01\nP/74Y8rjhx56iNjYjRQqVJjkZHjllTtbPNymFEyfbvsmjSIt+WsTwtWWLbOMVa4MZcs6PxchHEwp\nxYwZM7h16xa//rqfxMS1FCmSz+J+fGq9esEjjzgvR28jhV4IV5PZ9sLH+Pv7M3XqbCpUiOPUqdwZ\nHhsZCaNGOSkxLyVD90K40sWLsHmzZbxVK6enIoQjaK1ZuXJlyv3422JjAzIt8gAffwx58jgqO98g\nhV4IV1q50mh9m1qJEsbMIyE8nNaawYMH06xZM1577bU0xT6zbR3AmKYig1s5J4VeCFdKb7a9LBQW\nHk5rTf/+/Rk9ejQAEydO5LPPPgMgMRGWL0//tblzQ8eOMHeu/FewB7lHL4SrxMXBmjWWcbmEER4u\nOTmZPn368Mknn6TEmjVrxjPPPAPA1q1w4ULa14SFwblzxrYOwr6k0AvhKrGxcP162li+fFCjhmvy\nEcIOkpOT6dGjBzNmzEiJtW7dmq+//pqgoCDA+kBWo0ZS5B1Fhu6FcBVrP+2aN5fFwsJjJSUl8eKL\nL6Yp8k8//TTz5s1LKfJaS38oZ5NCL4QrJCVZXz8vP+2Eh0pMTKRTp058/vnnKbHnnnuOL7/8ksDA\nwJTYnj1w5Eja1/r7Q9OmTkrUB0mhF8IVtm+Hs2fTxkJCoEED1+QjRA4kJCTQsWNHvvrqq5RYly5d\nmDVrFgFmI1TWruZr1YL8+R2dpe+SQi+EK1j7adegAYSHOz8XIXLo8OHDbNiwIeXx7Xv0/v7+FsdK\nfyjnk0IvhLPJ3vPCy5QvX57169cTERHBq6++yuTJk/Hzsywv//4Lv/1m+XrZ1sGxZNaPEM5y65Zx\nb37vXuMnXmqy97zwcFWrVuXXX3+lZMmSqHQWvy9dau11Ro8o4Tguv6JXSjVSSv2jlDqglHrTyvMl\nlFKblFK7lVK/KaWauCJPIbLt0CF44gljoXB4uPXdOWrUgIIFnZ+bENlw7do1/vrrL4t4qVKl0i3y\nILPtXcWlhV4p5Q9MAhoD9wEdlFL3mR32FvCN1roK0B6Y7NwshciBuDjj3vv27ZCcnP5x8tNOeIgr\nV67QqFEjatasyR9//GHz6w4eNBrlmJNvfcdz9RV9NeCA1vpfrXU8MA8wv1ujgdtbGkQAJ5yYnxA5\nM2oUHDiQ+XFyk1J4gEuXLtGgQQO2bdvGuXPnqFevHhcvXsz0dVrDyy9bftYtWxYqVXJQsiKFq+/R\nRwL/pXp8DHjU7JjhwDqlVG8gHKjnnNSEyKE//4SxYzM/rm5dKF3a8fkIkQPnz5+nQYMG/PLLLymx\ngQMHki9fvkxfO28erFtnGX/2Well7wyuvqK3RQfgc611caAJ8IVSyiJvpVQ3pdQupdSus+brk4Vw\ntuRk6N7d2L3DXGio8RURYUzAmz3b+fkJkQVnz56lTp06aYr85MmT6dOnT6avvXgRXnvNMl6iBPTv\nb88sRXpcXeiPA3enelzcFEvtReAbAK31diAEsJi1pLWerrWO0lpHFSpUyEHpCmGjTz+Fbdss40OH\nwo0bxtelS0Z3vMhI5+cnhI1OnTpFdHQ0v5nWxSmlmDlzJj179rTp9W+8AWfOWMYnTYJcueyZqUiP\nq4fufwLKK6VKYxT49kBHs2OOAnWBz5VS92IUerlkF+7r9GkYMMAyXr48DBzo/HyEsNGRI/DNN3DC\nNBPq2rXjfPttHS5e3AeAUn40aPA5f/zxHH37Zn6+mzchVdv7FG3aQLNmdkxcZMilhV5rnaiU6gWs\nBfyBz7TWfyqlRgK7tNbLgH7ADKVUX4yJeZ211tp1WQuRib59jat1c1OnGm1uhXBD//xjrPK8c+fz\nKFAHOGh67I/WX7J2bXvWrs3+++TODRMn5ihVkUWuvqJHa70KWGUWG5rq93uB6s7OS4hsWbsWvv7a\nMv7881CnjvPzEcIGycnQpYv59gszuVPkAzAWRbXJ8XuNHg3FiuX4NCILXH2PXgjvceMGWLtvWaAA\njB/v/HyEsNHMmfDDD+bR4UAXIAhYhD2KfLVq0KNHjk8jskgKvRD2MmqU0QXP3Pjx0vVOuK1Tp4wJ\nc5b8gOnADiDn7ZmLFjUWmFjZ50Y4mMuH7oXwCr//DuPGWcajo41heyHc
1J0pJUcwFkEZ139DhkC+\nfP5AlRy/R6FC0LCh8atwPin0QuRUemvmg4KMCXjSEUTY0dWrxv5I9rB1q9HMBnYD9YFWwDSef96P\nkSPt8x7C9aTQC5FTM2YYvezNDRoEFSs6Px/hlQ4fhg4dYMcOe5/5J6ABcAmYSUhIPsaPt6Gjo/AY\nco9eiJw4edL6Dc4KFeBNi80Yhci2Xr0cUeS3Y3QVv70cNC8DBrSTKSVeRgq9EDnRty9cvmwZnzYN\ngoOdn4/wSnFxsGaNvc/6HcaV/BXT4wJUrbqR4cOtbKMsPJoUeiGya/VqmD/fMt65szEJTwg72b0b\nkpLsecZYjN3Br5keFyIkZBNz51aRKSVeSO7RC5EdN24Y+26aK1hQ1swLu/vpJ8tYcHD2esXHx6/l\n6tUY4CYAShWlatVYJky4T6aUeCkp9EJkx4gRxuwoc++/bzTIEcKOdu60jI0Ykd769/QtX76ctm3b\nAvEAREZGsnHjRipUqJDzJIXbkqF7IbLqt9+Mgm6uTh147jnn5yO8nrUr+keyeCs9Li6Onj17Eh9v\nFPmSJUvy3XffSZH3AVLohciK22vmzW+YBgfDlCmyZl7Y3cWLsH9/2phSULVq1s4TGhrKqlWryJ8/\nP2XKlGHLli2UKVPGfokKtyVD90JkxbRp1tc4DR5sLKkTws527bKMVawIERFZP9cDDzxAbGwshQoV\nIjIyMufJCY8ghV4IW504YTbIkZMAACAASURBVH1t/D33WN9/Xgg7sDZsX62aba+9cOEC+fPnTxN7\n6KGH7JCV8CQydC+ErV57Da5csYzLmnnhQNYm4tlyf37q1KmUL1+e3bt32z8p4VGk0Athi5UrYcEC\ny/iLL0LNms7PR/iM7EzEmzhxIj179uTChQvUr1+fvXv3OiY54RGk0AuRmevX4ZVXLOOFCsFY6Qku\nHOf4ceOOUWqBgfDgg+m/Zty4cfTp0yflcdmyZbnrrrsclKHwBFLohcjM8OFw5IhlfMIEMLv/KYQ9\nWbuaf+ABCAmxfvyoUaMYkGq+yBNPPMH69evJly+fgzIUnkAKvRAZOX4cPvzQMl6/PnTs6Px8hE+x\ndn/e2kQ8rTVDhw5lyJAhKbGaNWuydu1a8uTJ48AMhSeQWfdCZOTbby33mQ8JkTXzwilsuT+vtWbg\nwIGMGTMmJVa3bl2WLl1KeHi4gzMUnkAKvRAZWbLEMvbaa1C2rPNzET4lOTnzQq+15n//+x8fphp1\natSoEYsWLSI0NNQJWQpPIEP3QqTn/Hn47jvLeIcOzs9F+JwDByx3QA4Ph3vvvfN48eLFaYp8ixYt\nWLJkiRR5kYYUeiHSs3KlZavb0qWhcmXX5CN8irX781Wrgr//ncetWrXiFdOKkDZt2rBgwQKCpaeD\nMCND90KkZ/Fiy1hMjNybF05hS0c8pRQTJ06kSpUqPP/88wQEyI90YUm+K4Sw5sYNWLvWMh4T4/xc\nhE+ydkX/8MOJJCaSpqD7+fnx4osvOjEz4Wlk6F4Ia9avh7i4tLGCBeGJJ1yTj/ApW7da2zspgTlz\n2tOlSxeSzG8pCZEBuaIXwhprs+2bNwcZGhUOFh9v7ISc1i1CQp5izZplAAQGBjJjxgz8/ORaTWRO\nfmoJYS4xEZYvt4zLsL1wgvHjIW1r+jigDTdvrk6J5MmTByVzRYSN5OOgEOa2bjWW1qUWFmZ0wxPC\ngQ4cgJEjU0duAC2AO0X+jTfe4IMPPpBCL2wmV/RCmLM2bN+oEcjaZGFnyclw6NCd5ou9esGtW7ef\nvQY0A7akHD906FCGDx8uRV5kiRR6IVLT2nqhl2F7YWdbtkC7dnD2rLVnLwNNgB9SIqNGjWLw4MFO\nyk54Eyn0QqQWG2u5U52/PzRt6pp8hFc6cQJatIArV6w9exFoCNxZSD9u3Dj69+/vpOyEt5FCL8Rt\nN2/Cyy9bxmvVku1ohV316ZNekQd4jtRF/qOPPuLVV191RlrCS8lkPCFuGz0a9u+3jL/wgvNzEV5r\nxQpYuDCjI8YDhQGYOnWqFHmRY3JFLwTAX38Zhd5c9eqy77ywm2vXwNSaPo2wMChe/Pbv7+Hhh2N5\n8sndvPDCc85NUHglKfRCJCcbHUoSEtLGAwNh2jSQpiTCToYPh6NHzaPJTJ/uxzPPpI7db/oSIuek\n0Avx+efw/feW8QEDoFIlp6cjPNOFC/DJJ/D778biDXNaw9Kl5tEj5M4dQ5kyU4DHnJCl8EVKW/uO\n9HBRUVF6165drk5DeIIzZ+Cee+DixbTxsmWNn9iydl7Y4Pp1ePxx41vGdv8CtYGjREREEBsbS9Wq\nVR2ToPB6SqmftdZR1p6TMUnh2/r1syzyAFOmSJEXNhs+PKtFfh9QEzDG8ePi4jh9+rT9ExMCGboX\nvmzDBvjyS8v4M89Iu1ths19/hQkTsvKKvUBd4BQAISEhLFmyhIYNGzogOyGk0AtfFRcHPXpYxvPl\ngw8+cH4+wiMlJRnzOG3fNfY3oB5gtMMLCwtj+fLl1KlTx0EZCiGFXviqd96Bgwct4+PGQeHCzs9H\neKSpU2HnTst4z55gXrv//fcX3n67PteuXQAgV65crFq1iho1ajghU+HLZDKe8D1//glVqlgup6tR\nAzZvluV0wiYnThjzOK9eTRu/915jOD8o6E5s586dNGzYkEuXLgHGNrNr1qzh8ccfd2LGwptlNBlP\nruiFb5E188JOXn3VssgDTJ+etsgfOXKEevXqcdV0cN68eVm/fj1RUVZ/Jgthd/JTTfiW1ath2zbL\n+JtvGpdiQthg+XL49lvLeNeu8OSTaWMlSpSge/fuABQoUIBNmzZJkRdOJVf0wrd8841lrFw5GDTI\n+bkIj3TtmrFvvLnCheG99yzjSinGjh1LaGgoTz31FPffLx3vhHNJoRe+IzHRuBQzN2YMhIQ4Px/h\nkYYNs9bG1lhil94mh0opRo4c6djEhEiHDN0L3/H995bNccLDoXFj1+QjPM7u3fDhh5bxBg2gQwfj\n98uXL+fZZ58lMTHRuckJkQ65ohe+Y8kSy1ijRtIBT9gkKQm6dTPmc6YWEgKTJ4NSsGjRIp5++mkS\nExPRWjNnzhz8/f1dk7AQJnJFL3yD1tYLfUyM83MRHmnSJLC2anfIEGNrhHnz5vHUU0+lXMn/+OOP\nnD9/3slZCmFJCr3wDbt3W95Y9feHpk1dk4/wKMeOweDBlvFKlaB/f5gzZw7PPPMMSaYWeRUrVmTL\nli0UluZLwg1IoRe+wdrVfHS00fJWiEy8+qox297ctGnwxRef0rlzZ5JNY/r33XcfmzdvJjIy0slZ\nCmGdFHrhG6wV+latnJ+H8DhLl8LixZbx7t1hz57JvPTSS9zuMPrAAw+wefNmihYt6uQshUifFHrh\n/Q4etL6HaIsWzs9FeJSrV62vmS9SBEqU+JBXXnklJfbwww+zceNGChUq5MQMhcicFHrh/axdzUdF\nwd13Oz8X4VGGDjXuz5tr2nQ
Kgwf3TXn86KOPEhsbS4ECBZyYnRC2keV1wvNoDQcOwJkzth0/f75l\nTGbbCyv+++/OnM0TJ2DiRMtjGjaEgQMbsHZtJMePH6d69eqsWrWKPHnyODdZIWwkhV54lkuXoHlz\n2Lo1Z+eRQi9SSUiATp1g3ryMjwsNhSlToHTpssTGxjJs2DBmzpxJrly5nJOoENkghV54lpdfznmR\nL18e7rvPPvkIr/Duu5kXeYDhw6F0aeP3FStWZJ4tLxLCxbJV6JVS9wE1gRJAQSAOOAP8Cnyntbay\neaMQObR2LXz9dc7PExNjtDETAvjnH6PQp08DAylTpgl9+9Z0UlZC2I/NhV4pVRzoBnQB7rodNjtM\nA0lKqQ3AFGCFvr3uRIicuHEDevbM+XmCgox1UUJgTPfo0QPi49M7IhnoBUzh1KlJ7Nq1jscff9x5\nCQphB5kWeqVUfmA40B0IBA4DXwE/AaeAC0AoUAC4B3gciAYaAv8opfpprVfbP3XhU95+Gw4dsow/\n/LDtO88VLw59+xr9SoUAZs+GzZst4/feC3nzJnHwYHfOnPkUgBs3rjFz5kwp9MLj2HJFfwAIBmYC\ns7XWOzN7gVIqD9AeYwRghVKqr9bayvxVIWzw++8wfrxlvHZtiI2VYXiRLefOGe1rzZUuDTt2JNKr\nVxe2b/8iJd6xY0emTZvmxAyFsA9bCv0XwLta69O2nlRrfQWYDkxXSsUAstm3sI3WMGcOLFxodCsB\no+GN+ZafQUEwdaoUeZFt/fuDtT1nPv44ge7dO6WZaNe5c2dmzpwpO9EJj5Rpodda98nJG2itrXQr\nESIdM2cae4FmZvBgqFDB8fkIr7RxozFsb+6pp+L57LMOLFq0KCXWrVs3pkyZgp+f9BcTninL37lK\nqRKmofmMjsmtlCqR/bSEz/rgg8yPqVgR3njD8bkIr3TzpjEBz1xExC0uXWqbpsj36tWLqVOnSpEX\nHi07372HgMyu8l81HSeE7f7+2/jKzLRpEBzs+HyEVxo9GvbvN48mUrx4DOvWLU+J9OvXj4kTJ6Lk\n9pDwcNkp9ArLZXVC5Jy1nvSpKWVc8deq5Zx8hNf5+2+j0JurXj2AVq0eSXk8aNAgxo0bJ0VeeAVH\ndcYrClx30LmFt7JW6Hv3htatjSJfuTLkz+/8vIRXSE42WigkJKSNBwYag0T33TeC+PhbhIWFMXTo\nUCnywmvYVOiVUp3MQg9ZiQH4Y3TLexawsi+oEOk4cQJ+/NEy3quXTLoT2ZKcDL/9ZiyjA9i+Hb77\nzvK4AQOgUiUAxXvvvScFXngdW6/oP8foeofp15amL3O3/4fcAEbkKDPhW5Yts4zde68UeZEtZ89C\ngwbw66/pHXEBGEuZMiMZPDgoJSpFXngjWwv9C6ZfFfAZsARYauW4JOA8sF1rfcmWEyulGgEfYYwG\nzNRav2flmKcwuvNpYI/WuqONeQtPYW3YXnaYE9mgNXTpklGRPwfUB36lWLH9BATMw2j6KYR3sqnQ\na61TVpwqpZ4Hlmit5+T0zZVS/sAkjP91x4CflFLLtNZ7Ux1THhgIVNdaX1RKFc7p+wo3c/mysbDZ\nnBR6kQ2LF8OKFek9exqoB/wBwLZti9myZQv16tVzUnZCOF+WJ+NprWvb8f2rAQe01v8CKKXmYdwS\n2JvqmK7AJK31RdP7n7Hj+wt3sHq15QypYsUgKso1+QiPdfmyMX/TuhNAXcBYwqmU4rPPPpMiL7xe\ntmfdK6XCgNZAFSAvcBn4BVistbZ1xn0k8F+qx8eAR82OqWB6v20Yw/vDtdZrspu3cEPpDdtLkxKR\nRW+9ZczrNPfII//xxx91iIs7AICfnx9z5szhmWeecXKGQjhfdvejbwLMBvKTdk29BiYopV7QWqc7\neJZFAUB5jB3xigPfKaUqm88BUEp1w9hEhxIlpCmfx7h1C1atsozLsL3Iop07YdIky3jt2oc5fLgO\ncXFGD6+AgAC++uor2rVr5+QMhXCN7LTAfRhYhHEVPxdjf/rGpl/nmuILlVJVbTjdceDuVI+Lm2Kp\nHQOWaa0TtNaHgH0YhT8NrfV0rXWU1jqqUKFCWfxTCZfZuPHO5jW3RURIUxyRJYmJxhYJWqeNh4Qc\n5J9/anHItMVxYGAgCxculCIvfEp2xkYHY1y519Bad9Jaf661Xmv6tRPwpOn5QTac6yegvFKqtFIq\nCGNrW/N1VkswruZRShXEGMr/Nxt5C3e0YIFlrGlTY3c6IWw0bRrs2WMe3UdQUE1OnDgKQHBwMEuW\nLKFlS2srg4XwXtkp9DWABVrrHdae1Fr/CCw0HZchrXUi0AtYC/wFfKO1/lMpNVIp1cJ02FrgvFJq\nL7AJeF1rbWVzSeFxdu82tqQ1J8P2IotmzLCM3XtvCPnzGx8YQ0JCWLZsGU2aNHFyZkK4Xnbu0UeQ\ndgKdNUeBDHe4u01rvQpYZRYbmur3Gvif6Ut4i6QkY6w1KSltPFcuaNTINTkJj3TokLWrefj88xIU\nKrSRJk2aMHnyZGrXtueCISE8R3YK/QmMZXEZiQJOZuPcwldMngy7dlnG33wTcud2fj7CY1lbtPHw\nw1CtGkBpfv/9dwICHLWthxDuLztD96uAOkqpN00Nb1IopfyUUv0wOlJYmUotBHDsGAwebBmvVAle\nf935+QiPdqfQ7wQ2AGnv/kiRF74uO/8D3gZigHeA7kqp7zGu3otiTMQrBZwCRtkpR+HpLl821j7F\nxRmPp061nGkPxowqmYQnsuDsWdi6FWAbxuKfRGAVMTHRrkxLCLeSnc54p5RS1YFpGK1rS5odsh7o\nobWWoXsBmzdD8+Zw7VrGx3XrBtWrOyUl4T1WrIDk5M1AM27vjB0Q8AIVKvwDyIdGISCbDXO01oeB\nhkqpSIzOeBEYnfF2a63N18ELX3X+PLRrl3mRL1wY3rPYy0iITE2fvgFoAZhGiyjCM88sJzhYirwQ\nt+Xo5pWpqEthF9YNGHBnM/CMfPgh5Mvn+HyEV/n221Xs2NEauGWK3AVs5KWX7nFhVkK4n+x0xvtG\nKdVYKSWNyEX6tmyBzz7L/Lg2baB9e8fnI7zK0qVLad8+hjtF/m7gOwoVuofHH3dhYkK4oexc0bcF\n2gBnlFJfArO11n/YNy3h0W7dgu7dLeO5c0N0tPH7wEDjnnzv3qCU5bFCpGPhwoV06NCBxMREU6QU\nRi+tUrRsCf7+6b9WCF+UnUL/GNAZeBroB/xPKbUbY5Obr7XWNozVCq82Zgz884/1eM+ezs9HeI2v\nvvqKTp06kZTSaKksRpE3tsyQpopCWMrOrPudwE6l1GsYs2CeBxoCHwHjlVIrgTnAClOLW+HNrlyB\nQYPghx/u7Clvrcg/9pj1q3whbJCUZHxOnDr1SKoiXxHYCBQDIDwc6tZ1VYZCuK9sT8bTWsdj9LRf\nqJQqBDyLUfRjgJbAeaCwPZIUbiopCRo3Nop8RgICYPp02V9eZNv//gcTJwIMxLgvvxCI
BYqkHNO4\nMYSEuCQ9IdyaXX7yaq3Paq0nYCy164/RtaKAPc4t3NikSZkXeYB+/aByZcfnI7zSDz/cLvK3DQN+\nJHWRBxm2FyI9din0SqmKSql3gSPAOCAQOGCPcws3lV4bW3OlS8PQoZkfJ4QV8+d/S9euN82iCghP\nE8mTx9jdWAhhKduFXimVVynVUym1A9gLvImxY92nGHvVV7RTjsId9e6deSOckiVh+XIIC3NOTsKr\nvPfee7Rv35a9e9sC8ekeFx4On34KefM6LzchPEmW79ErpZoDnTB6TgYBGmMnidnAIq21+cdv4W2W\nLLG+ZVjnzsbNVIDQUChbVpbOiSzTWvP2228zbNgwU2QlxtYaIwCIijIKu1LGtI9y5SA42FXZCuH+\nsjMZb6np130YxX2OtL31IVevGlfz5ooUgQkT5LJK5IjWmrfeeot33303VbQ2MAAwCvv06fDAAy5J\nTwiPlJ1CPw2jSc4OeycjPMDQocb9eXMffSRFXuSI1poBAwYwfvz4VNEGwGLAuP3z2mtQpYorshPC\ncymttatzsLuoqCi9a9cuV6fhfc6dg2LF7qyXv61RI1i1SobpRbZprXnttdeYmGZ6fVOMZXTGmrkS\nJeDPPyFXLldkKIR7U0r9rLWOsvZcjja1ET5mxQrLIh8aCpMnS5EX2ZacnMzLL7/MtGnTUkVbAfNI\nvdXspElS5IXIjkwLvVJqI8aEu+e11sdMj22htdbSp8qbWJuA162bsYROiGxISkqia9euzJo1K1X0\nKeBLjFW6htatoVkzZ2cnhHew5Yo+GqPQh6V6bAvvuyfgy27cgHXrLONt2jg/F+E1Ll68yPfff5/y\nOCLiWS5fnkXqH025c5s3zBFCZEWmhV5r7ZfRY+Ej1q2DuLi0sYIF4YknXJOP8AoFCxZk48aN1KpV\ni3z5ovnllxlA2u3n3n0XIiNdk58Q3kDu0QvbWBu2b9FC9gQVNjl3Dl5/HbZvt5zmAXeTnLyDPXsK\nYt7Dq1o12fBQiJySQi8yl5hodLgzJ83FhQ3i46FBA9i9G+AmsBt43Owoy/2v/P1h2jT5LClETuWk\nBe4zSqlYpdQFpVSi6dcNSqln7JmgcANbt8KFC2ljYWFQr55r8hEe5f33bxf5GxgbW0YDazN9Xd++\n8NBDDk1NCJ+Q5UKvlApUSi3F2HO+NpAbOGv6tQ4wRym1VCkVmMFphCdZvNgy1qiRsbROiAwcPAgj\nRwJcx+iavQ6jb30McDDd15UsCcOHOyFBIXxAdq7oBwLNMfaJrA2EaK3vwuhqUQfYifE/+g17JSlc\nSGvr9+dl2F5kQmvj/vrNm1eBRsCmVM8OAspafV25ckb/pfBwq08LIbIoO/foO2FsQRuttU7ZUkpr\nnQRsVkpFA38AnYFRdshRuNKvv8LRo2lj/v6yqFlk6uuvYf36S0Bj4E7H7Mcff48vvrB+HRASYjRf\nlP5LQthPdgp9ceDj1EU+Na31LdPQ/is5yky4htZGcd+zx/j9Wiv3UqOjIV8+p6cmPMeFC9CnzwWM\nXvU/p8Tz5v2Adev6Soc7IZwoO4X+BKlbVlkXaDpOeBKtjRlQH32U8XEybC8y8eqrZzl3rj6wJ1X0\nE7744hUp8kI4WXbu0X8FtFVK5bH2pFIqL9AWmJuTxIQLLFqUeZEHaNnS8bkIj7VkySnmzq3NnSKv\ngOm0bfuK3PERwgWyU+hHAruAnUqpjkqp4qaZ+MVNS+t2YEzIe9ueiQoHu3zZ+j7z5qpWhbvvdnw+\nwiPFx8PLL68B/jRF/IBZ5M7d1abPkEII+8vO0P3tPqgK+MLK8wooD9xUaWfUaK21NOhxV4MHw8mT\nmR/31luOz0V4rLFj4eTJzsAZjJn1XwAdePddY5KdEML5slN4v0c2rPEuP/5obDVr7sEHoUoV4/d5\n8kCrVsZEPCGs2L8fRqWssxkAtADu4dFHpY2tEK6U5UKvtY52QB7CVRISjK1mtdlnt/BwWLYMSpRw\nTV7CYxw+fJhChQrTs2cYt26lfuYeaWMrhBuQneh8zaZN8OSTxjjq7a/ffrM8buRIKfIiXXv2QOPG\nULjw35QtW518+VoSG3vT4rj//c8YGBJCuI7cM/clx49D06aW282aq1IFXn3VOTkJj3P8ONSuDRcv\n/gHUA06TnHwCeBZYmHJcyZIwbJiLkhRCpMj0il4p1V8pFZLdN1BKVVFKNc7u64Udffll5kXezw+m\nT4cA+QworOvTBy5e3IPRAfu0KRoO9Epz3JQp0sZWCHdgy9D9O8BBpdQbSimb5s0qQ0Ol1GKMpXgy\neOcOrG1OY65XL4iKcnwuwiMtXw7ffvszRpE/Z4rmxtiNLjrluKeeMob2hRCuZ8tlW2XgA2A0MEop\n9QOwFaOAnwQuYmxoUwC4B3gMqAsUBc5jfMyfZvfMRdacOGHMrk9PYCC0bw/jxjkvJ+FRrl2Dl17a\ngbFBzWVTNAKjyD8KGANCTZvCp5+6JkchhKVMC73Weh/QTCn1BEb/+jZADawvsbu9cP4fYAwwS2t9\n1U65ipxYtswyds89EBtr/D5vXmOPeSHS8dJLWzlzpglw+790fmAdM2ZUpUkTI5I7t/ElhHAfNt+I\n1Vr/APyglOoB1ASeBEpgXMnHYXTI+A3YrLX+M90TCdewttVs69bSxURYlZAAW7bAP/8Yj3/7bTPz\n5zfD2FceoCCwgQYNHuTFF2W3OSHcWXbW0V8FVpq+hCe4fBk2brSMy+Y0wopbt4xvjTVrbkeSgde5\nU+SLALGEhFRi8mQp8kK4O1lH7wtWrTIu0VKLjDT61gthZuzY1EUejB8Ty4AKQDFgC1CJoUOhbFkX\nJCiEyBKbruiVUp2AX7XWVjqrCLdnbdi+ZUtj5pQQqezbB++8Y+2Zu4CNGHfpylGpEvTr59TUhBDZ\nZOtP+s+BNOO8SqnnlVJWxoOFW7l1y7iiNyfD9sKM1tCjB6Y2ttY2OIoEypE7N3z+OQQFOTU9IUQ2\n5aQrSimglp3yEI6ycaOxLiq1iAjZnEZY+OILo0MyfAl0Bb4BmlOvHlSoYBxTpIixRv6ee1yWphAi\ni6T9mbez1iSnWTNj3bwQJufOGX3pYRbwIsbq2bYUKbKapUvryMpLITyY3KT1ZklJsHSpZVyG7YWZ\nAQPg/PlpQBfutMioyIQJ90uRF8LDSaH3Zps2wZkzaWPBwdCokWvyEW5p82aYNetjoEeq6EPExGyk\nQ4fCLspKCGEvWSn01jrhCXcVH2/sPmKufn3Ilcv5+Qi3dOsWtGv3PpB6t8JHyJNnI1OmFHRVWkII\nO8rKPfrhSqnh5kGlVFI6x2uttcwBcJXx42HvXst4+/bOz0W4rSZN3uXcucGpIo8Dqxk7NoKiRV2V\nlRDCnrJSiLPa/0r6ZbnKgQPw9tuW8agoKfQCAK01vXuPYOPGEamiNYEVPPFEbrp2dVVmQgh7s6nQ\na63lXr6n0Bpefhlu3kwb9/c39pn393dNXsK
t7Nz5E5MmpS7ydYBlBASEM22a9FISwpvI0Lqnu3wZ\nVq+G//4zHh89CuvXWx732mtQpYpzcxNu4+JFo63tsWPG40OHqgETMe7NNwIWAaG8/jrcf7/L0hRC\nOIAUek92+rTR+ObvvzM+rkQJGD7cGRkJN3TyJNSqBfv3mz/TGygONAGCKVMG3nrL6ekJIRwsy4Ve\nKfUg0BGoBhTCmI1/FvgR+Epr/btdMxTp69078yIPMGmSzLT3YS+/DPv3J2P0qQ83e7ZVyu+mTEHW\nzAvhhWwu9Eopf+BjoBvGRDvzyXa1gNeVUpOBPlprWY7nSCtXwoIFmR/Xtq3RCU/4pCVLYMmSJIxu\ndweBNVgWe+jYERo0cHJyQginyMoV/XiMjhrxGE2wNwPHMQp+MYzZPG2BV4CbwAB7JipSuX4dXnkl\n8+PKlIGJEx2fj3BLV69Cr16JwPPAV6Zoc2AlEJpyXPnyMGGC8/MTQjiHrdvUlsO4oXcEaKS1/sfK\nYZ8ppUZhXDL0VUpN1Vr/a79URYrhw+HIEct4p05Q2NTJrFQpaNfuzmPhcwYNSuD48Y7AwlTRMnTq\nFJTybVG2rDHoU1B64wjhtWy9on8O48q9czpFHgCt9d9KqeeBTcCzwMicpyjS+PVX65df9eoZe4cq\naV8g4IcfbvHJJ08Dqfc66Enjxp/w+ed+8m0ihA+xdbXsE8BfWustmR1oOmYv8GROEhNWJCVBt27G\nr6kFBxszqeSntwCuXbtJ48atSVvk+xASMonJk6XIC+FrbC3092DMqrfVj6bXCHuaPx9++skyPmQI\nlCvn/HyE27lx4wbVqjXnypVVqaIDgAmMHKkoVcpFiQkhXMbWQp8XOJPpUXecBvJlPR2RoSVLLGP3\n3Qevv+78XITbuXbtGk2bNuWvvzakig4B3uPBBxWvveaqzIQQrmTrPfpwjEW4troFyIpce7N2NT9u\nHAQFOT8X4Xb69evH5s2bU0XeBowOOJ98AoGBrshKCOFq0tHaU5w9C4cPp435+Rktz4QARo0aRWRk\nJdOjsdwu8qVLQ/XqLktLCOFiWVlHH6OUKmXjsdJU3d6sXc1XqgThls1PhG8qVKgQ998fy/Hjq4HO\nKfGYGJmnKYQvy0qhf8j0ZSvpjGdP1gr9I484Pw/hNhITEwkIuPNf+MYN+O67IqQu8gCtWiGE8GG2\nFvoXHJqFyNzOnZYxKfQ+69SpUzRs2JC33nqLdu3aAbBuHcSZzaQpWBCeeMIFCQoh3Iat+9HPdnQi\nIgNaW7+ir1bN+bkIu62I4AAAIABJREFUlzt+/Dh16tRh3759dOzYkcDAQGJiYqwuymjRAvz9nZ+j\nEMJ9uHwynlKqkVLqH6XUAaXUmxkc10YppZVSUc7Mzy0cOWJMxkstOBgqV3ZNPsJljh49Sq1atdi3\nbx8AWmtu3rxJYiIsX255fEyMkxMUQrgdmwu9UuplpdRApVS6i3SUUkGmY3raeE5/YBLQGLgP6KCU\nus/KcbmBPmStaY/3sHY1X6WKrJfyMYcOHaJWrVocPHgQgICAAObPn0/79u3ZuhUuXEh7fFiY0RlZ\nCOHbbCr0SqknMLaoDdZaJ6R3nNY6HggCPlFKPWrDqasBB7TW/5peOw9oaeW4t4ExGLvi+R6ZiOfz\n9u/fT82aNTlsWmIZFBTEokWLaNOmDQCLF1u+plEjCA21jAshfIutV/TPA9cwtqrNzHjgKtDFhmMj\ngf9SPT5miqVQSj0M3K21Xmlbql7I2kQ8uT/vM/766y9q1arFsWPHAAgODmbp0qU0b94cMKZwWLs/\nL8P2QgiwfdZ9DSBWa30tswO11teVUrGm1+SIUsoP+ADz9ULWj+0GdAMoUaJETt/afSQlwc8/W8bl\nit4n/PHHH9StW5czZ4wO1KGhoSxfvpy6deumHPPrr3D0aNrX+ftD06bOzFQI4a5svaIvAezPwnkP\nmF6TmePA3akeFzfFbssN3A9sVkodBh4DllmbkKe1nq61jtJaRxUqVCgLqbq5v/+Ga2afryIioHx5\n1+QjnOby5ctpinx4eDirV69OU+QBvvjC8rW1akH+/M7IUgjh7mwt9P5krQGOtvHcPwHllVKllVJB\nQHtgWcpJtL6stS6otS6ltS4F7ABaaK13ZSEXz2bt/nxUlNH+Vni1iIgIRo0aBUDu3LlZt24dtcxa\nHv/+O3z8seVrpUmOEOI2W4fuzwJls3DessC5zA7SWicqpXoBazE+THymtf5TKTUS2KW1XpbxGXyA\nrJ/3aV27dgXgwQcfpJrZv3tyMnTvDomJaV8TFASmOXpCCGFzof8JqK+UitBaX87oQKVUBFAf2JDR\ncbdprVcBq8xiQ9M5NtqmbL2JdMTzKVprlFlj+tvF3tyMGbB9u2V80CC46y5HZCeE8ES2jv9+DeTB\nWPOemU8w7q1/nd2khMmtW7Bnj2VcCr1Xio2NpW7duly5ciXTY0+dgjfesIxXqABvptt2Sgjhi2y9\nov8W+AGjoc3dGOvavzOtfcd0f70mxr6YNYBtWutvHZCvb9mzBxLM2hbcdRdERlo/XnistWvXEhMT\nw82bN2ncuDFr164lV65cKc/v2WN0vrs9L3PbNrhsZWxt2jSjaaIQQtxma697rZRqg3EvvYbp10Sl\n1HnTIQVM51LAHqCtA3L1Pek1ypE9R73KihUraNOmDfHx8QAcOXKEM2fOpBT69euhSRPLe/Hm/t/e\nnUZHVWV/H//uJAQEZR5UEBDaWVEgotAqEGSQVnFARUENoiIOrUC3aDtrO/CgIi5BJlGwxRZUEFAU\nBYR2FqEFh8Y/CMoggsyIDEnO8+IWoZKqJFWpKVX5fdbKSmrfUzebS2DnnHvuOX37QocOMU5WRJJO\nyFO3nXO/Am2B+/AWuakEHO77qOSL3Qu0c85tjH6qFcz+/d5N2KI0ES+lTJs2jUsuuaSgyDdu3JgF\nCxbQrFmzgjYPPFB6ka9bF4YNi2WmIpKswtmPHufcH8CjwKNm1gg4MOXnF+fc2mgnV6GNGBH8/vxZ\nZ8U/F4mJ1157jd69e5OXlwdAs2bNmDdvHk2aNClos3598Al3RT31FNSpE6tMRSSZhVXo/fkKu4p7\nLKxe7XXjimrZEs6OeMFBKQdefvllcnJyyM/PB+DYY49l7ty5NGrUqFC7GSE8YHr11d6HiEgwIRV6\nMzsHWO2c+7nUxl77FsBpzrlJkSRXITkHt9wCu3cXjqelwdixWignBUyYMIHrr78e57w1qE488UQ+\n+OADjgjyTFywzWq6d/d+30tLg1NPhc6dNW1DRIoXao9+PvAQ8PCBgJkNAe50zgUbMLwYuB9QoQ9m\n3z7v0blgZs2Cd94JjN96q7ciniS1119/nX79+hW8PuWUU/jggw+oX79+QNtt22DevMBzPPignrAU\nkdCFWuiD9ReqADWjmEvq27IFcnJg9uzSZ1f5a9gQHnkkZmlJ/HTq1ImWLVuyZMkSWrVqxZw5c6hT\nzM
31YD8mDRtC69ZxSFREUobGgePpnnu8h6HDKfLgLWZevXpscpK4qlWrFnPmzCEnJ4e5c+cWW+Sh\n+K1ndfdGRMJR5sl4Eibn4M03w39fjx7aoSTF1K1blxdffLHENnv3Br+Doz3mRSRc6hvEy5o1sDHM\n5QWqVw++NZkkBeccDzzwAJMmhT9VZe7c4LsTF9m8TkSkVOrRx0uwzWnS0+GQQwLjZt506uHD4aij\nYp+bRJ1zjrvvvpuhQ4eSlpZGZmYmvXr1Cvn9wYbtzz8fKlWKYpIiUiGE06MPZz96KSrYcrY33ww7\ndwZ+7NgB//mPZtknKeccgwYNYujQoQDk5+czefLkgsfpSpOXB2+9FRjXsL2IlEU4PfoHzezBokEz\ny4teOilM281WCPn5+dx2222MGjWqIHbhhRcyZcqUgO1ni/P554F3eSpXhm7dopmpiFQU4RT6cJfk\n0AjAAXl58NVXgXEV+pSSn59P//79GT9+fEHs0ksvZfLkyWRmZoZ8nmCL5HTuDH6b2YmIhCzU3es0\naS8Sy5d7Q/L+qlf3Ng+XlJCXl8d1111XaOLdlVdeyaRJk8jICP336bVrva1mi9KwvYiUlQp4PAS7\nP5+VpQeiU0Rubi5XX311oSJ/7bXX8vLLL4dV5AFuvz3wd8L0dLjggmhkKiIVkSpNPAQr9NpuNiU4\n5+jTpw+vvvpqQez6669nwoQJpKenh3WuGTOCL7XQrx8EWSFXRCQkKvTxoIl4KcvMOP/88wsm2t18\n882MGTOGtDBHa3bt8rYzKKp+fXjiiWhkKiIVlZ6jj7W9e4PvK68efcro06cP+/fv55tvvuHJJ58M\neXa9v/vv99ZUKuqZZ6BWrSgkKSIVlgp9rC1d6u1W5+/ww73dSSRl9O3bt8zvXbwYRowIjHfpAmGs\nsSMiEpSG7mMt2P3500/XBuJJateuXdxyyy1s2bIlKufLy4P+/SE/v3C8ShUYNUo/JiISORX6WAt2\nf17D9klpx44ddOvWjVGjRtG1a1e2b98e8Tmfew4WLQqMP/AANG8e8elFRFToY664Hr0kla1bt9K5\nc2c+/vhjABYtWsSsWbMiOueaNXDvvYHxk06CwYMjOrWISAHdo4+lnTvh++8D4yr0SWXz5s107tyZ\nJUuWFMRGjBhB7969IzrvX/8auEMdwNix2rxGRKJHhT6WvvrK24feX/PmULt2YvKRsG3cuJFzzz2X\nZcuWFcRGjx5N//79Izrv9OnBd6i76SZo1y6iU4uIFKJCH0satk9qv/zyC506deJ736iMmTF+/Hiu\nu+66Qu1yc2HiRG86Rl6IWzy9/XZgrEEDePzxSLMWESlMhT6WNBEvaa1bt47s7Gx++OEHANLS0pg4\ncSJ9+vQp1M45uOoqmDo18u85YgTUrBn5eURE/KnQx5J69ElpzZo1dOjQgR9//BGA9PR0XnnlFa64\n4oqAtq+8Ep0if955cPnlkZ9HRKQozbqPlY0b4aefCsfS06Fly8TkIyGrUaMG9erVA6BSpUpMnTo1\naJHfvBkGDoz8+x1yCIwcqWfmRSQ2VOhjJVhv/qSToFq1+OciYalevTrvvvsu7dq148033+Tiiy8O\n2u7OO+G33yL7Xunp8MILcPTRkZ1HRKQ4GrqPFW1kk9Rq1qzJRx99VOy69QsWwIQJgfGuXaFnz9C+\nR+XKcNZZKvIiElsq9LGirWmTxrJly/jqq6/IyckpiOXmwtq1wYt8fr73GFxRNWrAiy/CEUfEKFER\nkTJQoY8F5zQRL0ksWbKEzp07s3nzZgBycnJ44QUYNAh27AjvXE88oSIvIuWPCn0srF4dePO2ShU4\n+eSEpCPBffHFF3Tt2pVt27YBMGjQIOrV68H114e/L+yZZ8KNN0Y7QxGRyKnQx0Kw3nzLllrXtBz5\n5JNP6NatGzt37gS8e/KzZs2hX7/wi3xGhrdsbZqmtopIOaT/mmJBC+WUawsXLqRLly4FRb5OnTrM\nnz+fuXOz+N//wj/fXXfBKadEOUkRkShRjz4WdH++3Jo7dy4XXHABf/zxBwD169dn7ty5ZGaezKOP\nBravXh1qFdPJr1XLW+TmrrtimLCISIRU6KMtL8/bzKYoFfqEe/fdd7n44ovZs2cPAEcccQTz5s3j\nuOOOp1Mn2Lu3cPv0dPjPf6BFiwQkKyISJSr00fb99/D774VjNWvCn/6UmHwqoO++g/Hj4ddfD8bW\nrp3JRx/1JD9/HwBVqzaidet5PPLIMWzfDvPnB55n8GAVeRFJfir00RZs2D4rSzO14mTlSu9y+0bm\n/ewFDmwt15Tdu+cxa1bxK9U0bQr33x+bHEVE4knVJ9o0ES+hnnkmWJEH6AlMAo4FFgAlL0c3apRW\nKxaR1KBCH22aiJcwzsH06SW1uApYCjQu8TxXXOHtJicikgpU6KNpzx74+uvAuAp9XCxeDGvXHng1\nBdgUpFXlEs9x6qnw3HNRTkxEJIF0jz6avv7aWyTd3xFHQMOGicmngjnYmx8J3Aq0oGXLeQweXCek\n9x95JLRt6y1iKCKSKlToo0n35xPKK/TDgUG+yFKqVLmT3r1fSFxSIiIJpqH7aNKOdQmzYgV8881Q\nDhZ5gDMZN+6pRKUkIlIuqNBHkwp9wtxyyyOA/xJ1Z9Gq1XucdFLNRKUkIlIuqNBHS36+160sqnXr\n+OdSgTjnuO+++5gzx/+h947AbHr2rJ6otEREyg3do4+WLVsCJ+IddljxC6VLxJxzDBkyhGHDhvlF\nOwPTgapcdFGCEhMRKUdU6KPFf73VAxo0iH8eFYRzjoEDBzJixAi/aHfgDaAKxx4Lxx+foORERMoR\nDd1Hiwp9XO3du5clS5b4RXoAbwLes3EXXQRmichMRKR8UY8+WjZuDIyp0JfJW2/BG2+Ab7v4YlTh\nsMNmUbt2V7Zta0R+/itApYKjGrYXEfGo0EeLevRRMXky9O4dauvDgPeAQ/D/UW7QAM44I/q5iYgk\nIw3dR4sKfVQ88URxR/YDs4PED6Po76s9emizQBGRA/TfYbSo0Eds5UpYtizYkX1AL7zJdqNLPc/l\nl0c3LxGRZKZCHy0q9BF7661g0b14W8y+6Xs9APik2HNccw1kZ0c9NRGRpKV79NESrNDXrx//PJJY\n4Bazf9Cw4cWsW/deQeSCCwaRk9M26Iz6Zs2gRQvNthcR8adCHy3q0Udk40b4+GP/yO/AhaxbN68g\ncvfdd/Poo49iquQiIiHT0H00OKdCH6GZM71VhD07gfOAg0X+wQcfVJEXESkD9eijYft22LevcOyQ\nQ+DQQxOTTxI6OGy/Ha/If1pw7LHHHuPuu+9OQFYiIslPhT4aiuvNq/cZkl274P33AbYCXYGDuwA+\n+eSTDB48OEGZiYgkPxX6aNCwfUTeew/27gVYDBxc1vaZZ57l9ttvS1RaIiIpQffoo0HL30bk4LB9\nJ+DfQCXOOmuMiryISBSoRx8N6tGX2f79MGuWf+RS4P+4664
mCcpIRCS1qNBHg56hL5M1a9bw6acZ\nbNt2RKH4oYc2oVOnBCUlIpJiVOijQT36sK1evZrs7Gy2basCfAgc/MXovPOgSpVEZSYiklpU6KNB\nhT4sK1euJDs7m59//tkX6Q58wYEpI9piVkQkejQZLxpU6EO2fPlyzjnnHL8iXxl4mAM/ilWqQPfu\nicpORCT1JLzQm1k3M1tuZivM7K4gxweZ2XdmttTM5ppZ+ZulpUIfku+++4727duzfv16X6QKMAOv\nR+8ZNAhq1kxEdiIiqSmhhd7M0oGReEuhnQhcaWYnFmm2BMhyzrUAXgf+X3yzDIEKfamWLl1Khw4d\n+LXgWlUF3gG6FLRp3hzuvTcR2YmIpK5E9+jbACuccz865/bhPUTdw7+Bc26+c2637+VnQKM451iy\nXbtg9+7CscxMdUv9LF68mI4dO7Jp0yYAMjIOBd4DOhZqN3q0t3KwiIhET6ILfUNgjd/rtb5YcfoB\ns2OaUbiKe7ROy98C8Pnnn5Odnc2WLVsAqFatBrm57wNnFWrXpw+ce24CEhQRSXFJM+vezPoAWUD7\nYo7fCNwI0Lhx4/glpmH7Eg0bNozt27cDUKtWLapVe5/ff29dqE2tWvDUU4nITkQk9SW6R78OOMrv\ndSNfrBAzOxe4B7jQObc32Imcc2Odc1nOuax69erFJNmggi1/q8VyCkyaNIkOHTpQt25dLr10PmvX\ntg5oM2yYLpmISKwkukf/JXCMmR2NV+B7AVf5NzCzlsAYoJtzLkhVTTD16EtUtWpVZs6cyYcfruOS\nS44LOH722dC3bwISExGpIBLao3fO5QK34s3M+h6Y4pz71sweNrMLfc2GAYcCU83sv2Y2I0HpBqdC\nX8iPP/4YEKta9VCGDj2O/fsLxytVgjFjIC3R40oiIiks4f/FOufecc4d65xr7px71Be73zk3w/f1\nuc65Bs6503wfF5Z8xjhToS/w1ltvccIJJzB8+PBC8RdegI8+Cmx/111wwglxSk5EpIJKeKFPeir0\nAEydOpWePXuyb98+Bg0axEsvvQR4l+fOOwPbH3MM/OMf8c1RRKQiUqGPlAo9kydPplevXuTm5gLQ\nvHlzsrOzAW+lu23bAt8zerQ2rhERiQcV+khV8EI/ceJE+vTpQ35+PgDHHXccCxcupHHjxrz3Hkye\nHPiea64B3+8BIiISYyr0karAhX7cuHH07dsX5xwAJ510EgsWLODII49k924YMCDwPXXq6Jl5EZF4\nUqGPxJ49sGNH4VhamlfNUtzIkSO58cYbC4r8qaeeyvz582ng+yXnn/+EVasC3zdsGNStG89MRUQq\nNhX6SATrzderl/LPiz399NPceuutBa+zsrKYN28eBxYqWrbMK+hFdegAOTnxyVFERDypXZFirQIO\n269evZp/+E2XP/PMM/nggw+oXbs2APn50L8/+OblFcjM9CbgaQsAEZH4UqGPRLDlb1O80Ddt2pQ3\n33yTSpUqcfbZZzNnzhxq1KhRcHzcOPj008D3/eMfcFzgwngiIhJjiV4CN7lVwB49QPfu3Xn//ffJ\nysqiWrVqBfFffoEhQwLbH3ustziOiIjEn3r0kagAhd45V7DFrL/27dsXKvIAAweCb6O6QsaMgcqV\nY5WhiIiURIU+Eile6J1z3HHHHbRp04b169eX2Hb2bHjttcB4To43CU9ERBJDhT4SKVzo8/PzGTBg\nAM8++ywrV64kOzub3377LWjb33+Hm28OjNetC08+GeNERUSkRLpHH4kULfR5eXnccMMNvPjiiwWx\nU089tdCkO38PPwyrVwfGn3qqQiwpICJSrqnQRyIFC/2qVbl07tyXlSv/VRCrXbsPK1a8SNu2wX9c\n/vvfwFh2Nlx9dayyFBGRUKnQRyJYoa9fP/55RMmmTfs5+eQ+7N49xS/aly1bxrFlS3rI56lcGZ5/\nXs/Mi4iUB7pHX1a7dkGQ2ej4VodLNvv27eOMM64oUuT7A+OB0Is8wD33eI/UiYhI4qnQl9XixYGx\npk2hUqW4pxKpPXv20LHjJaxaNc0vehvwPOH+iBx/fPD950VEJDFU6Mvqyy8DY23axD+PKHjiiWF8\n8snbfpG/ASOA8MbemzaF6dP1zLyISHmie/Rl9cUXgbHTT49/HlGQkfF34GPgPeAfwD8B44EH4Pzz\nQztHtWreErcpvp+PiEjSUaEvqyTt0TsHy5fD9997r//4Ax57rAowDZgKXA0YrVvDffdBeni350VE\npJxRoS+LTZsCN1tPS4NWrRKTT4jy8+GGG2DChD1AlSJHDwGuAbw/ypgxKvIiIqlAA61lsWhRYOzE\nE+HQQ+OfSxjGjYMJE7YAZwGPFdvur3+F1q3jlpaIiMSQevRlEWzYvpzfn9+wAf7+901AZ+Br4Csg\nE2/i3UGNGnkr3YmISGpQoS+LJJyId9NNG9i581zgW1/EgJqF2mRkwPjxcNhh8c5ORERiRYU+XM4l\n3US8V15Zz1tvZQPLfZE0YAItWlxLs2ZepH596NMHzj47QUmKiEhMqNCH6+efYePGwrHMTDjllMTk\nU4offlhDTk42sMIXSQdepk6dK5k719thTkREUpcKfbiC9eZbtvSKfTmzatUq2rTJJjd3tS+SAbwK\n9OTpp1XkRUQqAs26D1eS3J9fsWIF7dq1Z/v21b5IJeB1oCcdO2pnORGRikI9+nAlwf353Nxczjvv\nPDZsWOOLVMZbEOc8MjNh9GjtLCciUlGoRx+OvDz46qvAeDnr0WdkZNC9+/N4Bf4QYBZwHqCd5URE\nKhr16MOxfDns3Fk4Vr16uauc69fDSy+dC0zHWwGvA+DtLDdkSAITExGRuFOhD0ewYfusrHKxk0t+\nfj5pvjzuuAN27ADoVqjNmDHaWU5EpKJJfIVKJuVwIt6+ffD8859x9NEtGTHiJx5/HKZODWx33XVw\nzjnxz09ERBJLPfpwlLOJeNu3w5///B++/bY7sIs77sgGFgCNCrWrVw+GDUtEhiIikmjq0Ydq3z74\n+uvAeAJ79FddNY9vv+0G7PJFdgBbA9o9/TTUrh3PzEREpLxQoQ/VunVesfdXt663C0wCPP30HN55\n5y/Abl+kAfAhUHiFvk6doHfv+OYmIiLlhwp9qNatC4w1aZKQB9KnT3+bv/3tAmCPL3Ik3pD9SYXa\n1avnTcDTM/MiIhWX7tGHKlihb9gw7mlMmzaNyy67Auf2+yKNgXlAczp39oo7QNOm3gS85s3jnqKI\niJQjKvShWr8+MHbkkXFNYcqUKVx11VXk5eX5IkcD84EmtGkDs2dDenpcUxIRkXJOhT5UCe7Rf/jh\nh1x55ZXk5+f7Isfg9eQbkZ7uDdGryIuISFG6Rx+qBBf6du3acdpp5/tenYD/Y3QDB8Jpp8UtFRER\nSSIq9KEKNnQfx0K/c2cmP/00Bbgdb3b9EYA3H/DBB+OWhoiIJBkN3YcqWI8+jvfohwyBzZsrA88U\nio8cCdWqxS0NER
FJMurRh8K5uA/dP/XUU9x3330ALFwIL7wQ2Oayy+Avf4lZCiIikgLUow/F1q2w\nZ0/h2CGHQM2aMfl2jz32GPfccw8AaWmZTJlyX0Cb6tXhmWcCwiIiIoWo0IeiuEfryrASza+/woIF\nB3aXK8w5x8yZDzFz5kMFsbFjP2DDhiFAZqG2TzwR96f7REQkCanQhyJKw/affQZduwYv8uCAe4DH\n/WLZbNgwg6JF/owzoH//sL+9iIhUQCr0oYhCof/9d+jVq6Qi/zfgab9YV2AacEihlhkZMHYspGl2\nhYiIhECFPhRReLTuoYfgp5+CHcnHe2TuOb/Y+cBUoEpA68GDoUWLsL61iIhUYCr0oYjw0bqvv/a2\nig2UDwwAxvrFLgFepehwPUCrVnD//SF/WxERERX6kEQwdJ+XBzfe6H32V7kyNGt2O99/f7DIN2vW\ni/btJ5GWVqlQWzM4+WS49lqoWjXs7EVEpAJToQ9FBIV+9Gj44ovA+L33Qvv2l9Ot2wR2797N1Vdf\nzYQJE8jI0F+JiIhEj6pKKEK8R791KwwaBHPnwt69XmzLlsC3nnAC3HknZGaezcyZM3njjTd49tln\nSdeuNCIiEmXmnEt0DlGXlZXlFi1aFJ2T7d/vjbMXvU579nhxP1ddBa++WvopFy6Es8+OTnoiIiJm\n9pVzLivYMT2kVZoNGwKLfN26AUV+yxaYMqWkE+0BruOyy1aoyIuISNyo0JcmxGH7t98OnHB30G7g\nQuBFPvkkm9WrV0cvPxERkRKo0JcmxEfrpk0r7gS7gL8A7/tOt4ZpxTcWERGJKhX60oQw4373bnj3\n3cBmr766gzZtuuHtH+95+OGHGThwYHRzFBERKYZm3ZcmhEL/wQfwxx+Fm9SqtY3hw7vxxRefF8SG\nDh3KnXfeGYssRUREglKhL00I9+inTy/aYDNpaV344ovFBZHhw4dzxx13RD8/ERGREqjQl6aUe/S5\nuTBjhv/BjUBnNm9eWhAZNWoUAwYMiFmKIiIixVGhL00pQ/cffwybNx94tRnoCHwHgJkxbtw4+vXr\nF+ssRUREgtJkvNKUMnRfeNi+BnASAGlpaUycOFFFXkREEko9+pLs3Ol9+KtUCerUAbx1dAoX+gzg\nFU4/HQYNuoRevXrFK1MREZGgVOhLUtz9+TRvIGTpUii69k16eiXeeec16ta12OcnIiJSCg3dl6SU\n+/Pjx/8f8ChwcIncc85BRV5ERMoN9ehLUsL9+WXLvuf557OBDcDveAXfuOiiOOYnIiJSCvXoS1LM\n0P2yZcto1649eXkbfMFngFUA9OgRt+xERERKpUJfkiCFfgnQvn1Hdu3a5ItUA2YDzWjXDpo0iWN+\nIiIipdDQfUmKDN1/CXQZN45tu3f7ItXxinw7zODJJ+Ocn4iISCkS3qM3s25mttzMVpjZXUGOVzaz\n13zHPzezpnFLzq9H/ylwLvgV+Zp4O9K1A+Cmm6Bt27hlJiIiEpKEFnozSwdGAucBJwJXmtmJRZr1\nA7Y65/4EDAeGxiO3bdtgx/+8Qr8Q6ALsKDhaG5gLtAHg8MPh8cfjkZWIiEh4Et2jbwOscM796Jzb\nB/wbKDqdrQcw0ff160AnM4v582tbfsun6vZfmIf3W8iugiN18badbVUQefZZqFEj1hmJiIiEL9GF\nviGwxu/1Wl8saBvnXC6wHagT68TSN28kgzwOBdJ9sQYYsAA4paBd9+7Qs2essxERESmbRBf6qDGz\nG81skZkt2rRpU+lvKEX6Bm/Yvg3wLnA88ALN8O4weKpWhZEjIfbjCyIiImWT6EK/DjjK73UjXyxo\nGzPLwNs5ZnMsKE82AAAIu0lEQVSRNjjnxjrnspxzWfXq1Ys4sYxfD6bRDvgGqESzgpiZV+SbNo34\nW4mIiMRMoh+v+xI4xsyOxivovYCrirSZAVyLN/G9JzDPOeeIscrtWvPqBZOpvnMdNXato/rOdeQ3\nOI0hbb2efJcucOaZsc5CREQkMgkt9M65XDO7FXgP71b4BOfct2b2MLDIOTcDeAF42cxWAFvwfhmI\nuVonN+TKGVcWirUAusXjm4uIiERJonv0OOfeAd4pErvf7+s9wGXxzktERCQVJPoevYiIiMSQCr2I\niEgKU6EXERFJYSr0IiIiKUyFXkREJIWp0IuIiKQwFXoREZEUpkIvIiKSwlToRUREUpgKvYiISApT\noRcREUlhKvQiIiIpTIVeREQkhanQi4iIpDAVehERkRRmzrlE5xB1ZrYJ+CmKp6wL/BbF81VUuo6R\n0zWMnK5h5HQNIxfta9jEOVcv2IGULPTRZmaLnHNZic4j2ek6Rk7XMHK6hpHTNYxcPK+hhu5FRERS\nmAq9iIhIClOhD83YRCeQInQdI6drGDldw8jpGkYubtdQ9+hFRERSmHr0IiIiKUyF3o+ZdTOz5Wa2\nwszuCnK8spm95jv+uZk1jX+W5VsI13CQmX1nZkvNbK6ZNUlEnuVZadfQr92lZubMTLOfgwjlOprZ\n5b6fx2/NbHK8cyzvQvj33NjM5pvZEt+/6e6JyLO8MrMJZrbRzL4p5riZ2bO+67vUzFrFJBHnnD68\n2xfpwEqgGZAJfA2cWKTNzcBo39e9gNcSnXd5+gjxGnYEqvq+HqBrGP419LU7DFgIfAZkJTrv8vYR\n4s/iMcASoJbvdf1E512ePkK8hmOBAb6vTwRWJzrv8vQBnAO0Ar4p5nh3YDZgwJnA57HIQz36g9oA\nK5xzPzrn9gH/BnoUadMDmOj7+nWgk5lZHHMs70q9hs65+c653b6XnwGN4pxjeRfKzyHAI8BQYE88\nk0sioVzHG4CRzrmtAM65jXHOsbwL5Ro6oLrv6xrA+jjmV+455xYCW0po0gOY5DyfATXN7Iho56FC\nf1BDYI3f67W+WNA2zrlcYDtQJy7ZJYdQrqG/fni/zcpBpV5D3/DeUc65t+OZWJIJ5WfxWOBYM/vY\nzD4zs25xyy45hHINHwT6mNla4B3gtvikljLC/T+zTDKifUKRUJhZHyALaJ/oXJKJmaUBTwM5CU4l\nFWTgDd93wBtZWmhmpzjntiU0q+RyJfCSc+4pM2sLvGxmJzvn8hOdmBykHv1B64Cj/F438sWCtjGz\nDLyhqs1xyS45hHINMbNzgXuAC51ze+OUW7Io7RoeBpwMfGhmq/Hu683QhLwAofwsrgVmOOf2O+dW\nAT/gFX7xhHIN+wFTAJxznwJV8NZwl9CE9H9mpFToD/oSOMbMjjazTLzJdjOKtJkBXOv7uicwz/lm\nVAgQwjU0s5bAGLwir3uigUq8hs657c65us65ps65pnjzHC50zi1KTLrlVij/nqfj9eYxs7p4Q/k/\nxjPJci6Ua/gz0AnAzE7AK/Sb4pplcpsBXOObfX8msN0590u0v4mG7n2cc7lmdivwHt5s0wnOuW/N\n7GFgkXNuBvAC3tDUCrwJFr0Sl3H5E+I1HAYcCkz1zWP82Tl3YcKSLmdCvIZ
SihCv43tAFzP7DsgD\n/u6c0widT4jXcDAwzswG4k3My1Hn5yAzexXvl8m6vnkMDwCVAJxzo/HmNXQHVgC7gb4xyUN/JyIi\nIqlLQ/ciIiIpTIVeREQkhanQi4iIpDAVehERkRSmQi8iIpLCVOhFpERmluPbJS8n0bmISPhU6EWk\nXDKzD81Mz/+KREgL5ohIaabhrcAX9RW7RCT2VOhFpETOue14OzWKSBLS0L1IBWNmTX333F8ys+PN\nbLqZbTGz383sIzPrUqR9oXv0ZlbFzLaZ2Ubf5k7BvsfzvvecXyTeycze9X2/vWb2g5k9YWY1iuaH\nb2dD33kOfHwY5cshkvJU6EUqrqOBT4HaeBsNTQVaA7PN7Iri3uSc2wO8BtQDzit63MwqA1cAvwLv\n+sX7A+8Df8bbUGY43p4RQ4BPzKymr+k24CHgJ9/rh/w+XirTn1SkAtNa9yIVjJk1BVb5Xj7pnPu7\n37EsvOK/C2jinNvh68m/CPR1zr3ka9cW+AR4wznXs8j5L8PbuvRp59xgX6wJ3jawe4E2zrn/+bUf\nBQwAxjnnbvSLfwi0d85ZtP7sIhWRevQiFdd24GH/gG+721eAmsDFxb3Rt/f4D8AFZla7yOEDWzlP\n9Iv1ATKB5/yLvM89wE7gat9ogIhEkQq9SMW12Dm3M0j8Q9/nlqW8fyJe8S7YrtnMGgBdgSXOuaV+\nbVv5Ps8rehLn3FZgCd5e5seHlLmIhEyFXqTi+rWY+Abf5xrFHD9gEpDPwR48QG+8p3kmFml74FzF\nPaJ3IF6zmOMiUkYq9CIVV4Ni4of7Ppf4SJ1zbi1eD72NmR3oiV8L7AcmF2l+4FyHE9wRoXxPEQmf\nCr1IxdXKzA4LEu/g+7wkhHO85Pt8rZmdBrQAZjvnNhVpd+BcHYrE8c22Pw3YA3zvdyjPdzw9hDxE\npBgq9CIVVw3gfv+Ab9Z9b7ye9bQQzvEmsANvsl2OL/ZSkHb/wuvp32Zmfypy7BGgOvAv59xev/hm\n3+fGIeQhIsXQyngiFddC4HozOwP4GG/4/Aq8DkB/59yO0k7gnPvDzKYC/YCb8Yrz20HarTazO4CR\nwGIzmwJswlsUpy3wP7zn6f3NBS4D3jSzd4A/gJ+ccy+X5Q8rUlGpRy9Sca0C2gFbgZuAy4HFQHfn\n3GthnOcl3+dKwKvOuX3BGjnnRuHNyP8MuBQYBNQHhgFtnXNbirxlPPA43sjDnXg9/35h5CUiaMEc\nkQrHb8Gcic65nIQmIyIxpx69iIhIClOhFxERSWEq9CIiIilM9+hFRERSmHr0IiIiKUyFXkREJIWp\n0IuIiKQwFXoREZEUpkIvIiKSwlToRUREUtj/B29qUHmfUqIQAAAAAElFTkSuQmCC\n", "text/plain": [ "
" ] @@ -237,15 +214,6 @@ "results = pd.concat(dfs)\n", "pivot_plot(results, fig=fig);" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] } ], "metadata": { From f01f3c0cbcbd4c6a8ff351c45a94be04a9c8a310 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 25 Sep 2019 08:52:05 -0700 Subject: [PATCH 004/187] BF: fixing seed in some tests that can fail randomly --- selectinf/algorithms/tests/test_lasso.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/selectinf/algorithms/tests/test_lasso.py b/selectinf/algorithms/tests/test_lasso.py index d5ab6a38c..172535b10 100644 --- a/selectinf/algorithms/tests/test_lasso.py +++ b/selectinf/algorithms/tests/test_lasso.py @@ -30,6 +30,7 @@ except ImportError: statsmodels_available = False +@set_seed_iftrue(True) def test_gaussian(n=100, p=20): y = np.random.standard_normal(n) @@ -64,6 +65,7 @@ def test_gaussian(n=100, p=20): np.dot(L.constraints.linear_part, L.onestep_estimator), L.constraints.offset) +@set_seed_iftrue(True) def test_sqrt_lasso(n=100, p=20): y = np.random.standard_normal(n) @@ -91,7 +93,7 @@ def test_sqrt_lasso(n=100, p=20): np.dot(L.constraints.linear_part, L.onestep_estimator), L.constraints.offset) - +@set_seed_iftrue(True) def test_logistic(): for Y, T in [(np.random.binomial(1,0.5,size=(10,)), @@ -118,6 +120,7 @@ def test_logistic(): return L, C, P +@set_seed_iftrue(True) def test_poisson(): X = np.random.standard_normal((10,5)) @@ -139,6 +142,7 @@ def test_poisson(): return L, C, P +@set_seed_iftrue(True) @dec.skipif(not statsmodels_available, "needs statsmodels") def test_coxph(): From a1879b2774cf9fc000ef615bf7712872cf0cb5a5 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 25 Sep 2019 09:56:27 -0700 Subject: [PATCH 005/187] added Lee and ROSI examples to docs --- doc/source/algorithms/LASSO.Rmd | 111 +++++++++++++++++ doc/source/algorithms/LASSO.ipynb | 194 ++++++++++++++++++++++++++++++ doc/source/algorithms/ROSI.Rmd | 110 +++++++++++++++++ doc/source/algorithms/ROSI.ipynb | 186 ++++++++++++++++++++++++++++ doc/source/algorithms/index.rst | 2 + selectinf/algorithms/api.py | 1 + 6 files changed, 604 insertions(+) create mode 100644 doc/source/algorithms/LASSO.Rmd create mode 100644 doc/source/algorithms/LASSO.ipynb create mode 100644 doc/source/algorithms/ROSI.Rmd create mode 100644 doc/source/algorithms/ROSI.ipynb diff --git a/doc/source/algorithms/LASSO.Rmd b/doc/source/algorithms/LASSO.Rmd new file mode 100644 index 000000000..770d31eda --- /dev/null +++ b/doc/source/algorithms/LASSO.Rmd @@ -0,0 +1,111 @@ +--- +jupyter: + jupytext: + cell_metadata_filter: all,-slideshow + formats: ipynb,Rmd + text_representation: + extension: .Rmd + format_name: rmarkdown + format_version: '1.1' + jupytext_version: 1.1.1 + kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +# Conditioning on signs and active set + +One of the first works in this line of conditional inference +is [Lee et al.](projecteuclid.org/euclid.aos/1460381681) which +considers the LASSO (squared-error loss) and conditions +on the active set and their signs. 
+ + +```{python collapsed=TRUE} +import numpy as np, pandas as pd +import matplotlib.pyplot as plt +import statsmodels.api as sm +# %matplotlib inline + +from selectinf.tests.instance import gaussian_instance # to generate the data +from selectinf.algorithms.api import lasso + +``` + +We will know generate some data from an OLS regression model and fit the LASSO +with a fixed value of $\lambda$. In the simulation world, we know the +true parameters, hence we can then return +pivots for each variable selected by the LASSO. These pivots should look +(marginally) like a draw from `np.random.sample`. This is the plot below. + +```{python} +np.random.seed(0) # for replicability + +def simulate(n=500, + p=100, + s=5, + signal=(5, 10), + sigma=1): + + # description of statistical problem + + X, y, truth = gaussian_instance(n=n, + p=p, + s=s, + equicorrelated=False, + rho=0., + sigma=sigma, + signal=signal, + random_signs=True, + scale=False)[:3] + + sigma_hat = np.linalg.norm(y - X.dot(np.linalg.pinv(X).dot(y))) / np.sqrt(n - p) + L = lasso.gaussian(X, y, 2 * np.sqrt(n), sigma=sigma_hat) + soln = L.fit() + active_vars = soln != 0 + + if active_vars[truth != 0].sum() == s: # ensure we have screened for ease of interpretation + projected_truth = np.linalg.pinv(X[:, active_vars]).dot(X.dot(truth)) + S = L.summary(truth=projected_truth) + S0 = L.summary() + + pivot = S['pval'] # these should be pivotal + pvalue = S0['pval'] + return pd.DataFrame({'pivot':pivot, + 'pvalue':pvalue}) +``` + +Let's take a look at what we get as a return value: + +```{python} +while True: + df = simulate() + if df is not None: + break +df.columns +``` + +```{python collapsed=TRUE} +dfs = [] +for i in range(200): + df = simulate() + if df is not None: + dfs.append(df) +``` + +```{python} +results = pd.concat(dfs) +import statsmodels.api as sm +thresh = 0.001 # POSSIBLE BUG? several very small pivots -- fine for pvalues +grid = np.linspace(0, 1, 101) +fig = plt.figure(figsize=(8, 8)) +plt.plot(grid, sm.distributions.ECDF(results['pivot'][results['pivot'] > thresh])(grid), 'b-', linewidth=3, label='Pivot') +plt.plot(grid, sm.distributions.ECDF(results['pvalue'])(grid), 'r-', linewidth=3, label='P-value') +plt.plot([0, 1], [0, 1], 'k--') +plt.legend(fontsize=15); +``` + +```{python collapsed=TRUE} + +``` diff --git a/doc/source/algorithms/LASSO.ipynb b/doc/source/algorithms/LASSO.ipynb new file mode 100644 index 000000000..7e505805f --- /dev/null +++ b/doc/source/algorithms/LASSO.ipynb @@ -0,0 +1,194 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Conditioning on signs and active set\n", + "\n", + "One of the first works in this line of conditional inference\n", + "is [Lee et al.](projecteuclid.org/euclid.aos/1460381681) which\n", + "considers the LASSO (squared-error loss) and conditions\n", + "on the active set and their signs.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import numpy as np, pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import statsmodels.api as sm\n", + "%matplotlib inline\n", + "\n", + "from selectinf.tests.instance import gaussian_instance # to generate the data\n", + "from selectinf.algorithms.api import lasso\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will know generate some data from an OLS regression model and fit the LASSO\n", + "with a fixed value of $\\lambda$. 
In the simulation world, we know the\n", + "true parameters, hence we can then return\n", + "pivots for each variable selected by the LASSO. These pivots should look\n", + "(marginally) like a draw from `np.random.sample`. This is the plot below." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(0) # for replicability\n", + "\n", + "def simulate(n=500, \n", + " p=100, \n", + " s=5, \n", + " signal=(5, 10), \n", + " sigma=1): \n", + "\n", + " # description of statistical problem\n", + "\n", + " X, y, truth = gaussian_instance(n=n,\n", + " p=p, \n", + " s=s,\n", + " equicorrelated=False,\n", + " rho=0., \n", + " sigma=sigma,\n", + " signal=signal,\n", + " random_signs=True,\n", + " scale=False)[:3]\n", + "\n", + " sigma_hat = np.linalg.norm(y - X.dot(np.linalg.pinv(X).dot(y))) / np.sqrt(n - p)\n", + " L = lasso.gaussian(X, y, 2 * np.sqrt(n), sigma=sigma_hat)\n", + " soln = L.fit()\n", + " active_vars = soln != 0\n", + " \n", + " if active_vars[truth != 0].sum() == s: # ensure we have screened for ease of interpretation\n", + " projected_truth = np.linalg.pinv(X[:, active_vars]).dot(X.dot(truth))\n", + " S = L.summary(truth=projected_truth)\n", + " S0 = L.summary()\n", + "\n", + " pivot = S['pval'] # these should be pivotal\n", + " pvalue = S0['pval']\n", + " return pd.DataFrame({'pivot':pivot,\n", + " 'pvalue':pvalue})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's take a look at what we get as a return value:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['pivot', 'pvalue'], dtype='object')" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "while True:\n", + " df = simulate()\n", + " if df is not None:\n", + " break\n", + "df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "dfs = []\n", + "for i in range(200):\n", + " df = simulate()\n", + " if df is not None:\n", + " dfs.append(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAeMAAAHSCAYAAADfUaMwAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0\ndHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nOzdd1iV5RvA8e8roDgQFy4UR6GZKxO3\npgnuPSFzZcNylWlqpblN0zRnmubItMy9N6KVObCs3CaOHD8V3IoDeH5/PB4PR0hR4bznHO7PdXHF\n+7xvnBtF7vOs+zGUUgghhBDCPGnMDkAIIYRI7SQZCyGEECaTZCyEEEKYTJKxEEIIYTJJxkIIIYTJ\nJBkLIYQQJnM364Vz5MihChYsaNbLCyGEEHa1Z8+eSKWUT2L3TEvGBQsWJDw83KyXF0IIIezKMIyT\n/3VPhqmFEEIIk0kyFkIIIUwmyVgIIYQwmSRjIYQQwmSSjIUQQgiTmbaaOimuXbvGhQsXuHfvntmh\niEfw8PAgZ86cZM6c2exQhBDCKTlsMr527Rrnz5/H19eX9OnTYxiG2SGJRCiliI6O5syZMwCSkIUQ\n4ik47DD1hQsX8PX1JUOGDJKIHZhhGGTIkAFfX18uXLhgdjhCCOGUHDYZ37t3j/Tp05sdhkii9OnT\ny3SCEEI8JYdNxoD0iJ2I/F0JIcTTc+hkLIQQQqQGkoyFEEIIk0kyTkGDBg3CMIwHH3nz5qVFixYc\nO3YMgI4dOxIQEJDsr3vkyBEGDRrElStXkv1rCyGESH4Ou7XJVXh7e7Nu3ToAIiIiGDBgAIGBgezf\nv58BAwYQHR2d7K955MgRBg8eTMeOHcmSJUuyf30hhBDJ67HJ2DCMmUBD4IJSqkQi9w1gPFAfuAV0\nVEr9ntyBOit3d3cqVqwIQMWKFfHz86NatWqsWbOGVq1amRydEEIIR5CUYerZQN1H3K8H+N//eAf4\n+tnDcl1ly5YF4MSJEzbD1MePH8cwDFavXm3zfGxsLLlz56Z///4P2kJDQ6lQoQKenp7kypWLLl26\ncOPGDQDCwsJo1KgRAIUKFcIwDAoWLGiH70wIIVzH1atwcuf/ICbGLq/32GSslNoGXHrEI02A75S2\nA8hiGEae5ArQ1Zw4cQKA3Llz27QXKlSI8uXL89NPP9m0b926lfPnzxMSEgLA/v37qVu3Ljly5GDx\n4sUMHjyY+fPn07JlSwBefvllxowZA8CSJUv47bffWLp0aQp/V0II4SKU4u7mn9njH0Leivk59tVK\nu7xscswZ+wL/xrs+fb/tXDJ8bRuOsJVVqSf/f2Luv7OKiIigS5cueHl5ERQUxObNm22eCwkJYfDg\nwdy5c4d06dIBsGDBAooXL06JEnqGYOjQoRQoUIAVK1bg5uYGQLZs2QgODua3336jUqVKFC1aFIAy\nZcpIr1gIIZLi+nX4/nvUlCmwbx+LgcLAyX5TyNi2GQ/1n5KdXVdTG4bxjmEY4YZhhF+8eNGeL22a\nqKgoPDw88PDwoGjRokRERLBgwQLy5Ek4eNC6dWuuXbv2YMFXTEwMS5YsITg4+MEzu3btolmzZg8S\nMUCLFi1wd3fnl19+SflvSAghXM2KFVC4MHTpgrFvHz2BKcAWwD//HXJnuZ3iISRHMj4D5I93ne9+\nWwJKqW+UUgFKqQAfH59keGnH5+3tze7duwkPD+f06dOcOHGCevXqJfqsr68vVatWZcGCBQBs3ryZ\nyMjIB0PUAOfOnSNXrlw2/5+bmxvZs2fn0qVHzSYIIYSwcfs2dO8OTZpAZOSD5q6kpwOvcqHNn+SL\n2AaenikeSnIMU68AuhmG8SNQAbiqlEr2IWp4uiFis7m7uz/RXuLg4GD69etHdHQ0CxYsoEyZMvj7\n+z+4nydPngQHMsTGxhIVFUW2bNmSLW4hhHBpBw5ASAj8/TcAN4HPycwVhjCXjtRt7c1Hc+03PfrY\nnrFhGD8AvwFFDcM4bRjGm4ZhvGsYxrv3H1kDRAD/ANOBLikWbSrQqlUroqOjWbp0KUuXLrXpFQNU\nqFCBpUuXEhsb+6BtyZIlxMTEULVqVQDSpk0LwO3bKT+0IoQQTuXCBejZE15++UEivg6UJzvDucFk\nyhNQ05vvvoM0dpzIfWzPWCn12mPuK6BrskWUyuXMmZMaNWrQu3dvrly5QuvWrW3u9+/fnzJlytC0\naVPee+89Tp8+Td++falTpw6VKlUCeLCAa9q0aYSEhJAhQwZKlixp9+9FCCEcxuXLMGYMjB8PN28+\naL6UNh0lYvJyLu4UMJ+XXqrE0qVwfw2t3Ug5TAcUEhLCuXPnqFixYoLV0MWLF2ft2rVcuHCB5s2b\n079/f1577TUWLVr04JkCBQowZswYlixZQpUqVR7sOxZCiFTnwAE9L1ygAIwYYZOIL5Ypy3PG85yL\nOw0sJE+eYFavhsyZ7R+moUyaiA0ICFDh4eH/ef/gwYMUK1bMjhGJZyV/Z0IIhxAXB0uXwqRJEBaW\n8H6pUqghQwmakI3Q0EbAXDw8GrJ1K9wfYEwRhmHsUUoluohIalMLIYRwDUrpbUr9+8O+fQnvv/AC\nDBzInSZNmDApPaGhAMeBLEyenLKJ+HEkGQshhHBucXGwcSN89hns2mV7z81Nb13q0gVq1uTsuXOU\n9y/L2bMfAW8AWXj3XXj7bTMCt5JkLIQQwjkdOQJz58L338P9UsMPZMyo54q7doV8+QD4999/KVeu\nJufP/w94HoAqVfSaLrNJMhZCCOE8bt2CH3+EadMS9oJBL4Pu0gX69YOcOR80Hz9+nHLlahIVdQnY\nAFSiWDFYvBju7wY1lSRjIYQQju/oUZg6FWbN0tuUHpYlC7RtC337PugJW1y9epWyZatz+fINYDMQ\nQKlSemQ7Xr42lSRjIYQQjmvHDhg1CpYtS3jPwwMaNIB27fR/E9kcrBRMnuzN5csfAjWAl3j5Zdiw\nAbJnT+ngk06SsRBCCMeiFKxbp5Pw1q0J7xcuDO+9Bx07Qo4c//ll/vprP0OH3mLRonLABwBUqKC/\ndJYsKRP605JkLIQQwnEcOaLnfB86YhaA+vWhWzeoU+extSp37fqTatWCuHs3J/AX4EbNmnr7sRlF\nPR5HkrEQQgjz3bkDI0fqKll371rb3d3h9dfho4+gePEkfamwsHBq1apNTExGYBngRnAwzJlj/zKX\nSSXlMFPQoEGDMAzjwUfevHlp0aIFx44ds8vrG4bBpEmT7PJaQgjx1DZuhNKlYdAgayJOk0ZvSzp2\nDGbPTnIiXrlyB4GBgcTEeAPbAH/efx/mz3fcRAzSM05x3t7erFu3DoCIiAgGDBhAYGAg+/fvJ2PG\njCZHJ4QQJjpyBHr3hpUrbdvLldNbl8qUeaIvd/gwvPbaZOLifIBQwI9Ro3Sn2l5HIT4tScYpzN3d\nnYoVKwJQsWJF/Pz8qFatGmvWrKFVq1YmRyeEECa4eBE+
/xwmToSYGGu7l5cepn7vPV056wn89pui\nUSODmzdnAFdwd8/Ft99C+/bJG3pKkWFqOytbtiwAJx6uFnNfoUKF+OijjxK0t2rV6sF5xTdv3qRb\nt24ULVqUDBkyUKhQIbp27cq1a9ce+doFCxakd+/eNm2zZ8/GMAxu3LjxoO3SpUu888475MqVC09P\nTypXrszOnTuf5NsUQghbSsFvv+ltSPnywbhxtom4Y0fdte3W7YkS8Z078M47G6hatSJRUZFAOjJk\nyMXKlc6TiEGSsd1ZknDu3LkTvd+6dWsWLlxo03bjxg1Wr15NSEgIALdu3SI2Npbhw4ezdu1ahg4d\nSmhoaLL0tO/cuUNQUBCbNm1i9OjRLFu2DB8fH4KCgvjf//73zF9fCJHKHDsGY8dC2bJQubIuXRl/\ngVbVqhAerot55MmT5C+rFCxYAH5+q5k+vRFxcXcARY4csGUL1K2b/N9KSpJhajuIuf/uLyIigi5d\nuuDl5UVQUFCiz4aEhPDFF1+wY8eOB8PbK1eu5O7duw+SrY+PD19//bXN1y9UqBBVq1bl1KlT+Pn5\nPXWs33//Pfv27WP//v34+/sDEBQURNGiRfnyyy8ZPXr0U39tIUQq8fff8NNPulBHYqcngd7w27s3\ntGjxxBO6+/fDm2/Czp3LgNZAKWADRYtmY8UKKFLkWb8B+3OuZOwIM/BPeP5zVFQUHh4eD679/PxY\nsGABuXPnfpCkQa98dnNzo0yZMhQpUoQFCxY8SMYLFiygevXq5MqV68Hzc+fOZezYsRw9epSb8Q7L\nPnLkyDMl402bNlG2bFkKFSpkE1/16tV51PnTQohU7tw5vWR57lz488/En/H0hDZt9D7i+1N2T+q7\n7+DddyE6ehXQCggga9a1DB6chc6dHaPO9NNwrmTshLy9vdm0aROGYZA7d27y5s2LYRiEhYXx6quv\nPniuevXqhN0/BDs4OJiZM2cyduxYrl+/zrp165g4ceKDZ5cuXUr79u157733GDFiBNmyZePcuXM0\na9aM27dvP1O8kZGR7Nixw+YNhMVzzz33TF9bCOGCLOUqV6zQRxk+LF06qF1bH2PYrBlky/ZULxMd\nDT16wIwZlpaXSZOmDd27T2TQoMwOV1HrSUkyTmHu7u4EBAQkaC9btiy7d+9+cO3l5fXg8+DgYIYO\nHcovv/zC8ePHiYuLo3nz5g/uL1y4kAoVKjBlypQHbVsTKxn3EE9PT+7Gn6sBLj9UcD1btmwEBATY\nDINbpHPkTXpCCPtRCtav10U6Evvd4+mpk2/r1joRZ8r0TC8XEaFHs/fuBdgCVOOFF/KyaNGcpG4/\ndnjOlYyfcIjYkXl5eSWapAGKFy9OiRIlWLBgAcePHycoKIjs8SqaR0dHJ0iM8+bNe+xr5suXj4MH\nD9q0bdiwweY6MDCQDRs24OfnR05HOc5ECGGumzchLAx279bHFu7eDZGRCZ+rXl2vlm7ZEry9k+Wl\nt27ViTgqCuAboDNlyoxm27bez5rjHYpzJeNUJDg4mPHjx3P16lWmT59uc69WrVp07dqV4cOHU6FC\nBdasWcPmxOq4PqRZs2Z0796dESNGUK5cORYvXsz+/fttnmnfvj1Tp06lRo0a9O7dm8KFCxMVFcWu\nXbvInTs3PXv2TNbvUwjh4JYtg86d4cKFxO+7u+t54I8+ghIlkvWlp0/X08t6+cokoDslStTn11+7\nkT59sr6U+ZRSpnyULVtWPcqBAwceed8ZDBw4UGXPnv2p/t+jR48qQKVLl05duXLF5l5MTIzq1auX\n8vHxUV5eXqp58+Zqx44dClArV6588BygJk6c+OD67t27qmfPnipXrlwqS5YsqkePHmratGkKUNev\nX3/w3JUrV1SPHj1Uvnz5lIeHh/L19VXNmjVTv/zyyyNjdoW/MyHEfZcuKdW2rVJ6TDLhR5YsSvXo\nodTJk8n+0vfuKfX++/Ff7ksFqFdeaaJu376d7K9nL0C4+o+caCiThn4DAgLUo1bnHjx4kGLFitkx\nIvGs5O9MCBcQG6t7wz16wNmz1vY8efQccLly+uP55x97ctLTUErX//juO0vLGdKkKUq9evVZunRe\nootLnYVhGHuUUonOT8owtRBCCF2icuZMmDoVHq4Q2LYtTJgAWbOmeBhffRU/EUPz5r706fMbZcsW\nw93ddVOW635nQgghHu/MGfjss4SVsQBy5tQHNjRtapdQQkP11DMooD8VKuRh4cJupElT0i6vbyYp\nhymEEKnRrVswdKguVzVzpm0izp4d+vTRpa7slIhPnNCj4LGxCugNjKBUqf0YhuvsonkU6RkLIURq\ncvo0rFkDw4bBv//a3qtQQS9fbt1a7xW2k1u3dD2QqCgF9AAm0alTd6ZNG4/hCJUX7UCSsRBCuLK4\nOFi7Flatgs2b4ejRhM+ULq1PUYpXFdBerl2D4GDYu1cB7wHTeO21XsyYMTrVJGJw8GSslEpVfxnO\nzKxV+UKI/3D3rp4HHj0aDh1K/JmcOWH4cHjjjSc+Pzg5RERAo0Zw4ACAARSlbt2PmTdveKr73e+w\nydjDw4Po6GgyZMhgdigiCaKjo516y4EQLuPWLb0ieuxYvTjrYZ6eUKUK1Kmji3lkzmz/GIlfWSsG\nOAK8SP/+PRk61JRwTOewyThnzpycOXMGX19f0qdPn+reJTkLpRTR0dGcOXPG5lQpIYSd3bmjT1EY\nNgwePns8c2Z4+21o2BAqVrTrfPDDYmL0Lqm+fSEm5h7wOrCOiRMP061b0s8zdjUOm4wz33+3dvbs\nWe7du2dyNOJRPDw8yJUr14O/MyGEHZ09CytXwuefw8mTtvdy54aePXUPOJlqRT+LPXvgnXfg998B\n7gDBwHJ69PgyVSdicOBkDDohyy94IYSI5+ZN2LABNm3SG3MTmw/Omxf699dzwSb2gi2uX4cBA2Di\nRMspi7eBFsAaBg+eyGefdTM3QAfg0MlYCCEE+pCGlSth+XLYuBH+69zyHDng44/hvfdwhJMUlNKV\nNbt3t52+dnefQGzsWiZNmkaXLu+YF6ADkWQshBCOSCnd850yRSfh2NjEn0ubVi/IatBAjwHHOxvd\nTKdOQbdu+j1EfLVrw/jxPTlzpiyBgYHmBOeAJBkLIYQjuX4dZs3SSfjw4cSfKVFCL8YKCoLKlR2i\nF2yhlA69b189om7h43ONYsXeZ86ckeTOnYsXXpBEHJ8kYyGEcAQXLuhlxpMnw5UrCe9Xrqz3AjVp\nAs89Z//4kuDaNXjrLVi40La9Y8cr/P13XbZv38OePS1p0KCBOQE6MEnGQghhpn//hZEjdX3oh+eC\nvbygQwc9B/zii+bEl0R//QUtW9oW+CpeHMaMuUT//rX566+/WLRokSTi/yDJWAghzHD+PIwYoQt0\nPHxa0vPPw4cfQrt2kCmTOfE9gXnzdI84/nuJLl3g448v0rBhLQ4dOsSyZcuoX7++eUE6OEnGQghh\nTxcu6EN7x4/
X1bLiK1tWT7Y2b25KecqnsXKlfs9gqYibMSNMnw6vvQYXLijc3d1ZuXIltWrVMjdQ\nByfJWAghUppSsH27Xtm0cCE8XMioQgUYMgRq1QInqja4d69OupZE/OKLsHgxZM16nnv3spEzZ052\n7dpFmjRyWu/jyJ+QEEKklNhYmD8fXnoJqlbVn8dPxKVL667lb7/pPT9OlIjPntWHPFhWTBcqBGFh\nkCHDKapUqULnzp0BJBEnkfSMhRAiuVmqXQwYAPv3J7xfuTJ88IFeHe2EyermTWjcWB+NDLr09apV\ncOPGcWrWrMnly5cfJGORNJKMhRAiOYWG6nnf8HDb9gwZ4PXX9cqml14yJ7ZkEBcH7dvrOtOgp7YX\nLQIPj6O88kpNbt26xebNmylbtqy5gToZScZCCJEc/vkHevfW1bLiy5RJ94I//BCyZjUntmQ0bBgs\nWWK9njQJataMpWTJJty+fZvQ0FBKly5tXoBOSpKxEEI8i6tXdYYaP952PjhdOujaFfr1Ax8f8+JL\nRitWwMCB1uv334d33wVwY+bMmXh5eVG8eHGzwnNqzjdZIYQQjiA2FqZNA39/GDPGNhF36KB7yl9+\n6TKJ+NAhaNvWel2zJrRtu5dJkyYBULFiRUnEz0B6xkII8aRCQ/XQ899/27ZXrqz3EJcrZ05cKeTq\nVWjaVJfNBihQAD7+OJzatWuTKVMm2rdvL8fdPiPpGQshRFIopffu1K0LgYG2idjPD378EX75xeUS\nsWXBluXMivTpYdCg32jRIhBvb2+2bdsmiTgZSM9YCCEeJS5OL8oaNQp27rS9lzGjnhPu1cuhTk5K\nTkOG6Llii969t9G9ewNy585NaGgo+fPnNy84FyLJWAgh/su+fdCxo3Ufj0WaNLoG5IgRkDevKaHZ\nw7JlMHiw9bpXL8if/zD58uVj8+bN5HXh793eDGWpY2ZnAQEBKvzhfXhCCOEIYmJg9GgYNMj2EId0\n6XRy7t1bH+bgwg4ehPLl4cYNfV2jxjU2bsyMuzvcvn0bT09PcwN0QoZh7FFKBSR2T+aMhRAivkOH\ndOnKTz6xJuJ06XQhjxMn9ClLLp6ILQu2LIk4V65V7N1bkPDwHQCSiFOADFMLIQTo8/9GjoTPP7ft\nDZcrB3PmQLFi5sVmRzExegvTkSP6Om3apURFBfPSS6UpUqSIucG5MEnGQggRGgrvvWfNQAAeHnrC\n9KOPwD11/KqMi4M339R1prWfiIlpQ/ny5Vi3bh3e3t5mhufSUsdPmBBCPOzyZVi9Gn76SZ+cFF+F\nCvpQ3pIlzYnNBErpstnffWdp+RXDeI0qVaqwevVqvLy8zAzP5UkyFkKkHrdvw/ffww8/wNatuopW\nfJkz62Hqzp31CQiphFK6dPa0ada2Tp0q4u8/gu7du5ExY0bzgkslJBkLIVzf1avw9de6Otb584k/\n06qVri+dJ499YzOZUvDpp/qPRptLs2av8s03+XBz62tmaKmKJGMhhOu6eVP3dCdOhGvXEt4vX14v\nG27aNNUs0IpPKb1ofORIS8sE4H3y5OmGm9tEEyNLfSQZCyFc06pV+tSkU6ds2319dV3p117Tn6dS\nSukiHuPGWVpGA31o0qQZ48Z9aWJkqZMkYyGEazlzBnr0sD10F6BoUb1X+PXXIW1ac2JzEHFx0K2b\nHrnXhgP9adUqmHnz5uLh4WFidKmTJGMhhGu4ckUfZfjVV3p42iJ7dn2UYbt2uoxlKhcdrQcMZs16\n0IK39080bNiO2bNn4p5KtnE5GvlTF0I4txs3YMIEXb7yyhXbe506wRdf6IQs2LhRb6c+dgxAATG0\naZOe8eO3kjWrF26paAW5o5FkLIRwTseP6704M2ZAVJTtvZIl9aKt6tXNic3BnD+vty7Nn29pUUAv\n8uf/h2+/XYynZxYToxMgyVgI4UzOn4ft2+Hbb2HNGr0KKT5/f101KzhYhqTv++03aNgQLl2ytMSR\nNm0P7t6dTNOmPUiXTtKAI5C/BSGEY4qNhb17YcsW2LEDdu9OuDLaokABGDAAOnRINaUrk2LrVmjQ\nIP4UehzPPdeZY8dm0Lt3b7744gsMwzAzRHGf/NQKIRzHtWt6LHX9eggLSzgH/LC6dXUNx/r1U1XF\nrKTYtAkaN9YLtgB8fKBKlZ4sWzaDTz/9lKFDh0oidiCSjIUQ5jt/Xi/CmjxZV8v6L56eUKaMngt+\n802XP8rwaa1ZA82bw507+jpPHti8GW7dak+lSr706dPH3ABFApKMhRDmiYjQ25FmzrRmjvhy54bA\nQHjlFV0tq3hxfZqSSJRSek1bjx5w755u8/W9R9++yylWrCVQlrJly5oao0hckpKxYRh1gfGAGzBD\nKTXyoft+wBwgy/1n+iml1iRzrEIIV7F3L4wapU9Miouzvefvr4ee69SBF14AGUpNkhs39PkW1hXT\nUKDAHfz9W9OjxwoqVdpNQECAeQGKR3psMjYMww2YDNQCTgO7DcNYoZQ6EO+x/sBPSqmvDcN4EVgD\nFEyBeIUQzuzQIV2Kcv36hPfKloV+/aBZM5n/fUIHDkDLlnDwoLWtZMlocuRowaZNa5k8ebIkYgeX\nlLX/5YF/lFIRSqm7wI9Ak4eeUUDm+597A2eTL0QhhEtYuBDKlUuYiGvV0quNdu/WGUUS8RP5/nv9\nxxo/EXfseAsfn8aEha1j+vTpdOnSxbwARZIkZZjaF/g33vVpoMJDzwwCNhiG0R3ICAQlS3RCCOd3\n757u8Y4da21Lk0Yn3j59dI9YPLHbt/UgQ/wziNOn1/WmfXzCaNIkjFmzZtGhQwfzghRJllwLuF4D\nZiulvjQMoxIw1zCMEkopm8kgwzDeAd4B8PPzS6aXFkI4rDNn9OlIP/9sbXv+eVi0CEqXNi8uJxcR\noY9f/v13a1vRovDTT4pSpQygPocPH6Zw4cKmxSieTFKGqc8A+eNd57vfFt+bwE8ASqnfAE8gx8Nf\nSCn1jVIqQCkV4OPj83QRCyEcX2wsTJqkzwiOn4gbN9bD0ZKIn9ry5fDyy7aJODgYNmy4TPfuNdi4\ncSOAJGInk5RkvBvwNwyjkGEYaYEQYMVDz5wCAgEMwyiGTsYXkzNQIYST+OMPqFQJuneH69d1W5o0\nMGIELF0KWaQO8tO4dw9694amTa1bsT089HueSZOiaNo0kN9++41oS5UP4VQeO0ytlIoxDKMbsB69\nbWmmUmq/YRhDgHCl1AqgFzDdMIye6MVcHZV6uGisEMKl3bgBn30G48fbblcqWhS++UbvFRZP5cwZ\n3fv99VdrW4ECek1cgQIXqFkziCNHjrB8+XLq1atnXqDiqSVpzvj+nuE1D7V9Fu/zA0CV5A1NCOE0\nli/Xp9WfPm1tS5cOPvkE+vbVn4un8vPPuppWZKS1rWFDmDMHDOMyVarU4MSJE6xatYqgIFk766yk\nApcQ4ulFROiz+ZYvt22vWVMv6y1SxJy4XMTSpXr9m6U4mWW0/6OP9Odx
cd4EBQXRokULqstxkU5N\nkrEQ4sn9VwUtHx+9hen116Vy1jOaOhW6drX+8ebMqf+4q1eHkydPopSiYMGCTJgwwdxARbKQZCyE\nSLpff4Vhw2DduoT33npLJ+hs2ewflwtRCgYNgiFDrG3PP69rpRQuDBEREbz66qtkz56dPXv2yMlL\nLkKSsRDi8fbsgf79E0/CtWrBwIFQRZaNPKuYGF2We/p0a1u5crBqle4ZHzlyhJo1axIdHc3SpUsl\nEbsQScZCiMQppfcEjxoFS5bY3jMMXXVCKmglm+hoCAmBFfE2jtapo+ujZMoEBw4cIDAwkNjYWLZs\n2UKpUqXMC1YkO0nGQghbJ07ogsdz58KRI7b3DAPatoUBA/TpSiJZXLoEjRrB9u3Wtnbt4NtvrSdG\n9unTB6UUYWFhvPjii+YEKlKMJGMhhHbypC52vGxZ4vdbtNATmZIIktXp07oHfCDeOXgffQQjR+oV\n0xZz584lMjISf3kT5JKSUoFLCOHKYmLgyy91kn04EWfKBB066DnjRYskESezq1cTJuKxY+GLL3Qi\n3rVrF8HBwdy+fZusWbNKInZh0jMWIjXbvl2vGPrzT9v2evX0OGmTJpAhgzmxubiYGGjd2pqIPTx0\nIY/XXtPX27dvp27duuTIkYOoqCh8fX3NC1akOEnGQqRGf/6p531XrrRtL1FCn8lXubI5caUSSkGP\nHrBhg7Vt1ixrIt62bRv162Owzb8AACAASURBVNcnb968hIaGSiJOBWSYWojUQildrCMkBF56yTYR\np0+vJyl//10SsR1MmKALlFl89pmukwIQGhpK3bp18fPzY+vWreTLl8+cIIVdSc9YCFcWE6MLdSxf\nrueDjx+3vW8YOjkPHw6FCpkTYyqzcCH07Gm9DgnRRT4scuTIQYUKFViwYAE5c+a0e3zCHJKMhXBF\nN27AjBl6YVb8wxvia9IEhg6FkiXtG1sqde8efPopjB5tbatYUQ9PGwYcPHiQF154gVKlShEaGioF\nPVIZGaYWwpVERupqWH5+uvv1cCL28tJdsZ07dU9ZErFdnD2rz86In4ife07/FXh6wuLFiylVqhQz\nZswAkEScCknPWAhXcPs2fPWVHm6+ccP2no8PtGype8I1ashxhna2eTO0aQMXLljb6teH776D7Nnh\nxx9/pG3btlSoUIHWrVubF6gwlSRjIZyZUrB4sa4SceKE7b3ChXV7hw56gZawq7g4fabGoEH6rwn0\n3uFhw/QRz2nS6EIeHTt2pGrVqqxatQovLy9TYxbmkWQshDO6d0+PcX71lW0NRYBixfTy3JYtwV3+\niZvh4kVdNTT+1qVcueDHH/XgBOhjEDt16kSNGjVYsWIFGTNmNCVW4RjkX6oQzuTMGX2kzzffwLlz\ntveyZ9flKt95R5KwiXbu1JVDz5yxttWoAT/8ALlzW9sKFCjAmjVrqFq1Kull5CLVk3+xQjiDa9f0\nfPBXX8Hdu7b33N2hWzfdG86a1Zz4BAChofrAh1u3rG2ffAKDB1vfH02cOJFChQrRsGFDatWqZU6g\nwuHIamohHFlsrO4J+/vrgsXxE3GePHrl9IkTMG6cJGKTrV8PDRpYE3G2bLB6tX4PZUnEo0ePpkeP\nHvzwww/mBSockvSMhXAkMTG6WPHu3fpjy5aExxiWLw+9e0PTptbz9YSpVq7UU/SW90q+vnoVddGi\n1meGDRvGgAEDCAkJYc6cOeYEKhyWJGMhHMGNG7pG4pdf6sNtE5M/P4wapfcJyz5Uh7FkCQQH6/dR\nAAUK6OHqwoX1tVKKgQMHMnToUNq1a8esWbNwc3MzL2DhkCQZC2Gm6GiYOhU+/1wvwU1MhgzQrx/0\n6iUnKDmYvXv1HmJLIn7uOd0jLlDA9rlLly7x5ptvMm3aNEnEIlGSjIUww40bei74yy9tl92CXnJb\nqRKUK6c/ypeHzJnNiVP8p6tXoVUruHNHXxcponvElgOWlFJcuHCBXLlyMWHCBADSpJFlOiJxkoyF\nsKfISJg4UX9cvmx7z89Pr4ju0EG2Jjk4peCtt+Cff/R1pkywYoU1EcfFxdGtWzdWrFjBH3/8gY+P\nj3nBCqcg/+KFsIebN/Vq6DFjbPe9gO4Jf/opvP22lKp0EhMnwqJF1usZM6yLtWJjY+ncuTPffvst\nffr0IUeOHOYEKZyKJGMhUlJcHMybp+d8z561vSflKp3Szp16MbtFly56ARdATEwMnTp1Yu7cuQwY\nMIDBgwfLoQ8iSSQZC5FSdu+Grl31f+MrVUpXgmjRQoajncxff0GzZroaKUDZsjB2rPX+yJEjmTt3\nLkOHDqV///7mBCmckvwmECK5Xb0K/fvD5MnWEwJAD0d//jm0b69PCRBOJSxMH3x17Zq+9vaGhQtt\nZxZ69OhBgQIFaNeunSkxCuclvxGESC5K6YnEYsVg0iRrIk6XTveEjxyBjh0lETuhRYugTh1rIs6c\nGZYvh0KF4M6dO3z22WfcvHmTzJkzSyIWT0V6xkI8q5s3Yf58mDJFbzyNr25d3UO2VIAQTmfqVD0v\nbHlvlTs3rFsHpUtDdHQ0zZo1Y/369ZQtW5YmTZqYG6xwWpKMhXhaZ8/qilhz5uih6fhy54bx4/VG\nVFnA47RWrrRNxEWK6BrUBQvCzZs3ady4MVu2bGHGjBmSiMUzkWQsxNNYuRLeeAOiomzb06fXG1CH\nDIEsWcyJTSSLw4f1mcSWRFyunD74wccHrl+/ToMGDfj111+ZM2eODE2LZybJWIgncfu23o40aZJt\nu7+/7kJ16CCnJ7mAa9f0ORyWOWI/P2siBjh//jwRERHMnz+fYMu+JiGegSRjIZLqjz/0Aqy//rK2\n5cunJxXr1ZOFWS4iLg7atYNDh/R1+vSwbJlOxDdv3iRDhgw8//zzHDlyhAxSK1wkE/ntIcTjHDqk\nqzq8/LJtIm7WDP78Ux9iK4nYZQwdqktbWsyYAWXKQGRkJFWrVn2wf1gSsUhO8htEiP9y7JjuCRcv\nDj/9ZG339ISvv4bFi/UJ8sJlzJ0LgwZZr3v10qcynT9/nldffZVDhw5RrVo10+ITrkuGqYV42B9/\n6FXSCxfqMcv4mjaFESP0XmLhUtatg06drNeBgTByJJw9e5bAwEBOnjzJqlWrCAwMNC9I4bIkGQth\nsX27XgW9fn3Ce7Vrw7BhekmtcDm7dunqpJZziUuW1IU+lLpHUFAQp0+fZt26dbzyyivmBipcliRj\nIU6e1Ac5/Phjwnu1a+sTleSXsMs6ckRP+1sO0/Lz071kvTPNgyFDhpA3b14qV65sZpjCxUkyFqnX\njRt6OHrMGL1lySJNGl2so08fvWhLuKw9e/Q6vMhIfZ09ux4YiY4+xooV+2ncuDEtW7Y0N0iRKkgy\nFqnP4cN6Adbs2QkrZ7VuDcOHw/PPmxKasA+l9I60Dz6Au3d1W/r0sGoVGMZhXnmlJnFxcQQGBpIx\nY0ZzgxWpgiRjkTpcuaKrNsy
aBZs3J7xftix89RVUrWr/2IRd3bgBnTvrcuIWmTPrBfOZMx+gevWa\nKKXYtGmTJGJhN5KMhes6exaWLtUVG8LCrKtz4nv+eT0nLMcapgoHDkDLlnDwoLWtdGm9WOvWrb+o\nUSMINzc3tmzZQjFZMS/sSJKxcC03bsCSJXrD6ObNtucJW6RJA40aQdeuev+KJOFU4fvvdY/YslAL\ndBnxCRP0EPWgQUtImzYtoaGhFClSxLxARapkqMR+WdlBQECACg8PN+W1hQs6dw4GDoR582x/28ZX\nrpw+Hb5dO71kVqQKt2/D++/DN99Y29Kn18sGOnSAmJgY3N3dUUoRGRmJj6UAtRDJzDCMPUqpgMTu\nSc9YOLfoaBg3ThfiuHnT9p5hQI0aelyycWNdR1qkKpcu6d1pe/ZY24oW1fVcSpaEX3/9lTfeeIOV\nK1dStGhRScTCNJKMhXOKjtYTfQMG6H3C8RUvrnu/bdpA/vzmxCdMd/cuNG9um4iDg2H6dPDygrCw\nMBo2bIivry+ZMmUyL1AhkGQsnElcHGzdqueDFy2C69dt75coAWPHQlCQ7hWLVEspePdd/eNiMWEC\ndOumfzQ2bdpE48aNKVSoEJs2bSJPnjzmBSsEkoyFs1i3Dnr0gKNHE97LkUMftfPWW+AuP9JC13KZ\nNct6/fnn0L27/nz79u00bNiQIkWKsGnTJnLmzGlOkELEI8tIhWM7dw5CQvR5wQ8nYn9/nYSPHtXd\nIEnEAn2Y1scfW6/feAP69rVely5dmk6dOrFlyxZJxMJhyG8v4Zhu3YJvv4X+/eHaNWu7tze0bavn\nhMuXl+FoYeP33/WPhkX16rrSlmHAxo0bqVixIl5eXkyZMsW8IIVIhPSMhWM5ehQ+/BB8ffWwdPxE\n3K6druo/aRJUqCCJWNi4cEGfcBkdra/9/fWW87Rp4YcffqBevXoMGDDA3CCF+A/SMxaOYe9e+OQT\nWLs24T1/f70pVM6RFf/h3j1dVvzff/W1t7euM50tG8yZM4dOnTpRrVo1hg0bZm6gQvwH6RkLc50/\nD2+/rU9HejgRFy6sT1T66y9JxOKReve2rpw2DF13ukgRmDFjBm+88QY1a9ZkzZo1soVJOCzpGQtz\nXLwIM2boZa7xtygZhj5ctksXqFNHSlWKx5o9W29bshg6FOrXhxs3bjB48GDq1q3LkiVL8PT0NC1G\nIR5HkrGwn9u3YcUKvU943bqEBzfUr697wlKgXyRBbCxMm6aXGFg0b65nO5RSZMqUiZ9//pk8efKQ\nLl068wIVIgkkGYuUd/48jB+v532vXEl4v1gxXayjbl37xyac0p9/6kMfdu60tr34ou4ljxo1krNn\nzzJ+/HgKFixoVohCPBEZAxQp59gxeO89KFBAD0c/nIgrV9ZD1X/+KYlYJMmlS/DRR/r46fiJuEgR\nWL5cMW7cED7++GMiIyOJjY01L1AhnpD0jEXyu3gRPvtMH5MTF2d7r1AhfVRO27bw3HPmxCeczvnz\nevBkyhR9SqZF2rS6wEffvophw/ozYsQIOnTowLfffoubm5t5AQvxhCQZi+Rz967eAzxkCFy9anuv\nXDno108fYSi/JEUSXb+u39dNm2bdP2xRo4Yu6FG0KHzyyad8/vnnvP3220ydOpU0svBPOBlJxuLZ\n3bih95KMGZOwZGXNmvDpp/Dqq1KkQzyREyegUSPYt8+2vXhxfVhX69bWH6lKlSrx/vvvM3bsWEnE\nwikZSilTXjggIECFh4eb8toimRw6pBdlzZ5tWykL9CTel1/qbUqShMUT+uUXaNYMIiOtbWXL6uqo\njRvrHW9xcXHs3r2bChUqmBeoEE/AMIw9SqmAxO7JW0jxZKKidAKuXFmvgp4wwTYRZ8kC48bB339D\nw4aSiMUTmzVLD6hYEnHatDBzJuzerctdpkkDsbGxvPXWW1SuXJm//vrL3ICFSAYyTC0eTykIDdXz\nwatX69qDD/P314U6OnbUCVmIJ6SUXm4waJC1LWdOWLpUv/eziImJoWPHjsybN4+BAwdSsmRJu8cq\nRHKTZCz+W2ys/k04ahQkNqXg7q57v1266HKVMlcnnpJSelX0qFHWtlKldI2YAgWsbffu3eP1119n\n4cKFDB8+nE8++cT+wQqRAiQZi4Tu3IHvvoPRoxMuyAJ9dGG7dhAcDD4+9o9PuBSloGdPXRfGok4d\nWLQIHi4lvWjRIhYuXMiYMWPo1auXfQMVIgVJMhZW167pPSTjxsG5c7b3PD31Ke3vv6/3kgiRDOLi\noGtXvUXJolEjWLgQEqtgGRISgp+fH1WqVLFfkELYgYwrCr0neOBA8PODPn1sE7G3ty72e+KErrgg\niVgkkytXoEUL20TcsqXuEcdPxNHR0bRp04Z9+/ZhGIYkYuGSpGecmt28CRMnwhdfwOXLtvfy5NEV\n+N95BzJnNic+4bJ+/x1atYKICGtbmzYwZ45eimBx8+ZNGjVqRFhYGA0aNKBEiRL2D1YIO0hSz9gw\njLqGYRw2DOMfwzD6/cczrQ3DOGAYxn7DMOYnb5giWd25o7ckPfecXjUTPxHrQ2Dh+HF9SKwkYpGM\nlNJVUitXtk3EH3yglynET8TXr1+nXr16bN26le+++47XX3/d/gELYSeP7RkbhuEGTAZqAaeB3YZh\nrFBKHYj3jD/wMVBFKXXZMIycKRWweAb37umux5Ah8O+/tvcKF4bBg+G116RcpUgxQ4fqGRELLy+9\nh7hlS9vnrl69Sr169di1axfz588nODjYvoEKYWdJGaYuD/yjlIoAMAzjR6AJcCDeM28Dk5VSlwGU\nUheSO1DxDJSCxYt1L/iff2zv+frq4r9vvAEeHubEJ1KF8HD9PtCiVCm9UKtIkYTPpk2blqxZs7Jw\n4UKaNWtmvyCFMElSkrEvEL8bdRp4uP5cEQDDMH4F3IBBSql1yRKheDbHj+vlqmvX2rb7+OiFWe++\nq1dKC5GC7t6FTp301nWAatVg/XpIn972ucjISNzc3MiaNSurVq3CkApuIpVIrgVc7oA/UAPIB2wz\nDKOkUsrmAFvDMN4B3gHw8/NLppcWibp3T9eGHjLE9ribLFn0iunu3RNu4hQihXz+ua6QCjoBz5qV\nMBGfP3+ewMBAcuTIwZYtWyQRi1QlKcn4DJA/3nW++23xnQZ2KqXuAccNwziCTs674z+klPoG+Ab0\nQRFPG7R4jIgIPQn3xx/WNsPQlbKGDZNylcKu/v4bhg+3Xo8YkfAo67NnzxIYGMipU6eYMGGCJGKR\n6iRlNfVuwN8wjEKGYaQFQoAVDz2zDN0rxjCMHOhh6wiE/a1erY+3iZ+IS5eGHTt0bWlJxMKOYmL0\n8LSlnHmlSnpQJr5///2X6tWrc/r0adatW0fNmjXtH6gQJntsMlZKxQDdgPXAQeAnpdR+wzCGGIbR\n+P5j64EowzAOAFuAj5RSUSkVtEhEbKxeiNWwoa6mAPq4m9Gj9cqZ8uXNjU+kSiNHWsua
p00L336b\ncLF+hw4duHDhAhs2bKBatWr2D1IIByDnGTu7K1d0yaLp02HXLmt7/vx6BXW5cubFJlK1efOgbVvr\n9YgRekH/wyIiIrh06RIBAYke8yqEy5DzjF1RWJguYZQ7N7z9tm0iDgrSJY4kEQuTbNigT9O0qFZN\n15CxOHToEP369SMuLo7ChQtLIhapniRjZ3P7tj6s4dVXdY/4zh3rPTc36N8f1q2DHDnMi1GkauHh\n0Ly5ni8GKFECli+3bmPft28fNWrUYNasWZw58/BaUCFSJ6lN7UwOHYKQEPjzT9v2MmX0kYavvaZ7\nykKY5J9/oH59XfYc9GzJ2rWQNau+/vPPPwkKCsLDw4PQ0FDy58//319MiFREkrEzuH1br3zp0wdu\n3bK2N2qkN3AWL25ebELc9++/eobk4kV9nTWrLuyRL5++3rNnD7Vq1SJjxoyEhobi7+9vXrBCOBhJ\nxo4sIkKfLzdzJkTFW5yeLp0u6NGli94/LITJzp2DmjXh5El9nT49rFoFxYpZn4mKiiJXrlysWbOG\nQoUKmROoEA5KkrEjioyEzp1h6VJdVzq+YsXgxx91YV8hHMDFi7pHbCl77uEBS5bok5lAl7jMkSMH\ntWvX5u+//8bdXX7tCPEwWcDlaPbv13uClyyxTcQFClg3bUoiFg7i8mWoXRsO3D82xs0NfvoJ6tbV\n12FhYRQuXJjFixcDSCIW4j/IvwxHsnq1XoR1/bq1rW5dfdBDvXpytKFwKDdv6sVae/fqa8OA77+H\npk319caNG2nSpAmFChWiSpUq5gUqhBOQZOwI4uL0HHDfvtbecMaMMH8+NG786P9XCBPcuwfBwbrK\nqsXMmXqxP8CaNWto3rw5L7zwAhs3bsTHx8ecQIVwEpKMzfbzz9CzJ+zZY20rUABWrJDhaOGQlNJL\nGlavtrZNmGAt8nH06FGaNm1KqVKl2LBhA9myZTMlTiGcicwZm+XECWjdGl55xTYRV6miq2lJIhYO\n6tNP9RGIFh9/bHv4g7+/P19//TWbNm2SRCxEEkkyNsOGDfDii7BwobXN01NXz9q8GXLmNC82IR5h\n/Hi9td3ijTesxyP++OOP/P777wC8+eabZJETwoRIMknG9rZzJzRrBtHR1raQEDh8GIYO1XuIhXBA\n48bBBx9Yrxs0gG++0Qu3Zs+eTZs2bRgxYoR5AQrhxGTO2J4OHNDLTy1VtPz84IcfrBsyhXBQI0fa\nnrhUqZLewuTuDt988w2dO3emVq1afPfdd+YFKYQTk56xvZw6BXXqwKVL+jpHDj1cLYlYODClYNAg\n20Rctao+iyRDBpg0aRKdO3emQYMGrFixggwZMpgWqxDOTJKxPURE6MoIp0/r60yZdPX8okXNjUuI\nR4iJgY8+gsGDrW01a+pEnDkzxMXFsX79epo2bcqSJUvw9PQ0L1ghnJwMU6eku3f1/uEhQ/RhDwBp\n08KyZSDntwoH9r//6fozYWHWtrp1dWG49OkhOjqa9OnTs3DhQtzc3PCwnI8ohHgq0jNOKb/8Ai+/\nDJ98Yk3Ebm4wbx4EBpobmxCPEBamT+WMn4ibNNHvIT09FYMGDaJy5cpcvXoVT09PScRCJANJxsnt\n5k296bJaNV1n2uKll2D7dmjZ0rzYhHiMceP0e8X//U9fG4Yepl68GNKmVXz66acMHjyYl156iUyZ\nMpkbrBAuRIapk9Mvv+gyRMeOWdsyZtRblrp310tPhXBQS5bAhx9ar318dEXWoCBQStG7d2/Gjh1L\n586dmTJlCmnSyHt5IZKL/GtKDrdvQ69euppW/ETcsCEcPKjLXUoiFg7s2DFdwMOiUiX44w+diAFG\njBjB2LFj6d69O19//bUkYiGSmWSIZ/W//+ljanbutLZ5e+tSRe3b63E+IRzY7dvQqhVcu6avCxbU\ndaezZrU+0759e9zc3Ojbty+G/EwLkezk7e2z+P13KFfONhHXqQP79kGHDpKIhVP44APdCwa92H/h\nQp2IY2NjmTFjBrGxseTPn59+/fpJIhYihUgyflqLF+vqB5a9w2nS6NUva9dCvnzmxiZEEs2bB9Om\nWa/HjdO77mJiYmjfvj1vv/02q1atMi9AIVIJGaZ+UrGxennp0KHWNm9vXRuwdm3z4hLiCf39tz4K\n0SI4GN57D+7du8frr7/OwoULGTFiBE2aNDEvSCFSCUnGTyIqCtq00WUsLZ5/HlauhBdeMC8uIZ7Q\npUt6qcPNm/q6SBGYPh3u3r1DSEgIy5Yt48svv+TD+MurhRApRoapkyo8XBfxiJ+IAwP1fLEkYuFE\nYmN1da2ICH2dMaPe1uTlBQcOHGDDhg1MnDhRErEQdiQ946SYMQO6dtXlLS0+/lgPVbu5mReXEE/h\n009t31N+9x288EIs4EaZMmX4559/yJMnj2nxCZEaSc/4UaKj4c034e23rYk4c2ZdF3DECEnEwuks\nWACjRlmvP/0Uate+Qa1atZh2fyWXJGIh7E+S8X85fhyqVIGZM61tJUvCnj26UK8QTuavv6BTJ+t1\n/frQq9c16taty9atW/Hy8jIvOCFSOUnGiVm+HMqWtW6+BGjbFnbs0Au2hHAyUVF6wdatW/ra3x+m\nTLlCvXq12blzJz/++CNt2rQxN0ghUjFJxvFdvqyrZjVtqj8H8PCAyZP1xJocnC6cUEyMXrB1/Li+\nzpQJfvrpDi1aBPH777+zaNEiWrVqZW6QQqRykowt1q6FEiVg7lxrm68vbNsGXbpINS3htD7+GDZu\ntF7PnQsvvZSO1157jWXLlsk+YiEcgKymBvjsM9siHgCvvw4TJkC2bObEJEQy+OEHGDPGev3hh//D\n1/dfoBy9evUyLS4hhC3pGX//vW0i9vHRmy6//14SsXBqq1fbLtgKCjrDqlXVadasGbdv3zYvMCFE\nAqm7Z/z773rbkkWtWrpYr4+PeTEJkQxmzdI/2rGx+rpw4VMcO1aTyMgLrF27Fk9PT3MDFELYSL09\n44sXoVkzfX4cQLFisGiRJGLh1JSCzz/XPWJLIvb1Pc6dO69w6VIkGzdupEqVKuYGKYRIIHUm43v3\noHVrOHVKX3t760IemTObG5cQzyAmBnr0gE8+sbaVLg1BQWOIjr7O5s2bqVChgnkBCiH+U+obplYK\nevaEsDB9bRh6aLpIEVPDEuJZnDsHISF68b/Fq6/C0qWQPv04Tp78AH9/f/MCFEI8UurrGY8YofcN\nWwwbBg0amBePEM8oNBReesk2Edepsw+lArl79yJp06aVRCyEg0tdyXjaNOjf33rdurXehCmEE4qN\nheHD9brDCxd0W5o00KXLXsLDa3DkyCEuW4rXCCEcWupJxosW6ZPTLYKCdFUtKeYhnNDx43oYun9/\niIvTbTlzwqRJ4fzwQ00yZMjA1q1bKSLTL0I4hdSRjDdv1kU8lNLXAQF6L3G6dObGJcQTUkqfXVKq\nFPz8s7W9WjWYPTucfv0C8fb2Ztu2bTwvddS
FcBqun4z379dbmCxHIBYtCmvW6JPUhXAiV69C8+b6\nVM8bN3SbmxsMHKjnjcuUyUe1atXYtm0bBQsWNDVWIcSTce3V1BcvQqNGcP26vvb11aeqy15i4WTO\nnYN69eDPP61tRYroOtNp0+4FSpA7d25WrVplWoxCiKfnuj3jO3d0N8JyVE3GjLo+oJ+fuXEJ8YQO\nH4ZKlWwTcdeu+oTPy5fXU6lSJQYOHGhegEKIZ+aayVgpePdd+OUXfW0YMH++roAghBPZuROqVIGT\nJ/W1m5sudTlpEoSGrqJx48a88MIL9OzZ09xAhRDPxDWT8dixMHu29XrkSGjc2LRwhHhSt27pLfE1\na0JUlG7LkAFWrICOHWHp0qU0b96cUqVKsXnzZnLkyGFqvEKIZ2MoywpjOwsICFDh4eHJ/4Xv3NFz\nwpZ54o4d9fJT2cIknEBcnD4w7NNP4fRpa3v27HqWpUIFuHr1KoUKFaJo0aKsW7cOb29v8wIWQiSZ\nYRh7lFIBid1zvQVcly5ZE3GWLDB1qiRi4RT++QeCg/VhYvEVLw6LF+uNAADe3t5s2LCBokWL4iW7\nAoRwCa43TH3njvXzzJllL7FwCgcPwiuv2CbiXLl00bi9e3UinjlzJuPHjwcgICBAErEQLsS1k7Ek\nYuEE/voLqlfX25cAPD1hwAA4ehTeeQfc3WHq1Km8+eabrF27lljL2YhCCJfhesPUkoyFE9mzB2rX\n1rMrAJky6bnhV16xPjNhwgTef/99GjRowKJFi3BzczMnWCFEipGesRAmCQ2FwEBrIs6cWdekiZ+I\nx4wZw/vvv0+zZs1YsmQJnp6e5gQrhEhRkoyFsLO4OH1yZ61ausQlQNasuoR6pUq2z6ZLl47g4GAW\nLFhA2rRp7R+sEMIuJBkLYUeRkfr47AEDrKct5cqle8kB9zc8KKU4ceIEAN27d+eHH37Aw8PDnICF\nEHYhyVgIO/n7b3j5ZVi3ztpWrZpeQf3SS/paKcXHH39MiRIlOHz4MACGbM0TwuVJMhbCDs6c0Qc9\n/Puvta1vX90jzptXXyul6NWrF6NGjaJdu3b4+/ubE6wQwu5kNbUQKezmTV2N9cwZfZ05s66y1aiR\n9Zm4uDh69OjB5MmT6dGjB1999ZX0iIVIRVy7ZywLXoTJ4uKgbVtrMQ83N11NK34iBpg9ezaTJ0+m\nd+/ekoiFSIWkZyxECurXD5Yts15PmQJBQQmfa9++PRkyZCA4OFgSsRCpkGv3jCUZCxNNmQKjR1uv\ne/XSFbUsYmJi6NOndvq6CAAAIABJREFUD+fOncPd3Z2QkBBJxEKkUq6XjO/etX4uyViYZPx46NrV\net24MYwaZb2+e/cuISEhjB49mtWrV9s/QCGEQ3G9ZCw9Y2GyL76ADz6wXpcvD/Pm6fligDt37tCq\nVSsWL17M2LFjeeutt8wJVAjhMGTOWIhkohQMHQoDB1rbqlSBNWt0zWmA6OhoWrRowdq1a5k8eTJd\nunQxJ1ghhEORZCxEMnk4EdeoAStXWhMxwK1btzh9+jTTp0+XHrEQ4gFJxkIkg6lTbRNx7dqwdClk\nyKCvb9y4Qdq0acmePTvh4eFSZ1oIYUPmjIV4RkuWQPzR5jp1YPlyayK+evUqderUoW3btiilJBEL\nIRKQZCzEM9i6Fdq00fPFAOXKwaJFYDnp8PLly9SuXZtdu3bRunVr2bokhEiUDFML8ZT+/huaNLH+\nyPn7w+rV1jniqKgoatWqxb59+1i0aBFNmjQxL1ghhENLUs/YMIy6hmEcNgzjH8Mw+j3iuRaGYSjD\nMAKSL8QnJMlY2MG5c1C/vvU84ty5Yf168PHR10opmjdvzoEDB1i+fLkkYiHEIz22Z2wYhhswGagF\nnAZ2G4axQil14KHnvID3gZ0pEWiSSTIWKezWLV3E4/Rpfe3lBWvXQqFC1mcMw+CLL77g+vXrBCVW\n/1IIIeJJSs+4PPCPUipCKXUX+BFI7G3+UGAUcDsZ43tykoxFCoqLg3btIDxcX7u56Tliy3nEZ86c\n4ZtvvgGgQoUKkoiFEEmSlGTsC8Q7hZXT99seMAzjZSC/Usr8un6SjEUK+uQTvXraYuJEvY0J4NSp\nU1SvXp3evXtz7tw5cwIUQjilZ15NbRhGGmAs0CsJz75jGEa4YRjhFy9efNaXTpwkY5FCZs60rS/9\nwQfw3nv684iICF555RUiIyPZuHEjefLkMSdIIYRTSkoyPgPkj3ed736bhRdQAggzDOMEUBFYkdgi\nLqXUN0qpAKVUgI9lpUtyk/OMRQpYs8b2xKWGDWHMGP350aNHqV69OtevXyc0NJQKFSqYE6QQwmkl\nZWvTbsDfMIxC6CQcArSx3FRKXQVyWK4NwwgDeiulwpM31CSSnrFIZjt3QqtWEBurr0uXhvnzrQc/\n7Nixg7t377JlyxZKlSplXqBCCKf12J6xUioG6AasBw4CPyml9huGMcQwjMYpHeATk2QsktGhQ9Cg\ngV5BDVCwoF457eWlT18CaNeuHUeOHJFELIR4akmaM1ZKrVFKFVFKPaeUGn6/7TOl1IpEnq1hWq8Y\nJBmLZHPmjC5tGRWlr3Pk0HuJ8+SBP/74A39/f7Zu3QqAt7e3iZEKIZyd65XDvHvX+rkkY/GUTp6E\nwEA4dUpfZ8igq2sVKQK7du2iZs2aGIZBvnz5zA1UCOESXC8ZS89YPKO//4bKleHwYX3t7g6LF0P5\n8rB9+3aCgoLImjUr27Zt47nnnjM3WCGES5BkLEQ827ZBtWpw9qy+TptWL9aqWxcOHjxI7dq1yZ07\nN9u2baNAgQLmBiuEcBmulYyVkmQsntrSpbqAh6XetKXMZatW+rpo0aL07NmTrVu3yvC0ECJZuVYy\njomxnmXn5mbdeyLEYyxfrpOu5b1c7ty6l1yzJmzatIlTp06RJk0ahg4dKgU9hBDJzrWSsfSKxVNY\ntw5at7buI/b3h+3bdb3plStX0qBBAz788ENzgxRCuDRJxiJVCwuDZs2si/Cffx62btUnMC1evJjm\nzZtTunRppk+fbmqcQgjXJslYpFrbt+uylrfvnzNWoABs3qz3Ef+/vTuPr6q4/z/+GpIgjShQpFSj\ngCKKiFb5IlpFCGGLrGUVEQShBQSqgKJWK+6CVbC2BVwgZRWQRYhRQHYtLaA1lB+bCIgsKiBQZEnM\nNr8/JulNIMtF7n7fz8eDR8+cnHPz6THkzcyZM2fWrFncfffdNGrUiGXLllGlSpXgFisiEU1hLFFp\n7Vo3Q/rUKde+7DIXxDVqQG5uLq+99hp33HEHS5Ys0YIeIuJ33qxNHT4UxuKF1atdj7ggiH/xCxfE\ntWtDXl4eMTExLF68mAoVKnDhhRcGtVYRiQ7qGUtU+egjuOsuTxBXr+6CuG5dmDBhAh06dODHH3+k\natWqCmIRCRiFsUSN1FRo395zj/iyy9xkrfr14c9//jNDhgyhXLnI+ishIuEhsn7zKIylGEeOQL9+\n0L
GjZ9Z0jRruOeJrr4U//elPDB8+nC5dujBv3jwu0M+OiARY5IZx+fLBq0NCgrUwdaobgv773z37\nr7rKBXHt2jB27Fgee+wxevTowezZsymvnxsRCYLIDWP1bqLav/8NiYnQty98/71nf5cubiZ1wbLS\nzZo1Y8iQIcyYMYPY2Miazygi4UNhLBFl717o3RsaNnS93wI1asD778O8eVC9umXZsmUANGjQgL/9\n7W/EaOlUEQkihbFEhNxcGDXKvW94xgzP/thYePhh2LrVPc5krWX48OG0atWKJUuWBK9gEZFCImtc\nrmB2DiiMo0hmJtx7LyxYUHT/b34DY8a4SVrgniEeOnQoEydOZNiwYbRu3TrwxYqIFCOywlg946jz\n3/+6WdKFh6RvuQXGjnXvJS6Ql5fHgAEDmDx5Mo8++ihjxozBGBP4gkVEiqFhaglbBw64wC0cxMOG\nwbp1RYMY4J///CcpKSk89dRTCmIRCTnqGUtY+vRTNzN63z7PvldecfeHi8vZxo0b8/nnn3PTTTcF\nrkgRES+pZyxhxVoYPx7uuMMTxLGxMG0aPPJI0SDOysqiZ8+e/5uopSAWkVClMJawceIE9OwJQ4dC\ndrbbV6mSe2Spd++ix/7444907dqVWbNm8eWXXwa+WBGRc6BhagkL+/ZBq1awfbtn3803w9y5biWt\nwjIyMujcuTNLlixhwoQJPPDAA4EtVkTkHKlnLCHvu++gefOiQTxgAPzzn2cHcWZmJu3bt2fp0qVM\nmjRJQSwiYUE9Ywlp338PLVpAwUhzXBxMnnz2sHSB8uXLc80119CnTx96l3SQiEiIURhLyPrvf93Q\n9JYtrh0TA3PmQKdOZx97/Phxjh07Rq1atZgwYUJgCxUROU8KYwlJJ07AXXdBerprGwPTpxcfxMeO\nHaN169YcO3aMLVu26M1LIhJ2FMYSck6fdutIr1vn2Td5Mtxzz9nHfv/997Rq1YotW7Ywb948BbGI\nhKXIDWP9Ug5LmZluTenCq2qNHw/333/2sYcOHaJFixZ8+eWXLFq0iOTk5MAVKiLiQ5EbxuoZh52s\nLOjWDfLfbgjAq6/C4MHFHz9y5Eh27txJWloazZs3D0yRIiJ+oEebJCTk5ECvXpCW5tn3/PNuecuS\nvP7666xatUpBLCJhT2EsIWHkSLeAR4E//AGefPLs4/bs2UO/fv3IyMigcuXK3HrrrYErUkTETxTG\nEnTTp8Of/+xpDxsGL7549gsfdu3aRdOmTXnvvffYvXt3YIsUEfGjyArjrCzPtsI4LHz+uVtNq0Cn\nTu5dxGcG8RdffEGTJk04deoUK1eu5Prrrw9soSIifqQJXBI0hw65mdOZma5drx5MnQrlzvgn4tat\nW0lKSsJay6pVq7jhhhsCX6yIiB9FVs9YYRw2fvgB7r7b8xrESpVg4UK46KKzj83Ly6N69eqsXr1a\nQSwiEUk9YwmozZthwgR3n/jkSbfPGHjnHahTp+ix+/fvJyEhgfr165Oenk65M7vMIiIRIrJ+uymM\nQ9aXX0JSEtxwA0yc6AlicI8wtWlT9Pj169dTv359xo0bB6AgFpGIpp6x+N2GDdC2rXsDU2H16sEj\nj0DfvkX3r127lrvuuotq1arRtWvXgNUpIhIskRPG1iqMQ9DixdC1q1tvGiA21s2YHjIEmjQ5e9b0\n6tWradeuHQkJCaxcuZKEhITAFy0iEmCRE8bZ2Z7t2Nizp+RKwE2bBv37u9W1AKpWhQ8+gJLW6Thy\n5AgdOnSgZs2arFixgl/+8peBK1ZEJIgiJ7HUKw4Z1sILL0CfPp4grlkT1q4tOYgBqlatysyZM1m9\nerWCWESiSuT0jBXGISEjw/WGZ83y7LvxRjdcfdllxZ+zaNEiADp27Ej79u0DUKWISGhRz1h85ptv\noGnTokGclOReh1hSEM+bN4+uXbsyduxYrLWBKVREJMQojMUnNm2CW26BTz/17HvgAViyxC3oUZxZ\ns2bRo0cPbr31VtLS0jBnzuYSEYkSkRnG5csHr44otHs3tGrlesYAMTEwfrxb3CMurvhzpk6dSq9e\nvWjcuDFLlizh4osvDlzBIiIhRveM5bwcOgStW8PBg65dqRLMmwctWpR+3saNG0lKSmLRokXEx8f7\nv1ARkRCmMJaf7MQJt3LWzp2uXaECpKVB48Yln3P8+HEqVarEuHHjyMrK4gL9txIRidBhav2C97us\nLOjSBf79b9cuVw5mzy49iF977TXq1avH3r17McYoiEVE8imM5Zzk5MDMmXDTTbBsmWf/G29Ax44l\nn/fyyy8zYsQIbr/9di699FL/FyoiEkYiZ5g6K8uzrTD2uexst6LW6NGwa1fRrz37LPzudyWf+/zz\nzzNq1Cjuuecepk2bRmxs5PzYiYj4QuT8VlTP2G/27oXu3WH9+qL7L7oInnwSHn205HMnT57MqFGj\n6NOnD5MnTyYmJsa/xYqIhCGFsZRq8WLo1QuOHvXsq1IFhg2D3//ebZeme/fuHD16lIcfflivQRQR\nKUHk/HZUGPtUTo7r9bZp4wni2Fg3JP311zBqVMlBbK3lr3/9KydPnuSiiy5i5MiRCmIRkVKoZyxn\nOXYM7r676ASthASYMwfuuKP0c/Py8hgyZAhvvPEGcXFxDBo0yL/FiohEAIWxFLFjB7Rv7/63QMuW\nbgZ1tWqln5ubm8uAAQNISUnh8ccfZ+DAgf4tVkQkQkTO2KHC+LwtX+5ecVg4iEeNcveNywrinJwc\n+vbtS0pKCk8//TQvvfSS1poWEfGSesYCuLWkH3oIcnNd+2c/g6lToVs3787/9ttvWb58OS+++CJP\nPPGE/woVEYlACuMol53tQnjiRM++hARYtAj+7/+8OT+b2NhYrrjiCrZs2cLPf/5z/xUrIhKhNEwd\nxY4eheTkokF8yy2wYYN3QZyZmUnnzp0ZOXIkgIJYROQnUhhHqe3b3f3hlSs9+3r0gDVr4LLLyj4/\nIyODjh07kpaWRp06dfxXqIhIFIjMMNb7jEu1ZIkL4oK3LQE8/zy88467V1yWU6dO0bZtW5YtW0ZK\nSopmTYuInCfdM44i1sJf/gIjRkBentsXH+/WnO7SxdvPsHTs2JE1a9Ywbdo0evXq5b+CRUSihMI4\nSmRlwdCh8Pbbnn2XXw6pqXDzzd5/jjGGwYMHM2DAALp37+77QkVEopDCOAocPw6dOsGqVZ59t94K\nCxfCL3/p3WccPXqUDRs2kJycTOfOnf1TqIhIlFIYR7hvvoG77oJNmzz77r0XJk2CChW8+4zvv/+e\nli1b8uWXX/LVV19RrawVQERE5JxE5gQuhTEAX3wBt99eNIhfeAGmT/c+iA8ePEhiYiLbt29nwYIF\nCmIRET+InJ5xVpZnW2HMhg3ujUtHjrh2TAxMngx9+nj/Gd988w3Nmzdn7969fPDBByQlJfmnWBGR\nKBc5Yaye8f/s2OFe7vDDD64dHw/z5rnh6nMxa9Ys9u/fz5I
lS7jzzjt9X6iIiABeDlMbY5KNMV8Y\nY3YaYx4v5usjjDFbjTGbjDErjDE1fV9qGRTGAGRkuPWkC4K4alW3sMe5BLG1FoARI0awadMmBbGI\niJ+VGcbGmBhgPHAXUA+4xxhT74zD0oGG1tobgXnAn3xdaJkUxgA8+KDnHvEFF8BHH7mZ097auXMn\njRo1Ytu2bRhjuPLKK/1TqIiI/I83PeNGwE5r7W5rbRYwG+hY+ABr7Spr7en85jrgct+W6QWFMdOm\nuVnSBV5/HRo08P787du306RJE/bs2cOPha+niIj4lTdhnADsK9Ten7+vJP2BxedT1E8S5WG8eTMM\nGuRp33svDBhwLudvJjExkdzcXFatWsVNN93k+yJFRKRYPp3AZYzpBTQEmpbw9QHAAIAaNWr48ltH\ndRh//bVbzjIjw7Wvuw7eeAOM8e787du306xZM+Li4li5ciV169b1X7EiInIWb3rGB4ArCrUvz99X\nhDGmBfAk0MFaW+wYp7X2LWttQ2ttQ58/rxqlYbx2rXvt4Y4drl0wc7piRe8/o0aNGrRu3Zo1a9Yo\niEVEgsCbMP4UqGOMudIYUx7oAaQWPsAYczPwJi6ID/m+TC9EYRhPnQpJSXD4sGvHxbn7xvXOnF5X\ngvT0dI4fP058fDwzZszQqxBFRIKkzDC21uYAQ4GlwDbgXWvtFmPMc8aYDvmHvQJUBOYaYzYaY1JL\n+Dj/sLbooh8R/gpFa+Hxx6FvX8//7WrV3NrT3r596R//+AdNmjRhyJAhfqtTRES849U9Y2vth8CH\nZ+wbVWi7hY/rOjeFgzguDspFziqfxXnpJXj5ZU/7hhvc25dq1fLu/NWrV9O2bVuuuOIKXi78QSIi\nEhSRkVqFh6gjvFf87rvwxz962u3bu/vG3gbxsmXLaNOmDbVq1WL16tUkJJQ2MV5ERAIhMpbDjJL7\nxevXF11bulkzN1nL239/ZGdnM3jwYOrUqcPy5cv10gcRkRChMA4TX38NHTtCZqZrX3MNzJ9/bgMB\ncXFxLFmyhMqVK1O1alX/FCoiIucs8oapIzCMv/sO2rWDgwdd++c/hw8+gCpVvDt/7ty5PPjgg1hr\nqV27toJYRCTEKIxDXHo6NGrkVtgCNz/tvffg6qu9O3/mzJn06NGD9PR0MgpWBRERkZCiMA5hCxZA\n48awL38x0nLlICUFmjTx7vwpU6bQu3dvmjZtyuLFi4mPj/dfsSIi8pMpjEOQtfDii+6Z4dP5r9+4\n+GI3NN2rl3efMWnSJO6//35atGhBWloaFc9lSS4REQmoyAjjws8ZR0AYT5hQ9PGl2rVh3TpITvb+\nMy699FI6depEamqqesQiIiEuMsI4gnrGn34Kw4d72omJ7pGm667z7vxt27YB0LZtWxYsWECFChV8\nX6SIiPiUwjiEHDsG3btDdrZr33wzLF4M3k5+Hj16NPXr1+fjjz/2X5EiIuJzCuMQYa1b0GPPHteu\nVAnmzgVvOrbWWp599lmeeOIJ7rnnHm6//Xa/1ioiIr6lRT9CxKuvwvvve9p//7u7V1wWay1PPvkk\no0ePpm/fvkyaNImYmBj/FSoiIj6nnnEImD8f/vAHT3vECOjUybtzly1bxujRoxk4cCCTJ09WEIuI\nhCH1jINswgQYOtQNUwP8+tcwZoz357ds2ZL333+ftm3bYozxT5EiIuJX6hkHibXu8aUhQzxBfO21\n7j5xXFzp5+bl5TFy5Ej+85//YIyhXbt2CmIRkTCmnnEQ5OTAwIFuNa0Ct94KaWlwySWln5ubm8tv\nf/tbpkyZQpUqVfjVr37l32JFRMTv1DMOMGvhoYeKBnGbNrBiRdlBnJOTw3333ceUKVN45pln+EPh\nG80iIhK2Iq9nfC7vFAyCv/7V3Scu0KcPvP122UPT2dnZ3HvvvcydO5eXXnpJQSwiEkEiL4xDuGf8\nwQdFV9e6+27XQy7nxfhEXl4eJ06cYOzYsYwYMcJ/RYqISMApjANk0ybo0QPy8lz7ttvcs8RlBXFm\nZiYZGRlUqVKFtLQ0PbokIhKBdM84AL79Ftq1g5MnXbtmTVi4EH72s9LPO336NB06dCA5OZmcnBwF\nsYhIhFLP2M8OH4YWLTzvJL7oIjdrunr10s87efIk7du3Z82aNaSkpBAbGxn/qURE5GyR8Rs+RMP4\n2DFo1Qq2bnXtmBh4912oX7/083744QfatGnDv/71L2bMmEHPnj39X6yIiASNwthPfvjBvX9440bX\nLlcOZs707p3EAwcOZP369cyePZtu3br5t1AREQm6yLhnnJXl2Q6BMD51yt0j3rDBsy8lxc2e9saY\nMWNYuHChglhEJEpERhiHUM84Nxd69oRPPvHsmzjRPU9cmsOHD/Pcc8+Rl5dHzZo1adu2rX8LFRGR\nkKEw9rHHHoPUVE977FgYNKj0c7777jsSExMZPXo0WwtuMIuISNTQPWMfeustF74FHnnEvQ6xNAcO\nHCApKYn9+/fz4YcfUr+s2V0iIhJxFMY+snw5DB7saf/mN2W/CnHv3r0kJSVx6NAhli5dSuPGjf1b\npIiIhCSFsQ9s2wZdu7r7xQANGsCMGe5RptLs3r2bU6dO8dFHH3Hbbbf5v1AREQlJCuPzdPgwtG0L\nx4+79mWXuXvGF15Y8jknT56kYsWKJCYmsmvXLuLj4wNTrIiIhCRN4DoPmZnQqRN89ZVrx8e71bUS\nEko+Z9u2bdStW5fp06fnn6MgFhGJdgrjn8ha6N8f1q51bWNg1iy4+eaSz9m8eTOJiYnk5OTQoEGD\ngNQpIiKhL/LCOEDvM37+eXjnHU/71VehQ4eSj9+4cSOJiYnExsayZs0arr/+ev8XKSIiYSHywjgA\nPeN334Wnn/a0Bwwo+p7iMx08eJCkpCTi4+NZs2YN1157rd9rFBGR8BH+YZyXB9nZnrafe8a7d8Nv\nf+tpt2gBf/ubG6YuSfXq1Xnuuef4+OOPufrqq/1an4iIhJ/wn01deF3quDj3RgY/yc52S12eOOHa\ntWvD3Lnu2xbnk08+oUKFCtxyyy0MHTrUb3WJiEh4C/+ecQCHqJ99Ftavd9uxsW7CVuXKxR+7YsUK\nkpOTGTZsGNZav9YlIiLhTWHspdWr4aWXPO0XXoBbbin+2KVLl9KuXTuuuuoqFixYgCltDFtERKKe\nwtgLR45Ar17ucSaA5s1h5Mjij01LS6NDhw7UrVuXVatWUb16db/UJCIikUNhXAZrYeBAOHDAtatW\nhWnTSr41PXXqVG688UZWrFjBJZdc4vN6REQk8kTWBC4/hPHMmTB/vqedkuKWvDxTTk4OsbGxzJgx\ng8zMTCpVquTzWkREJDKFf8/4uutcIJ84AevW+fSj9+2DwpOgBw4sfmGPGTNm0KhRI44cOcIFF1yg\nIBYRkXMS/mFsjHu2qGJF8GEI5uVBv36eF0BcdZVbZetMKSkp3HfffVSuXJkKFSr47PuLiEj0CP8w\n9pOJE907isHl/b
RpLu8Le/PNN+nfvz8tW7YkLS2NC0t7VZOIiEgJFMbF2LGj6GzpRx+FO+4oesy0\nadMYNGgQbdu2ZdGiRXr7koiI/GQK4zNs3gytW0NGhmvfcINb7ONMzZs358EHH2TBggUanhYRkfOi\nMC4kLQ1+/WvYs8e14+Jg+vSik7QXLlxIbm4uCQkJvP7665QP0FuiREQkcimMcc8Sv/KKmyl98qTb\nV7EivPce/OpXBcdYnnnmGTp16kRKSkrwihURkYgT/s8Z+8Ajj8C4cZ52rVqQmuqGqMEF8RNPPMGY\nMWO4//776devX1DqFBGRyBT1PePFi4sG8Z13woYNRYP44YcfZsyYMQwaNIhJkyYRExMTnGJFRCQi\nRXUYHzkC/ft72u3auceZqlXz7Nu5cydvvvkmDz74IBMmTKCcH1/RKCIi0Smqh6mHDIFvv3Xb1au7\npS4L5mNZazHGUKdOHTZu3MjVV1+tty+JiIhfRG03b/ZsmDPH0377bU+PODc3l379+jFhwgQA6tSp\noyAWERG/icowPnAABg/2tPv3h/bt3XZOTg69e/dmypQpHDlyJDgFiohIVIm6YeqsLOjdG44dc+1a\ntTwTuLKysujZsyfz589nzJgxPPbYY0GrU0REokdUhXFeHtx/P6xa5drGwJQpcPHFkJeXR7du3UhN\nTWXcuHEMHz48qLWKiEj0iKowHjkS3nnH0372WWja1G2XK1eOpk2b0qpVK4YMGRKcAkVEJCpFTRi/\n+mrR54kHDoQ//hFOnz7Njh07uOmmmxgxYkTwChQRkagVFRO4Zswo+hamTp1g/Hg4deokbdq0oVmz\nZhwruIksIiISYBHfM969GwYM8LTvvNMNVZ869QNt2rRh3bp1TJ8+nSpVqgSvSBERiWoRHcbWwgMP\neF6HeN11bs3pjIxjJCcn8/nnnzNnzhy6dOkS3EJFRCSqRfQw9ezZ8NFHbrtg5nTlyjB27FjS09OZ\nP3++glhERILOWGuD8o0bNmxoP/vsM799/tGjrid86JBr//738Je/uO3s7GzS09Np1KiR376/iIhI\nYcaYf1trGxb3tYjtGT/2mCeIExJg6NBv6dy5MwcPHiQuLk5BLCIiISMi7xl/8glMmuRpP/30ftq1\nS+Kbb75h165dVK9ePXjFiYiInCHiwjgryz1DXKBly68ZMyaJw4cPs3TpUm6//fbgFSciIlKMiAvj\n11+Hbdvcdnz8V2zdmsipUz+wfPlyDU2LiEhIiqh7xvv3uyUuCzz6aDy1al3BihUrFMQiIhKyIiqM\nH34YTp0C+Irrr8/miSeq88knn9CgQYNglyYiIlIir8LYGJNsjPnCGLPTGPN4MV+/wBgzJ//r640x\ntXxdaFmWL4d33wX4f8Ct1KkzjLg4MMYEuhQREZFzUmYYG2NigPHAXUA94B5jTL0zDusPHLPWXg28\nBrzs60JLk5UFQ4cCpAPNiI8vz8svPxTIEkRERH4yb3rGjYCd1trd1tosYDbQ8YxjOgJT87fnAc1N\nALukr70GX3zxKZCEMReyYsUarrnmmkB9exERkfPiTRgnAPsKtffn7yv2GGttDnAcqOqLAsuybx88\n++yPQGegCk899TG33VY7EN9aRETEJwL6aJMxZgAwAKBGjRo++cwFCyAj4wJgLnXrXs5TT13uk88V\nEREJFG96xgeAKwq1L8/fV+wxxphYoBJw5MwPsta+Za1taK1tWK1atZ9W8RkeegiWLYO6dW/jrbcu\nJzbinpwWEZFI5010fQrUMcZciQvdHkDPM45JBfoA/wK6AittAN9A0aIFbN4MMTGB+o4iIiK+U2YY\nW2tzjDFDgaWEOmlCAAAEy0lEQVRADJBird1ijHkO+MxamwpMBqYbY3YCR3GBHVAKYhERCVdeDepa\naz8EPjxj36hC25lAN9+WJiIiEh0iagUuERGRcKQwFhERCTKFsYiISJApjEVERIJMYSwiIhJkCmMR\nEZEgUxiLiIgEmcJYREQkyBTGIiIiQaYwFhERCTKFsYiISJApjEVERIJMYSwiIhJkCmMREZEgUxiL\niIgEmbHWBucbG3MY+NqHH3kJ8L0PPy9a6TqeP13D86dreP50Dc+fr69hTWttteK+ELQw9jVjzGfW\n2obBriPc6TqeP13D86dreP50Dc9fIK+hhqlFRESCTGEsIiISZJEUxm8Fu4AIoet4/nQNz5+u4fnT\nNTx/AbuGEXPPWEREJFxFUs9YREQkLIVdGBtjko0xXxhjdhpjHi/m6xcYY+bkf329MaZW4KsMbV5c\nwxHGmK3GmE3GmBXGmJrBqDOUlXUNCx3XxRhjjTGa1VoMb66jMaZ7/s/jFmPMO4GuMdR58fe5hjFm\nlTEmPf/vdJtg1BmqjDEpxphDxpjNJXzdGGP+kn99NxljGvilEGtt2PwBYoBdwFVAeeA/QL0zjhkM\nvJG/3QOYE+y6Q+mPl9ewGRCfv/2AruG5X8P84y4CPgbWAQ2DXXeo/fHyZ7EOkA5UyW//Ith1h9If\nL6/hW8AD+dv1gD3BrjuU/gBNgAbA5hK+3gZYDBjgNmC9P+oIt55xI2CntXa3tTYLmA10POOYjsDU\n/O15QHNjjAlgjaGuzGtorV1lrT2d31wHXB7gGkOdNz+HAM8DLwOZgSwujHhzHX8HjLfWHgOw1h4K\ncI2hzptraIGL87crAd8EsL6QZ639GDhayiEdgWnWWQdUNsZc6us6wi2ME4B9hdr78/cVe4y1Ngc4\nDlQNSHXhwZtrWFh/3L8KxaPMa5g/lHWFtfaDQBYWZrz5WbwGuMYYs9YYs84Ykxyw6sKDN9fwGaCX\nMWY/8CHw+8CUFjHO9XfmTxLr6w+UyGGM6QU0BJoGu5ZwYowpB4wD+ga5lEgQixuqTsSN0HxsjLnB\nWvvfoFYVXu4Bplhrxxpjfg1MN8bUt9bmBbsw8Qi3nvEB4IpC7cvz9xV7jDEmFjcscyQg1YUHb64h\nxpgWwJNAB2vtjwGqLVyUdQ0vAuoDq40xe3D3mVI1iess3vws7gdSrbXZ1tqvgB24cBbHm2vYH3gX\nwFr7L6ACbs1l8Y5XvzPPV7iF8adAHWPMlcaY8rgJWqlnHJMK9Mnf7gqstPl34QXw4hoaY24G3sQF\nse7Rna3Ua2itPW6tvcRaW8taWwt3372Dtfaz4JQbsrz5+7wQ1yvGGHMJbth6dyCLDHHeXMO9QHMA\nY8x1uDA+HNAqw1sqcF/+rOrbgOPW2m99/U3CapjaWptjjBkKLMXNIkyx1m4xxjwHfGatTQUm44Zh\nduJuyvcIXsWhx8tr+ApQEZibP/dtr7W2Q9CKDjFeXkMpg5fXcSnQyhizFcgFRlprNdKVz8tr+DDw\ntjFmOG4yV191UDyMMbNw/+C7JP+++tNAHIC19g3cffY2wE7gNHC/X+rQfxMREZHgCrdhahERkYij\nMBYREQkyhbGIiEiQKYxFRESCTGEsIiISZApjERGRIFMYi4iIBJnCWEREJMj
+P50h0JPWRc/hAAAA\nAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "results = pd.concat(dfs)\n", + "import statsmodels.api as sm\n", + "thresh = 0.001 # POSSIBLE BUG? several very small pivots -- fine for pvalues\n", + "grid = np.linspace(0, 1, 101)\n", + "fig = plt.figure(figsize=(8, 8))\n", + "plt.plot(grid, sm.distributions.ECDF(results['pivot'][results['pivot'] > thresh])(grid), 'b-', linewidth=3, label='Pivot')\n", + "plt.plot(grid, sm.distributions.ECDF(results['pvalue'])(grid), 'r-', linewidth=3, label='P-value')\n", + "plt.plot([0, 1], [0, 1], 'k--')\n", + "plt.legend(fontsize=15);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "all,-slideshow", + "formats": "ipynb,Rmd" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/doc/source/algorithms/ROSI.Rmd b/doc/source/algorithms/ROSI.Rmd new file mode 100644 index 000000000..b53c6a8c5 --- /dev/null +++ b/doc/source/algorithms/ROSI.Rmd @@ -0,0 +1,110 @@ +--- +jupyter: + jupytext: + cell_metadata_filter: all,-slideshow + formats: ipynb,Rmd + text_representation: + extension: .Rmd + format_name: rmarkdown + format_version: '1.1' + jupytext_version: 1.1.1 + kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +# Conditioning on less: ROSI + +Instead of conditioning on the active set and signs, +one can work in the full model and for each feature $j$ selected +construct p-values and confidence intervals +conditional only on the event $j$ was selected. +This is the approach of [Liu et al.](https://arxiv.org/abs/1801.09037), which +can be extended as ROSI (Relevant One-step Selective Inference) +beyond squared-error loss (described in forthcoming work, though +code is already available). + + +```{python} +import numpy as np, pandas as pd +import matplotlib.pyplot as plt +import statsmodels.api as sm +# %matplotlib inline + +from selectinf.tests.instance import gaussian_instance # to generate the data +from selectinf.algorithms.api import ROSI + +``` + +We will know generate some data from an OLS regression model and fit the LASSO +with a fixed value of $\lambda$. In the simulation world, we know the +true parameters, hence we can then return +pivots for each variable selected by the LASSO. These pivots should look +(marginally) like a draw from `np.random.sample`. This is the plot below. 
+ +```{python collapsed=TRUE} +np.random.seed(0) # for replicability + +def simulate(n=500, + p=100, + s=5, + signal=(5, 10), + sigma=1): + + # description of statistical problem + + X, y, truth = gaussian_instance(n=n, + p=p, + s=s, + equicorrelated=False, + rho=0., + sigma=sigma, + signal=signal, + random_signs=True, + scale=False)[:3] + + sigma_hat = np.linalg.norm(y - X.dot(np.linalg.pinv(X).dot(y))) / np.sqrt(n - p) + L = ROSI.gaussian(X, y, 2 * np.sqrt(n), sigma=sigma_hat) + soln = L.fit() + active_vars = soln != 0 + + if active_vars.sum() > 0: + projected_truth = np.linalg.pinv(X[:, active_vars]).dot(X.dot(truth)) + S = L.summary(truth=projected_truth) + S0 = L.summary() + + pivot = S['pval'] # these should be pivotal + pvalue = S0['pval'] + return pd.DataFrame({'pivot':pivot, + 'pvalue':pvalue}) +``` + +Let's take a look at what we get as a return value: + +```{python} +while True: + df = simulate() + if df is not None: + break +df.columns +``` + +```{python} +dfs = [] +for i in range(200): + df = simulate() + if df is not None: + dfs.append(df) +``` + +```{python} +results = pd.concat(dfs) +import statsmodels.api as sm +grid = np.linspace(0, 1, 101) +fig = plt.figure(figsize=(8, 8)) +plt.plot(grid, sm.distributions.ECDF(results['pivot'])(grid), 'b-', linewidth=3, label='Pivot') +plt.plot(grid, sm.distributions.ECDF(results['pvalue'])(grid), 'r-', linewidth=3, label='P-value') +plt.plot([0, 1], [0, 1], 'k--') +plt.legend(fontsize=15); +``` diff --git a/doc/source/algorithms/ROSI.ipynb b/doc/source/algorithms/ROSI.ipynb new file mode 100644 index 000000000..0ca401472 --- /dev/null +++ b/doc/source/algorithms/ROSI.ipynb @@ -0,0 +1,186 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Conditioning on less: ROSI\n", + "\n", + "Instead of conditioning on the active set and signs, \n", + "one can work in the full model and for each feature $j$ selected\n", + "construct p-values and confidence intervals\n", + "conditional only on the event $j$ was selected.\n", + "This is the approach of [Liu et al.](https://arxiv.org/abs/1801.09037), which\n", + "can be extended as ROSI (Relevant One-step Selective Inference)\n", + "beyond squared-error loss (described in forthcoming work, though\n", + "code is already available).\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np, pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import statsmodels.api as sm\n", + "%matplotlib inline\n", + "\n", + "from selectinf.tests.instance import gaussian_instance # to generate the data\n", + "from selectinf.algorithms.api import ROSI\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will know generate some data from an OLS regression model and fit the LASSO\n", + "with a fixed value of $\\lambda$. In the simulation world, we know the\n", + "true parameters, hence we can then return\n", + "pivots for each variable selected by the LASSO. These pivots should look\n", + "(marginally) like a draw from `np.random.sample`. This is the plot below." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "np.random.seed(0) # for replicability\n", + "\n", + "def simulate(n=500, \n", + " p=100, \n", + " s=5, \n", + " signal=(5, 10), \n", + " sigma=1): \n", + "\n", + " # description of statistical problem\n", + "\n", + " X, y, truth = gaussian_instance(n=n,\n", + " p=p, \n", + " s=s,\n", + " equicorrelated=False,\n", + " rho=0., \n", + " sigma=sigma,\n", + " signal=signal,\n", + " random_signs=True,\n", + " scale=False)[:3]\n", + "\n", + " sigma_hat = np.linalg.norm(y - X.dot(np.linalg.pinv(X).dot(y))) / np.sqrt(n - p)\n", + " L = ROSI.gaussian(X, y, 2 * np.sqrt(n), sigma=sigma_hat)\n", + " soln = L.fit()\n", + " active_vars = soln != 0\n", + " \n", + " if active_vars.sum() > 0:\n", + " projected_truth = np.linalg.pinv(X[:, active_vars]).dot(X.dot(truth))\n", + " S = L.summary(truth=projected_truth)\n", + " S0 = L.summary()\n", + "\n", + " pivot = S['pval'] # these should be pivotal\n", + " pvalue = S0['pval']\n", + " return pd.DataFrame({'pivot':pivot,\n", + " 'pvalue':pvalue})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's take a look at what we get as a return value:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['pivot', 'pvalue'], dtype='object')" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "while True:\n", + " df = simulate()\n", + " if df is not None:\n", + " break\n", + "df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "dfs = []\n", + "for i in range(200):\n", + " df = simulate()\n", + " if df is not None:\n", + " dfs.append(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAeMAAAHSCAYAAADfUaMwAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0\ndHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nOzde3yO9R/H8de12cz5fByico7CnI/Z\nHEKI2CgSRUgnogMRUTlVkgoVyWEIySlnOklUv4SSYznkfDY7Xr8/vubebM7bfd27934+HnvY9b2v\nts8ye+/7vb4Hy7ZtRERExDk+ThcgIiKS3imMRUREHKYwFhERcZjCWERExGEKYxEREYcpjEVERByW\nwalPnDdvXrt48eJOfXoRERG32rx58zHbtvMl95pjYVy8eHE2bdrk1KcXERFxK8uy9l3tNQ1Ti4iI\nOExhLCIi4jCFsYiIiMMUxiIiIg5TGIuIiDjMsdnUN+LMmTMcOXKE6Ohop0uRa/Dz8yN//vxkz57d\n6VJERNIkjw3jM2fOcPjwYQIDA8mUKROWZTldkiTDtm0iIiI4cOAAgAJZROQWeOww9ZEjRwgMDCRz\n5swKYg9mWRaZM2cmMDCQI0eOOF2OiEia5LFhHB0dTaZMmZwuQ25QpkyZ9DhBROQWeWwYA+oRpyH6\nuxIRuXUeHcYiIiLpgcJYRETEYQrjVDRkyBAsy7r8VrhwYdq2bcuuXbsA6NKlC0FBQSn+eXfs2MGQ\nIUM4depUin9sERFJeR67tMlb5MiRg2XLlgGwe/duBg0aRHBwMFu3bmXQoEFERESk+OfcsWMHr7/+\nOl26dCFnzpwp/vFFRCRlXTeMLcv6FGgBHLFt+55kXreA94BmwAWgi23bv6R0oWlVhgwZqFGjBgA1\natSgWLFi1K1blyVLltCuXTuHqxMREU9wI8PUU4Cm13j9AaDkpbfuwIe3X5b3qlKlCgB79+5NNEy9\nZ88eLMti8eLFie6PjY2lYMGCDBw48HLb6tWrqV69OgEBARQoUIBevXpx7tw5ANauXcuDDz4IQIkS\nJbAsi+LFi7vhKxMR8R4XL8KuRdvd9vmuG8a2ba8HTlzjllbA57axAchpWVahlCrQ2+zduxeAggUL\nJmovUaIE1apVY/bs2Yna161bx+HDhwkLCwNg69atNG3alLx58/Lll1/y+uuvM2PGDB5++GEAKleu\nzOjRowGYN28eP/74I/Pnz0/lr0pExHtE7D7EhhJhlHiwPH9N3eCWz5kSz4wDgX8TXO+/1HYoBT52\nIp6wlNW2b/6/iYmJAcwz4169epEtWzZCQkJYtWpVovvCwsJ4/fXXiYyMJGPGjACEh4dTvnx57rnH\nPCEYNmwYd9xxBwsXLsTX1xeA3LlzExoayo8//kjNmjUpXbo0AJUqVVKvWETkRsXFEfX+x1x8YQBz\n4s5SHIju1oP/Gm6iYFG/VP3Ubp1NbVlWd8uyNlmWteno0aPu/NSOOX78OH5+fvj5+VG6dGl2795N\neHg4hQolHTxo3749Z86cuTzhKyYmhnnz5hEaGnr5no0bN/LQQw9dDmKAtm3bkiFDBr777rvU/4JE\nRLzR//5HbI1a+D/Xi4FxZ5kArAFiy99LwRwpP9H2SinRMz4AFE1wXeRSWxK2bU8EJgIEBQXdQh8z\n7cmRIwcrV67EsiwKFixI4cKFr7pbVWBgIHXq1CE8PJxWrVqxatUqjh07dnmIGuDQoUMUKFAg0X/n\n6+tLnjx5OHHiWk8TREQkiV27YMgQ7OnT8b009NkfuIMC+D0+g3s/beiWMlKiZ7wQ6GwZNYDTtm2n\n+BA1mCFip99uVoYMGQgKCqJKlSoEBgZed9vI0NBQvv76ayIiIggPD6dSpUqULFny8uuFChVKciBD\nbGwsx48fJ3fu3DdfoIhIerR/P/ToAWXKwBdfcMG2GQ1E4MdnDMYaupdH3RTEcANhbFnWTOBHoLRl\nWfsty+pmWdZTlmU9demWJcBuYCcwCeiVatWmA+3atSMiIoL58+czf/78RL1igOrVqzN//nxiY2Mv\nt82bN4+YmBjq1KkDgL+/PwAXL150X+EiImnBjh3w5JNw110wcSLExHAWsyxoAFCaz8kycggvDgpw\na1nXHaa2bbvDdV63gd4pVlE6lz9/fho0aEC/fv04deoU7du3T/T6wIEDqVSpEq1bt6Znz57s37+f\nAQMG0KRJE2rWrAlweQLXxx9/TFhYGJkzZ6ZChQpu/1pERDyCbcOGDTBmDMybl2iY8xRQm2xs4wIw\nnefGhPLCC+4vUdtheqCwsDAOHTpEjRo1ksyGLl++PEuXLuXIkSO0adOGgQMH0qFDB+bOnXv5njvu\nuIPRo0czb948ateufXndsYhIuhEdDStXQp8+cMcdUKsWfPlloiD+o0gQd1CKbVwE5jBggDNBDGDZ\nt/IgNAUEBQXZmzZtuurr27dvp2zZsm6sSG6X/s5ExFG2DT/+CNOmQXg4nDyZ/H3NmrGq6kuEvG4B\nDwLTCAtrwfTp4JOKXVTLsjbbtp3sgQTam1pERNK2Cxdg7FiYMsXMjk5OrlzQqhU8/zxLD5SmdeuM\nl17YQ716OZkyJXWD+HoUxiIiknbt3w8tW8KvvyZ9rWhReOghaN0a6tQBPz8WLDhImzaVsO0Xgccp\nUyYn8+dDxoxJ/3N3UhiLiEja9NNPJmj/+8/VliMHtGsHnTtD7dqJurtff/0vbdo0xLb/A+4mMBCW\nLgVPWBWqMBYRkbRnxgzo2hUiI811hgwwerRZOxyQdFnS4sV7aN26IbZ9AlhOgQI1Wb0aPGXHYIWx\niIikDUePwqxZZoLWzz+72nPnhrlz4f77k/3PVq48TcuW9YmLOwesIk+eIFauhFKl3FP2jVAYi4iI\n5zp2DBYtMsuSli2DSwfvXFa2LCxcCHffneQ/tW2YMAGefz4HcXEvAA3ImfM+VqyAS2fveAyFsYiI\neJbdu+Grr2DBAvjuO4iLS3qPnx906ADjxpnnxFc4fx7CwrayaNEFoCrwHLlymWfElSql+ldw0xTG\nIiLiLNuG336D+fNNAG/ZcvV7a9WCTp2gffurzrzauxcaNfofO3eGAPmB36lc2Ze5c6FEidT4Am6f\nwlhERJxz8CB07w6LFyf/umVBzZpm1nSbNmZP6WvYvBkaN97EiRONgSzAArp182X8+GTndXkMbYeZ\nioYMGYJlWZffChcuTNu2bdl1tUXpKcyyLMaPH++WzyUiclNsG6ZPNw9vrwzijBmheXOYNAkOHYLv\nv4cXX7xuEC9dCnXqbODEiWAgB35+65k0qSSTJ3t2EIN6xqkuR44cLFu2DIDdu3czaNAggoOD2bp1\nK1myZHG4OhERBxw+DD17mmHphEJDzRrhJk0ga9ab+pCffmo62LGxHwD5yJ59NV9/XYx69VKu7NSk\nME5lGTJkoEaNGgDUqFGDYsWKUbduXZYsWUK7du0crk5E
[... remainder of base64-encoded PNG output omitted; the rendered figure shows the empirical CDFs of `pivot` and `pvalue` plotted against the diagonal reference ...]\n",
      "text/plain": [
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "results = pd.concat(dfs)\n", + "import statsmodels.api as sm\n", + "grid = np.linspace(0, 1, 101)\n", + "fig = plt.figure(figsize=(8, 8))\n", + "plt.plot(grid, sm.distributions.ECDF(results['pivot'])(grid), 'b-', linewidth=3, label='Pivot')\n", + "plt.plot(grid, sm.distributions.ECDF(results['pvalue'])(grid), 'r-', linewidth=3, label='P-value')\n", + "plt.plot([0, 1], [0, 1], 'k--')\n", + "plt.legend(fontsize=15);" + ] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "all,-slideshow", + "formats": "ipynb,Rmd" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/doc/source/algorithms/index.rst b/doc/source/algorithms/index.rst index 245c9e2eb..1f08e567d 100644 --- a/doc/source/algorithms/index.rst +++ b/doc/source/algorithms/index.rst @@ -10,3 +10,5 @@ post-selection inference. covtest.ipynb spacings + LASSO.ipynb + ROSI.ipynb \ No newline at end of file diff --git a/selectinf/algorithms/api.py b/selectinf/algorithms/api.py index cf5391f1c..f15caa897 100644 --- a/selectinf/algorithms/api.py +++ b/selectinf/algorithms/api.py @@ -1,4 +1,5 @@ from .lasso import (lasso, + ROSI, data_carving as data_carving_lasso, additive_noise as additive_noise_lasso) From 93636b1bc4c6b7a5c1160de67b2812f15ff7eb8a Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 25 Sep 2019 09:59:25 -0700 Subject: [PATCH 006/187] updating install instructions --- doc/source/download.rst | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/doc/source/download.rst b/doc/source/download.rst index 6aef2651a..5858ba0ee 100644 --- a/doc/source/download.rst +++ b/doc/source/download.rst @@ -17,13 +17,20 @@ Selection depends on the following Python tools * `Pandas `_ +The package can be installed via pip + + pip install selectinf + +Development +~~~~~~~~~~~ + You can clone the selection repo using:: git clone https://github.com/selective-inference/Python-software.git Then installation is a simple call to python:: - cd selection + cd selectinf git submodule update --init pip install -r requirements.txt python setup.py install --prefix=MYDIR @@ -41,3 +48,10 @@ There is a small but growing suite of tests that be easily checked using `nose < cd tmp nosetests -v selectinf +Building documentation +---------------------- + + cd doc + make html + +To upload a fresh build of the documentation to your :code:`gh-pages` branch, use :code:`make github`. 
From 6dc72a49fb128261ee1ff55edb5ee60b63708ee2 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 25 Sep 2019 10:03:24 -0700 Subject: [PATCH 007/187] change of title --- doc/source/algorithms/LASSO.Rmd | 2 +- doc/source/algorithms/LASSO.ipynb | 2 +- doc/source/algorithms/ROSI.Rmd | 2 +- doc/source/algorithms/ROSI.ipynb | 2 +- doc/source/algorithms/covtest.ipynb | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/source/algorithms/LASSO.Rmd b/doc/source/algorithms/LASSO.Rmd index 770d31eda..c0ad171f7 100644 --- a/doc/source/algorithms/LASSO.Rmd +++ b/doc/source/algorithms/LASSO.Rmd @@ -14,7 +14,7 @@ jupyter: name: python3 --- -# Conditioning on signs and active set +# LASSO when conditioning on signs and active set One of the first works in this line of conditional inference is [Lee et al.](projecteuclid.org/euclid.aos/1460381681) which diff --git a/doc/source/algorithms/LASSO.ipynb b/doc/source/algorithms/LASSO.ipynb index 7e505805f..8c15520d4 100644 --- a/doc/source/algorithms/LASSO.ipynb +++ b/doc/source/algorithms/LASSO.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Conditioning on signs and active set\n", + "# LASSO when conditioning on signs and active set\n", "\n", "One of the first works in this line of conditional inference\n", "is [Lee et al.](projecteuclid.org/euclid.aos/1460381681) which\n", diff --git a/doc/source/algorithms/ROSI.Rmd b/doc/source/algorithms/ROSI.Rmd index b53c6a8c5..9ed0517e1 100644 --- a/doc/source/algorithms/ROSI.Rmd +++ b/doc/source/algorithms/ROSI.Rmd @@ -14,7 +14,7 @@ jupyter: name: python3 --- -# Conditioning on less: ROSI +# LASSO when conditioning on less: ROSI Instead of conditioning on the active set and signs, one can work in the full model and for each feature $j$ selected diff --git a/doc/source/algorithms/ROSI.ipynb b/doc/source/algorithms/ROSI.ipynb index 0ca401472..11996ef3d 100644 --- a/doc/source/algorithms/ROSI.ipynb +++ b/doc/source/algorithms/ROSI.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Conditioning on less: ROSI\n", + "# LASSO when conditioning on less: ROSI\n", "\n", "Instead of conditioning on the active set and signs, \n", "one can work in the full model and for each feature $j$ selected\n", diff --git a/doc/source/algorithms/covtest.ipynb b/doc/source/algorithms/covtest.ipynb index 13ec59bfe..7a2aa98ed 100644 --- a/doc/source/algorithms/covtest.ipynb +++ b/doc/source/algorithms/covtest.ipynb @@ -416,5 +416,5 @@ } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 2 } From f3c5a36e380ed37a5fed4866b7a4e7a79e3f2d6a Mon Sep 17 00:00:00 2001 From: jonathan-taylor Date: Thu, 21 Nov 2019 12:28:59 -0800 Subject: [PATCH 008/187] updates to python scripts to selectinf --- doc/learning_examples/BH/gbm_targets_BH.py | 57 +++++-- .../BH/gbm_targets_BH_single.py | 119 ++++++++++++++ .../BH/gbm_targets_BH_single_5000.py | 119 ++++++++++++++ .../BH/logit_targets_BH_single_5000.py | 120 ++++++++++++++ .../knockoffs/knockoff_followup.py | 106 +++++++------ .../knockoffs/knockoff_followup_6000.py | 141 +++++++++++++++++ .../knockoffs/knockoff_kernel.py | 1 - .../knockoffs/knockoff_kernel_multi.py | 53 +++++-- .../knockoffs/knockoff_kernel_multi_5000.py | 117 ++++++++++++++ .../knockoffs/knockoff_kernel_multi_8000.py | 117 ++++++++++++++ .../knockoffs/knockoff_kernel_multi_gbm.py | 90 +++++++++++ .../multi_target/lasso_multi.py | 120 ++++++++++++++ ..._example_multi_CV.py => lasso_multi_CV.py} | 12 +- ...lti_bigger.py => lasso_multi_CV_bigger.py} | 
46 +++--- .../multi_target/lasso_multi_CV_gbm.py | 84 ++++++++++ .../multi_target/lasso_multi_CV_split.py | 149 ++++++++++++++++++ .../multi_target/lasso_multi_bigger.py | 135 ++++++++++++++++ .../multi_target/lasso_multi_logit.py | 134 ++++++++++++++++ .../multi_target/lee_multi.py | 18 ++- .../stability/stability_selection_harder.py | 12 +- .../stability_selection_harder_5000.py | 102 ++++++++++++ .../stability_selection_harder_big.py | 12 +- 22 files changed, 1735 insertions(+), 129 deletions(-) create mode 100644 doc/learning_examples/BH/gbm_targets_BH_single.py create mode 100644 doc/learning_examples/BH/gbm_targets_BH_single_5000.py create mode 100644 doc/learning_examples/BH/logit_targets_BH_single_5000.py create mode 100644 doc/learning_examples/knockoffs/knockoff_followup_6000.py create mode 100644 doc/learning_examples/knockoffs/knockoff_kernel_multi_5000.py create mode 100644 doc/learning_examples/knockoffs/knockoff_kernel_multi_8000.py create mode 100644 doc/learning_examples/knockoffs/knockoff_kernel_multi_gbm.py create mode 100644 doc/learning_examples/multi_target/lasso_multi.py rename doc/learning_examples/multi_target/{lasso_example_multi_CV.py => lasso_multi_CV.py} (85%) rename doc/learning_examples/multi_target/{lasso_example_multi_bigger.py => lasso_multi_CV_bigger.py} (60%) create mode 100644 doc/learning_examples/multi_target/lasso_multi_CV_gbm.py create mode 100644 doc/learning_examples/multi_target/lasso_multi_CV_split.py create mode 100644 doc/learning_examples/multi_target/lasso_multi_bigger.py create mode 100644 doc/learning_examples/multi_target/lasso_multi_logit.py create mode 100644 doc/learning_examples/stability/stability_selection_harder_5000.py diff --git a/doc/learning_examples/BH/gbm_targets_BH.py b/doc/learning_examples/BH/gbm_targets_BH.py index 7d107c109..f9fd6150b 100644 --- a/doc/learning_examples/BH/gbm_targets_BH.py +++ b/doc/learning_examples/BH/gbm_targets_BH.py @@ -5,11 +5,11 @@ import regreg.api as rr -from selection.tests.instance import gaussian_instance +from selectinf.tests.instance import gaussian_instance -from selection.learning.utils import full_model_inference, pivot_plot -from selection.learning.core import normal_sampler, gbm_fit_sk -from selection.learning.learners import mixture_learner +from selectinf.learning.utils import full_model_inference, pivot_plot +from selectinf.learning.core import normal_sampler, gbm_fit_sk +from selectinf.learning.learners import mixture_learner mixture_learner.scales = [1]*10 + [1.5,2,3,4,5,10] def BHfilter(pval, q=0.2): @@ -22,9 +22,7 @@ def BHfilter(pval, q=0.2): return np.nonzero(pval <= thresh)[0] return [] -def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=1000): - - # description of statistical problem +def generate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, **ignored): X, y, truth = gaussian_instance(n=n, p=p, @@ -36,6 +34,23 @@ def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=1000): random_signs=True, scale=False)[:3] + return X, y, truth + +def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=3000): + + # description of statistical problem + + X, y, truth = generate(n=n, + p=p, + s=s, + equicorrelated=False, + rho=0.5, + sigma=sigma, + signal=signal, + random_signs=True, + scale=False)[:3] + + XTX = X.T.dot(X) XTXi = np.linalg.inv(XTX) resid = y - X.dot(XTXi.dot(X.T.dot(y))) @@ -81,19 +96,35 @@ def meta_algorithm(XTX, XTXi, dispersion, lam, sampler): import matplotlib.pyplot as plt import pandas as pd - for i in range(500): - 
df = simulate(B=40000) - csvfile = 'gbm_targets_BH.csv' + U = np.linspace(0, 1, 101) + plt.clf() + + opts = dict(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=20000) + + R2 = [] + for _ in range(100): + + X, y, truth = generate(**opts) + R2.append((np.linalg.norm(y-X.dot(truth))**2, np.linalg.norm(y)**2)) + + R2 = np.array(R2) + R2mean = 1 - np.mean(R2[:,0]) / np.mean(R2[:,1]) + print('R2', R2mean) + + for i in range(5000): + df = simulate(**opts) + csvfile = __file__[:-3] + '.csv' outbase = csvfile[:-4] - if df is not None and i > 0: + if df is not None: - try: # concatenate to disk + try: df = pd.concat([df, pd.read_csv(csvfile)]) except FileNotFoundError: pass df.to_csv(csvfile, index=False) if len(df['pivot']) > 0: - pivot_ax, length_ax = pivot_plot(df, outbase) + f = pivot_plot(df, outbase)[1] + plt.close(f) diff --git a/doc/learning_examples/BH/gbm_targets_BH_single.py b/doc/learning_examples/BH/gbm_targets_BH_single.py new file mode 100644 index 000000000..bc13e149d --- /dev/null +++ b/doc/learning_examples/BH/gbm_targets_BH_single.py @@ -0,0 +1,119 @@ +import functools + +import numpy as np +from scipy.stats import norm as ndist + +import regreg.api as rr + +from selectinf.tests.instance import gaussian_instance + +from selectinf.learning.utils import full_model_inference, pivot_plot +from selectinf.learning.core import normal_sampler, gbm_fit_sk +from selectinf.learning.learners import mixture_learner +mixture_learner.scales = [1]*10 + [1.5,2,3,4,5,10] + +def BHfilter(pval, q=0.2): + pval = np.asarray(pval) + pval_sort = np.sort(pval) + comparison = q * np.arange(1, pval.shape[0] + 1.) / pval.shape[0] + passing = pval_sort < comparison + if passing.sum(): + thresh = comparison[np.nonzero(passing)[0].max()] + return np.nonzero(pval <= thresh)[0] + return [] + +def generate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, **ignored): + + X, y, truth = gaussian_instance(n=n, + p=p, + s=s, + equicorrelated=False, + rho=0.5, + sigma=sigma, + signal=signal, + random_signs=True, + scale=False)[:3] + + return X, y, truth + +def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=1000): + + # description of statistical problem + + X, y, truth = generate(n=n, p=p, s=s, signal=signal, sigma=sigma) + + XTX = X.T.dot(X) + XTXi = np.linalg.inv(XTX) + resid = y - X.dot(XTXi.dot(X.T.dot(y))) + dispersion = np.linalg.norm(resid)**2 / (n-p) + + S = X.T.dot(y) + covS = dispersion * X.T.dot(X) + smooth_sampler = normal_sampler(S, covS) + + def meta_algorithm(XTX, XTXi, dispersion, lam, sampler): + global counter + p = XTX.shape[0] + success = np.zeros(p) + + loss = rr.quadratic_loss((p,), Q=XTX) + pen = rr.l1norm(p, lagrange=lam) + + scale = 0. + noisy_S = sampler(scale=scale) + soln = XTXi.dot(noisy_S) + solnZ = soln / (np.sqrt(np.diag(XTXi)) * np.sqrt(dispersion)) + pval = ndist.cdf(solnZ) + pval = 2 * np.minimum(pval, 1 - pval) + return set(BHfilter(pval, q=0.2)) + + lam = 4. 
* np.sqrt(n) + selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, dispersion, lam) + + # run selection algorithm + + return full_model_inference(X, + y, + truth, + selection_algorithm, + smooth_sampler, + success_params=(1, 1), + B=B, + fit_probability=gbm_fit_sk, + fit_args={'n_estimators':500}, + how_many=1) + +if __name__ == "__main__": + import statsmodels.api as sm + import matplotlib.pyplot as plt + import pandas as pd + + opts = dict(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=2000) + + R2 = [] + for _ in range(100): + + X, y, truth = generate(**opts) + R2.append((np.linalg.norm(y-X.dot(truth))**2, np.linalg.norm(y)**2)) + + R2 = np.array(R2) + R2mean = 1 - np.mean(R2[:,0]) / np.mean(R2[:,1]) + print('R2', R2mean) + + for i in range(5000): + df = simulate(**opts) + csvfile = __file__[:-3] + '.csv' + outbase = csvfile[:-4] + + if df is not None and i > 0: + + try: # concatenate to disk + df = pd.concat([df, pd.read_csv(csvfile)]) + except FileNotFoundError: + pass + df.to_csv(csvfile, index=False) + df['R2'] = np.ones(df.shape[0]) * R2mean + if len(df['pivot']) > 0: + f = pivot_plot(df, outbase)[1] + plt.close(f) + diff --git a/doc/learning_examples/BH/gbm_targets_BH_single_5000.py b/doc/learning_examples/BH/gbm_targets_BH_single_5000.py new file mode 100644 index 000000000..97891ef2e --- /dev/null +++ b/doc/learning_examples/BH/gbm_targets_BH_single_5000.py @@ -0,0 +1,119 @@ +import functools + +import numpy as np +from scipy.stats import norm as ndist + +import regreg.api as rr + +from selectinf.tests.instance import gaussian_instance + +from selectinf.learning.utils import full_model_inference, pivot_plot +from selectinf.learning.core import normal_sampler, gbm_fit_sk +from selectinf.learning.learners import mixture_learner +mixture_learner.scales = [1]*10 + [1.5,2,3,4,5,10] + +def BHfilter(pval, q=0.2): + pval = np.asarray(pval) + pval_sort = np.sort(pval) + comparison = q * np.arange(1, pval.shape[0] + 1.) / pval.shape[0] + passing = pval_sort < comparison + if passing.sum(): + thresh = comparison[np.nonzero(passing)[0].max()] + return np.nonzero(pval <= thresh)[0] + return [] + +def generate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, **ignored): + + X, y, truth = gaussian_instance(n=n, + p=p, + s=s, + equicorrelated=False, + rho=0.5, + sigma=sigma, + signal=signal, + random_signs=True, + scale=False)[:3] + + return X, y, truth + +def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=1000): + + # description of statistical problem + + X, y, truth = generate(n=n, p=p, s=s, signal=signal, sigma=sigma) + + XTX = X.T.dot(X) + XTXi = np.linalg.inv(XTX) + resid = y - X.dot(XTXi.dot(X.T.dot(y))) + dispersion = np.linalg.norm(resid)**2 / (n-p) + + S = X.T.dot(y) + covS = dispersion * X.T.dot(X) + smooth_sampler = normal_sampler(S, covS) + + def meta_algorithm(XTX, XTXi, dispersion, lam, sampler): + global counter + p = XTX.shape[0] + success = np.zeros(p) + + loss = rr.quadratic_loss((p,), Q=XTX) + pen = rr.l1norm(p, lagrange=lam) + + scale = 0. + noisy_S = sampler(scale=scale) + soln = XTXi.dot(noisy_S) + solnZ = soln / (np.sqrt(np.diag(XTXi)) * np.sqrt(dispersion)) + pval = ndist.cdf(solnZ) + pval = 2 * np.minimum(pval, 1 - pval) + return set(BHfilter(pval, q=0.2)) + + lam = 4. 
* np.sqrt(n) + selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, dispersion, lam) + + # run selection algorithm + + return full_model_inference(X, + y, + truth, + selection_algorithm, + smooth_sampler, + success_params=(1, 1), + B=B, + fit_probability=gbm_fit_sk, + fit_args={'n_estimators':500}, + how_many=1) + +if __name__ == "__main__": + import statsmodels.api as sm + import matplotlib.pyplot as plt + import pandas as pd + + opts = dict(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=5000) + + R2 = [] + for _ in range(100): + + X, y, truth = generate(**opts) + R2.append((np.linalg.norm(y-X.dot(truth))**2, np.linalg.norm(y)**2)) + + R2 = np.array(R2) + R2mean = 1 - np.mean(R2[:,0]) / np.mean(R2[:,1]) + print('R2', R2mean) + + for i in range(5000): + df = simulate(**opts) + csvfile = __file__[:-3] + '.csv' + outbase = csvfile[:-4] + + if df is not None and i > 0: + + try: # concatenate to disk + df = pd.concat([df, pd.read_csv(csvfile)]) + except FileNotFoundError: + pass + df.to_csv(csvfile, index=False) + df['R2'] = np.ones(df.shape[0]) * R2mean + if len(df['pivot']) > 0: + f = pivot_plot(df, outbase)[1] + plt.close(f) + diff --git a/doc/learning_examples/BH/logit_targets_BH_single_5000.py b/doc/learning_examples/BH/logit_targets_BH_single_5000.py new file mode 100644 index 000000000..48e9a57d6 --- /dev/null +++ b/doc/learning_examples/BH/logit_targets_BH_single_5000.py @@ -0,0 +1,120 @@ +import functools + +import numpy as np +from scipy.stats import norm as ndist + +import regreg.api as rr + +from selectinf.tests.instance import gaussian_instance + +from selectinf.learning.utils import full_model_inference, pivot_plot +from selectinf.learning.core import normal_sampler +from selectinf.learning.Rfitters import logit_fit +from selectinf.learning.learners import mixture_learner +mixture_learner.scales = [1]*10 + [1.5,2,3,4,5,10] + +def BHfilter(pval, q=0.2): + pval = np.asarray(pval) + pval_sort = np.sort(pval) + comparison = q * np.arange(1, pval.shape[0] + 1.) / pval.shape[0] + passing = pval_sort < comparison + if passing.sum(): + thresh = comparison[np.nonzero(passing)[0].max()] + return np.nonzero(pval <= thresh)[0] + return [] + +def generate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, **ignored): + + X, y, truth = gaussian_instance(n=n, + p=p, + s=s, + equicorrelated=False, + rho=0.5, + sigma=sigma, + signal=signal, + random_signs=True, + scale=False)[:3] + + return X, y, truth + +def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=1000): + + # description of statistical problem + + X, y, truth = generate(n=n, p=p, s=s, signal=signal, sigma=sigma) + + XTX = X.T.dot(X) + XTXi = np.linalg.inv(XTX) + resid = y - X.dot(XTXi.dot(X.T.dot(y))) + dispersion = np.linalg.norm(resid)**2 / (n-p) + + S = X.T.dot(y) + covS = dispersion * X.T.dot(X) + smooth_sampler = normal_sampler(S, covS) + + def meta_algorithm(XTX, XTXi, dispersion, lam, sampler): + global counter + p = XTX.shape[0] + success = np.zeros(p) + + loss = rr.quadratic_loss((p,), Q=XTX) + pen = rr.l1norm(p, lagrange=lam) + + scale = 0. + noisy_S = sampler(scale=scale) + soln = XTXi.dot(noisy_S) + solnZ = soln / (np.sqrt(np.diag(XTXi)) * np.sqrt(dispersion)) + pval = ndist.cdf(solnZ) + pval = 2 * np.minimum(pval, 1 - pval) + return set(BHfilter(pval, q=0.2)) + + lam = 4. 
* np.sqrt(n) + selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, dispersion, lam) + + # run selection algorithm + + return full_model_inference(X, + y, + truth, + selection_algorithm, + smooth_sampler, + success_params=(1, 1), + B=B, + fit_probability=logit_fit, + fit_args={'df':20}, + how_many=1) + +if __name__ == "__main__": + import statsmodels.api as sm + import matplotlib.pyplot as plt + import pandas as pd + + opts = dict(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=5000) + + R2 = [] + for _ in range(100): + + X, y, truth = generate(**opts) + R2.append((np.linalg.norm(y-X.dot(truth))**2, np.linalg.norm(y)**2)) + + R2 = np.array(R2) + R2mean = 1 - np.mean(R2[:,0]) / np.mean(R2[:,1]) + print('R2', R2mean) + + for i in range(5000): + df = simulate(**opts) + csvfile = __file__[:-3] + '.csv' + outbase = csvfile[:-4] + + if df is not None and i > 0: + + try: # concatenate to disk + df = pd.concat([df, pd.read_csv(csvfile)]) + except FileNotFoundError: + pass + df.to_csv(csvfile, index=False) + df['R2'] = np.ones(df.shape[0]) * R2mean + if len(df['pivot']) > 0: + f = pivot_plot(df, outbase)[1] + plt.close(f) + diff --git a/doc/learning_examples/knockoffs/knockoff_followup.py b/doc/learning_examples/knockoffs/knockoff_followup.py index 3978af5f3..a19fc6e3c 100644 --- a/doc/learning_examples/knockoffs/knockoff_followup.py +++ b/doc/learning_examples/knockoffs/knockoff_followup.py @@ -5,19 +5,16 @@ import regreg.api as rr -from selection.tests.instance import gaussian_instance +from selectinf.tests.instance import gaussian_instance -from selection.learning.Rutils import lasso_glmnet -from selection.learning.utils import (full_model_inference, - pivot_plot, - naive_full_model_inference) -from selection.learning.core import split_sampler, keras_fit +from selectinf.learning.Rutils import lasso_glmnet +from selectinf.learning.utils import (full_model_inference, + pivot_plot, + split_full_model_inference) +from selectinf.learning.core import normal_sampler, keras_fit -def simulate(n=400, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, seed=0): +def generate(n=2000, p=100, s=10, signal=(np.sqrt(2)*0.5, np.sqrt(2)*1), sigma=2, **ignored): - # description of statistical problem - - np.random.seed(seed) X, y, truth = gaussian_instance(n=n, p=p, s=s, @@ -26,8 +23,24 @@ def simulate(n=400, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, seed=0): sigma=sigma, signal=signal, random_signs=True, - scale=False, - center=False)[:3] + scale=False)[:3] + + return X, y, truth + +def simulate(n=2000, p=100, s=10, signal=(np.sqrt(2)*0.5, np.sqrt(2)*1), + sigma=2, alpha=0.1,B=3000): + + # description of statistical problem + + X, y, truth = generate(n=n, + p=p, + s=s, + equicorrelated=False, + rho=0.5, + sigma=sigma, + signal=signal, + random_signs=True, + scale=False)[:3] dispersion = sigma**2 @@ -35,12 +48,12 @@ def simulate(n=400, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, seed=0): covS = dispersion * X.T.dot(X) smooth_sampler = normal_sampler(S, covS) + def meta_algorithm(X, XTXi, resid, sampler): n, p = X.shape - idx = np.random.choice(np.arange(n), 200, replace=False) - + idx = np.random.choice(np.arange(n), int(n/2), replace=False) S = sampler(scale=0.) 
# deterministic with scale=0 ynew = X.dot(XTXi).dot(S) + resid # will be ok for n>p and non-degen X Xidx, yidx = X[idx], y[idx] @@ -66,45 +79,25 @@ def meta_algorithm(X, XTXi, resid, sampler): y, truth, selection_algorithm, - splitting_sampler, + smooth_sampler, success_params=(8, 10), B=B, fit_probability=keras_fit, - fit_args={'epochs':20, 'sizes':[100]*5, 'dropout':0., 'activation':'relu'}, - fit_args={'df':20}) + fit_args={'epochs':20, 'sizes':[100]*5, 'dropout':0., 'activation':'relu'}) if df is not None: + idx2 = np.random.choice(np.arange(n), int(n/2), replace=False) observed_set = list(df['variable']) - true_target = truth[observed_set] - - np.random.seed(seed) - X2, _, _ = gaussian_instance(n=n, - p=p, - s=s, - equicorrelated=False, - rho=0.5, - sigma=sigma, - signal=signal, - random_signs=True, - center=False, - scale=False)[:3] - stage_1 = np.random.choice(np.arange(n), 200, replace=False) - stage_2 = sorted(set(range(n)).difference(stage_1)) - X2 = X2[stage_2] - y2 = X2.dot(truth) + sigma * np.random.standard_normal(X2.shape[0]) - - XTXi_2 = np.linalg.inv(X2.T.dot(X2)) - resid2 = y2 - X2.dot(XTXi_2.dot(X2.T.dot(y2))) - dispersion_2 = np.linalg.norm(resid2)**2 / (X2.shape[0] - X2.shape[1]) - - naive_df = naive_full_model_inference(X2, - y2, - dispersion_2, + split_df = split_full_model_inference(X, + y, + idx2, + None, # ignored dispersion + truth, observed_set, alpha=alpha) - df = pd.merge(df, naive_df, on='variable') + df = pd.merge(df, split_df, on='variable') return df if __name__ == "__main__": @@ -112,13 +105,27 @@ def meta_algorithm(X, XTXi, resid, sampler): import matplotlib.pyplot as plt import pandas as pd - iseed = int(np.fabs(np.random.standard_normal() * 1000)) - for i in range(500): - df = simulate(seed=i + iseed) - csvfile = 'knockoff_followup.csv' + opts = dict(n=2000, p=100, s=10, + signal=(np.sqrt(2)*0.5, np.sqrt(2)*1), sigma=2, + alpha=0.1, B=3000) + + R2 = [] + for _ in range(100): + + X, y, truth = generate(**opts) + R2.append((np.linalg.norm(y-X.dot(truth))**2, np.linalg.norm(y)**2)) + + R2 = np.array(R2) + R2mean = 1 - np.mean(R2[:,0]) / np.mean(R2[:,1]) + print('R2', R2mean) + + + for i in range(5000): + df = simulate(**opts) + csvfile = __file__[:-3] + '_2000_idx.csv' outbase = csvfile[:-4] - if df is not None and i > 0: + if df is not None: try: df = pd.concat([df, pd.read_csv(csvfile)]) @@ -127,5 +134,6 @@ def meta_algorithm(X, XTXi, resid, sampler): df.to_csv(csvfile, index=False) if len(df['pivot']) > 0: - pivot_plot(df, outbase) + f = pivot_plot(df, outbase)[1] + plt.close(f) diff --git a/doc/learning_examples/knockoffs/knockoff_followup_6000.py b/doc/learning_examples/knockoffs/knockoff_followup_6000.py new file mode 100644 index 000000000..57a8d8649 --- /dev/null +++ b/doc/learning_examples/knockoffs/knockoff_followup_6000.py @@ -0,0 +1,141 @@ +import functools + +import numpy as np +from scipy.stats import norm as ndist + +import regreg.api as rr + +from selectinf.tests.instance import gaussian_instance + +from selectinf.learning.Rutils import lasso_glmnet +from selectinf.learning.utils import (full_model_inference, + pivot_plot, + split_full_model_inference) +from selectinf.learning.core import normal_sampler, keras_fit +from selectinf.learning.fitters import gbm_fit_sk + +def generate(n=2000, p=100, s=10, signal=(np.sqrt(2)*0.5, np.sqrt(2)*1), sigma=2, **ignored): + + X, y, truth = gaussian_instance(n=n, + p=p, + s=s, + equicorrelated=False, + rho=0.5, + sigma=sigma, + signal=signal, + random_signs=True, + scale=False)[:3] + + return X, y, 
truth + +def simulate(n=2000, p=100, s=10, signal=(np.sqrt(2)*0.5, np.sqrt(2)*1), + sigma=2, alpha=0.1,B=3000): + + # description of statistical problem + + X, y, truth = generate(n=n, + p=p, + s=s, + equicorrelated=False, + rho=0.5, + sigma=sigma, + signal=signal, + random_signs=True, + scale=False)[:3] + + dispersion = sigma**2 + + S = X.T.dot(y) + covS = dispersion * X.T.dot(X) + smooth_sampler = normal_sampler(S, covS) + + + def meta_algorithm(X, XTXi, resid, sampler): + + n, p = X.shape + idx = np.random.choice(np.arange(n), int(n/2), replace=False) + + S = sampler(scale=0.) # deterministic with scale=0 + ynew = X.dot(XTXi).dot(S) + resid # will be ok for n>p and non-degen X + Xidx, yidx = X[idx], y[idx] + rho = 0.8 + + Xnew = rho * Xidx + np.sqrt(1 - rho**2) * np.random.standard_normal(Xidx.shape) + + X_full = np.hstack([Xidx, Xnew]) + beta_full = np.linalg.pinv(X_full).dot(yidx) + winners = np.fabs(beta_full)[:p] > np.fabs(beta_full)[p:] + return set(np.nonzero(winners)[0]) + + XTX = X.T.dot(X) + XTXi = np.linalg.inv(XTX) + resid = y - X.dot(XTXi.dot(X.T.dot(y))) + dispersion = np.linalg.norm(resid)**2 / (n-p) + + selection_algorithm = functools.partial(meta_algorithm, X, XTXi, resid) + + # run selection algorithm + + df = full_model_inference(X, + y, + truth, + selection_algorithm, + smooth_sampler, + success_params=(8, 10), + B=B, + fit_probability=gbm_fit_sk, + fit_args={'n_estimators':1000} + ) + + if df is not None: + + observed_set = list(df['variable']) + idx2 = np.random.choice(np.arange(n), int(n/2), replace=False) + split_df = split_full_model_inference(X, + y, + idx2, + None, # ignored dispersion + truth, + observed_set, + alpha=alpha) + + df = pd.merge(df, split_df, on='variable') + return df + +if __name__ == "__main__": + import statsmodels.api as sm + import matplotlib.pyplot as plt + import pandas as pd + + opts = dict(n=2000, p=100, s=10, + signal=(np.sqrt(2)*0.5, np.sqrt(2)*1), sigma=2, + alpha=0.1, B=6000) + + R2 = [] + for _ in range(100): + + X, y, truth = generate(**opts) + R2.append((np.linalg.norm(y-X.dot(truth))**2, np.linalg.norm(y)**2)) + + R2 = np.array(R2) + R2mean = 1 - np.mean(R2[:,0]) / np.mean(R2[:,1]) + print('R2', R2mean) + + + for i in range(5000): + df = simulate(**opts) + csvfile = __file__[:-3] + '_gbm.csv' + outbase = csvfile[:-4] + + if df is not None: + + try: + df = pd.concat([df, pd.read_csv(csvfile)]) + except FileNotFoundError: + pass + df.to_csv(csvfile, index=False) + + if len(df['pivot']) > 0: + f = pivot_plot(df, outbase)[1] + plt.close(f) + diff --git a/doc/learning_examples/knockoffs/knockoff_kernel.py b/doc/learning_examples/knockoffs/knockoff_kernel.py index 1ac91d8c7..d979566a9 100644 --- a/doc/learning_examples/knockoffs/knockoff_kernel.py +++ b/doc/learning_examples/knockoffs/knockoff_kernel.py @@ -14,7 +14,6 @@ def simulate(n=1000, p=50, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, seed=0, B= # description of statistical problem - np.random.seed(seed) X, y, truth = gaussian_instance(n=n, p=p, s=s, diff --git a/doc/learning_examples/knockoffs/knockoff_kernel_multi.py b/doc/learning_examples/knockoffs/knockoff_kernel_multi.py index a6e438cdd..2fdac03b5 100644 --- a/doc/learning_examples/knockoffs/knockoff_kernel_multi.py +++ b/doc/learning_examples/knockoffs/knockoff_kernel_multi.py @@ -5,16 +5,13 @@ import regreg.api as rr -from selection.tests.instance import gaussian_instance +from selectinf.tests.instance import gaussian_instance -from selection.learning.utils import full_model_inference, pivot_plot -from 
selection.learning.core import normal_sampler, keras_fit +from selectinf.learning.utils import full_model_inference, pivot_plot +from selectinf.learning.core import normal_sampler, keras_fit -def simulate(n=1000, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, seed=0, B=5000): +def generate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, **ignored): - # description of statistical problem - - np.random.seed(seed) X, y, truth = gaussian_instance(n=n, p=p, s=s, @@ -23,8 +20,23 @@ def simulate(n=1000, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, seed=0, B sigma=sigma, signal=signal, random_signs=True, - scale=False, - center=False)[:3] + scale=False)[:3] + + return X, y, truth + +def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=3000): + + # description of statistical problem + + X, y, truth = generate(n=n, + p=p, + s=s, + equicorrelated=False, + rho=0.5, + sigma=sigma, + signal=signal, + random_signs=True, + scale=False)[:3] dispersion = sigma**2 @@ -71,10 +83,23 @@ def meta_algorithm(X, XTXi, resid, sampler): import matplotlib.pyplot as plt import pandas as pd + opts = dict(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=3000) + + R2 = [] + for _ in range(100): + + X, y, truth = generate(**opts) + R2.append((np.linalg.norm(y-X.dot(truth))**2, np.linalg.norm(y)**2)) + + R2 = np.array(R2) + R2mean = 1 - np.mean(R2[:,0]) / np.mean(R2[:,1]) + print('R2', R2mean) + + iseed = int(np.fabs(np.random.standard_normal() * 50000)) - for i in range(500): - df = simulate(seed=i + iseed, B=3000) - csvfile = 'knockoff_kernel_multi.csv' + for i in range(2000): + df = simulate(**opts) + csvfile = __file__[:-3] + '_200.csv' outbase = csvfile[:-4] if df is not None and i > 0: @@ -86,6 +111,6 @@ def meta_algorithm(X, XTXi, resid, sampler): df.to_csv(csvfile, index=False) if len(df['pivot']) > 0: - pivot_ax, length_ax = pivot_plot(df, outbase) - + f = pivot_plot(df, outbase)[1] + plt.close(f) diff --git a/doc/learning_examples/knockoffs/knockoff_kernel_multi_5000.py b/doc/learning_examples/knockoffs/knockoff_kernel_multi_5000.py new file mode 100644 index 000000000..031cc0fb5 --- /dev/null +++ b/doc/learning_examples/knockoffs/knockoff_kernel_multi_5000.py @@ -0,0 +1,117 @@ +import functools + +import numpy as np +from scipy.stats import norm as ndist + +import regreg.api as rr + +from selectinf.tests.instance import gaussian_instance + +from selectinf.learning.utils import full_model_inference, pivot_plot +from selectinf.learning.core import normal_sampler, keras_fit + +def generate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, **ignored): + + X, y, truth = gaussian_instance(n=n, + p=p, + s=s, + equicorrelated=False, + rho=0.5, + sigma=sigma, + signal=signal, + random_signs=True, + scale=False)[:3] + + return X, y, truth + +def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=3000): + + # description of statistical problem + + X, y, truth = generate(n=n, + p=p, + s=s, + equicorrelated=False, + rho=0.5, + sigma=sigma, + signal=signal, + random_signs=True, + scale=False)[:3] + + dispersion = sigma**2 + + S = X.T.dot(y) + covS = dispersion * X.T.dot(X) + smooth_sampler = normal_sampler(S, covS) + + def meta_algorithm(X, XTXi, resid, sampler): + + n, p = X.shape + + rho = 0.8 + S = sampler(scale=0.) 
# deterministic with scale=0 + ynew = X.dot(XTXi).dot(S) + resid # will be ok for n>p and non-degen X + Xnew = rho * X + np.sqrt(1 - rho**2) * np.random.standard_normal(X.shape) + + X_full = np.hstack([X, Xnew]) + beta_full = np.linalg.pinv(X_full).dot(ynew) + winners = np.fabs(beta_full)[:p] > np.fabs(beta_full)[p:] + return set(np.nonzero(winners)[0]) + + XTX = X.T.dot(X) + XTXi = np.linalg.inv(XTX) + resid = y - X.dot(XTXi.dot(X.T.dot(y))) + dispersion = np.linalg.norm(resid)**2 / (n-p) + + selection_algorithm = functools.partial(meta_algorithm, X, XTXi, resid) + + + # run selection algorithm + + return full_model_inference(X, + y, + truth, + selection_algorithm, + smooth_sampler, + success_params=(8, 10), + B=B, + fit_probability=keras_fit, + fit_args={'epochs':20, 'sizes':[100]*5, 'dropout':0., 'activation':'relu'}) + +if __name__ == "__main__": + import statsmodels.api as sm + import matplotlib.pyplot as plt + import pandas as pd + + opts = dict(n=200, p=100, s=10, signal=(0.5, 1), + sigma=2, alpha=0.1, B=5000) + + R2 = [] + for _ in range(100): + + X, y, truth = generate(**opts) + R2.append((np.linalg.norm(y-X.dot(truth))**2, np.linalg.norm(y)**2)) + + R2 = np.array(R2) + R2mean = 1 - np.mean(R2[:,0]) / np.mean(R2[:,1]) + print('R2', R2mean) + + + iseed = int(np.fabs(np.random.standard_normal() * 50000)) + for i in range(2000): + df = simulate(**opts) + csvfile = __file__[:-3] + '_200.csv' + outbase = csvfile[:-4] + + if df is not None and i > 0: + + try: # concatenate to disk + df = pd.concat([df, pd.read_csv(csvfile)]) + except FileNotFoundError: + pass + df.to_csv(csvfile, index=False) + + if len(df['pivot']) > 0: + f = pivot_plot(df, outbase)[1] + plt.close(f) + diff --git a/doc/learning_examples/knockoffs/knockoff_kernel_multi_8000.py b/doc/learning_examples/knockoffs/knockoff_kernel_multi_8000.py new file mode 100644 index 000000000..8b4035d26 --- /dev/null +++ b/doc/learning_examples/knockoffs/knockoff_kernel_multi_8000.py @@ -0,0 +1,117 @@ +import functools + +import numpy as np +from scipy.stats import norm as ndist + +import regreg.api as rr + +from selectinf.tests.instance import gaussian_instance + +from selectinf.learning.utils import full_model_inference, pivot_plot +from selectinf.learning.core import normal_sampler, keras_fit + +def generate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, **ignored): + + X, y, truth = gaussian_instance(n=n, + p=p, + s=s, + equicorrelated=False, + rho=0.5, + sigma=sigma, + signal=signal, + random_signs=True, + scale=False)[:3] + + return X, y, truth + +def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=3000): + + # description of statistical problem + + X, y, truth = generate(n=n, + p=p, + s=s, + equicorrelated=False, + rho=0.5, + sigma=sigma, + signal=signal, + random_signs=True, + scale=False)[:3] + + dispersion = sigma**2 + + S = X.T.dot(y) + covS = dispersion * X.T.dot(X) + smooth_sampler = normal_sampler(S, covS) + + def meta_algorithm(X, XTXi, resid, sampler): + + n, p = X.shape + + rho = 0.8 + S = sampler(scale=0.) 
# deterministic with scale=0 + ynew = X.dot(XTXi).dot(S) + resid # will be ok for n>p and non-degen X + Xnew = rho * X + np.sqrt(1 - rho**2) * np.random.standard_normal(X.shape) + + X_full = np.hstack([X, Xnew]) + beta_full = np.linalg.pinv(X_full).dot(ynew) + winners = np.fabs(beta_full)[:p] > np.fabs(beta_full)[p:] + return set(np.nonzero(winners)[0]) + + XTX = X.T.dot(X) + XTXi = np.linalg.inv(XTX) + resid = y - X.dot(XTXi.dot(X.T.dot(y))) + dispersion = np.linalg.norm(resid)**2 / (n-p) + + selection_algorithm = functools.partial(meta_algorithm, X, XTXi, resid) + + + # run selection algorithm + + return full_model_inference(X, + y, + truth, + selection_algorithm, + smooth_sampler, + success_params=(8, 10), + B=B, + fit_probability=keras_fit, + fit_args={'epochs':20, 'sizes':[100]*5, 'dropout':0., 'activation':'relu'}) + +if __name__ == "__main__": + import statsmodels.api as sm + import matplotlib.pyplot as plt + import pandas as pd + + opts = dict(n=2000, p=100, s=10, signal=(0.5, 1), + sigma=2, alpha=0.1, B=8000) + + R2 = [] + for _ in range(100): + + X, y, truth = generate(**opts) + R2.append((np.linalg.norm(y-X.dot(truth))**2, np.linalg.norm(y)**2)) + + R2 = np.array(R2) + R2mean = 1 - np.mean(R2[:,0]) / np.mean(R2[:,1]) + print('R2', R2mean) + + + iseed = int(np.fabs(np.random.standard_normal() * 50000)) + for i in range(2000): + df = simulate(**opts) + csvfile = __file__[:-3] + '_2000.csv' + outbase = csvfile[:-4] + + if df is not None and i > 0: + + try: # concatenate to disk + df = pd.concat([df, pd.read_csv(csvfile)]) + except FileNotFoundError: + pass + df.to_csv(csvfile, index=False) + + if len(df['pivot']) > 0: + f = pivot_plot(df, outbase)[1] + plt.close(f) + diff --git a/doc/learning_examples/knockoffs/knockoff_kernel_multi_gbm.py b/doc/learning_examples/knockoffs/knockoff_kernel_multi_gbm.py new file mode 100644 index 000000000..4f834ec7b --- /dev/null +++ b/doc/learning_examples/knockoffs/knockoff_kernel_multi_gbm.py @@ -0,0 +1,90 @@ +import functools + +import numpy as np +from scipy.stats import norm as ndist + +import regreg.api as rr + +from selectinf.tests.instance import gaussian_instance + +from selectinf.learning.utils import full_model_inference, pivot_plot +from selectinf.learning.core import normal_sampler, keras_fit, gbm_fit_sk + +def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, seed=0, B=3000): + + # description of statistical problem + + X, y, truth = gaussian_instance(n=n, + p=p, + s=s, + equicorrelated=False, + rho=0.5, + sigma=sigma, + signal=signal, + random_signs=True, + scale=False, + center=False)[:3] + + dispersion = sigma**2 + + S = X.T.dot(y) + covS = dispersion * X.T.dot(X) + smooth_sampler = normal_sampler(S, covS) + + def meta_algorithm(X, XTXi, resid, sampler): + + n, p = X.shape + + rho = 0.8 + S = sampler(scale=0.) 
# deterministic with scale=0 + ynew = X.dot(XTXi).dot(S) + resid # will be ok for n>p and non-degen X + Xnew = rho * X + np.sqrt(1 - rho**2) * np.random.standard_normal(X.shape) + + X_full = np.hstack([X, Xnew]) + beta_full = np.linalg.pinv(X_full).dot(ynew) + winners = np.fabs(beta_full)[:p] > np.fabs(beta_full)[p:] + return set(np.nonzero(winners)[0]) + + XTX = X.T.dot(X) + XTXi = np.linalg.inv(XTX) + resid = y - X.dot(XTXi.dot(X.T.dot(y))) + dispersion = np.linalg.norm(resid)**2 / (n-p) + + selection_algorithm = functools.partial(meta_algorithm, X, XTXi, resid) + + + # run selection algorithm + + return full_model_inference(X, + y, + truth, + selection_algorithm, + smooth_sampler, + success_params=(8, 10), + B=B, + fit_probability=gbm_fit_sk, + fit_args={'n_estimators':1000}) + +if __name__ == "__main__": + import statsmodels.api as sm + import matplotlib.pyplot as plt + import pandas as pd + + iseed = int(np.fabs(np.random.standard_normal() * 50000)) + for i in range(2000): + df = simulate(seed=i + iseed, B=3000) + csvfile = 'knockoff_kernel_multi_gbm.csv' + outbase = csvfile[:-4] + + if df is not None and i > 0: + + try: # concatenate to disk + df = pd.concat([df, pd.read_csv(csvfile)]) + except FileNotFoundError: + pass + df.to_csv(csvfile, index=False) + + if len(df['pivot']) > 0: + pivot_plot(df, outbase) + + diff --git a/doc/learning_examples/multi_target/lasso_multi.py b/doc/learning_examples/multi_target/lasso_multi.py new file mode 100644 index 000000000..ba3754c8b --- /dev/null +++ b/doc/learning_examples/multi_target/lasso_multi.py @@ -0,0 +1,120 @@ +import functools + +import numpy as np +from scipy.stats import norm as ndist + +import regreg.api as rr + +from selectinf.tests.instance import gaussian_instance + +from selectinf.learning.utils import full_model_inference, pivot_plot +from selectinf.learning.core import normal_sampler, keras_fit, gbm_fit_sk +from selectinf.learning.Rutils import lasso_glmnet + +def generate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, **ignored): + + X, y, truth = gaussian_instance(n=n, + p=p, + s=s, + equicorrelated=False, + rho=0.5, + sigma=sigma, + signal=signal, + random_signs=True, + scale=False)[:3] + + return X, y, truth + +def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=3000): + + # description of statistical problem + + X, y, truth = generate(n=n, + p=p, + s=s, + equicorrelated=False, + rho=0.5, + sigma=sigma, + signal=signal, + random_signs=True, + scale=False)[:3] + + dispersion = sigma**2 + + S = X.T.dot(y) + covS = dispersion * X.T.dot(X) + smooth_sampler = normal_sampler(S, covS) + + def meta_algorithm(X, XTXi, resid, lam, sampler): + p = XTX.shape[0] + success = np.zeros(p) + + loss = rr.quadratic_loss((p,), Q=XTX) + pen = rr.l1norm(p, lagrange=lam) + + scale = 0. + noisy_S = sampler(scale=scale) + loss.quadratic = rr.identity_quadratic(0, 0, -noisy_S, 0) + problem = rr.simple_problem(loss, pen) + soln = problem.solve(max_its=100, tol=1.e-10) + success += soln != 0 + return set(np.nonzero(success)[0]) + + XTX = X.T.dot(X) + XTXi = np.linalg.inv(XTX) + resid = y - X.dot(XTXi.dot(X.T.dot(y))) + dispersion = np.linalg.norm(resid)**2 / (n-p) + + lam = 4. 
* np.sqrt(n) + selection_algorithm = functools.partial(meta_algorithm, X, XTXi, resid, lam) + + # run selection algorithm + + return full_model_inference(X, + y, + truth, + selection_algorithm, + smooth_sampler, + success_params=(1, 1), + B=B, + fit_probability=keras_fit, + fit_args={'epochs':10, 'sizes':[100]*5, 'dropout':0., 'activation':'relu'}) + + +if __name__ == "__main__": + import statsmodels.api as sm + import matplotlib.pyplot as plt + import pandas as pd + + U = np.linspace(0, 1, 101) + plt.clf() + + opts = dict(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=2000) + + R2 = [] + for _ in range(100): + + X, y, truth = generate(**opts) + R2.append((np.linalg.norm(y-X.dot(truth))**2, np.linalg.norm(y)**2)) + + R2 = np.array(R2) + R2mean = 1 - np.mean(R2[:,0]) / np.mean(R2[:,1]) + print('R2', R2mean) + + for i in range(5000): + df = simulate(**opts) + csvfile = __file__[:-3] + '.csv' + outbase = csvfile[:-4] + + if df is not None: + + try: + df = pd.concat([df, pd.read_csv(csvfile)]) + except FileNotFoundError: + pass + df.to_csv(csvfile, index=False) + + if len(df['pivot']) > 0: + f = pivot_plot(df, outbase)[1] + plt.close(f) + diff --git a/doc/learning_examples/multi_target/lasso_example_multi_CV.py b/doc/learning_examples/multi_target/lasso_multi_CV.py similarity index 85% rename from doc/learning_examples/multi_target/lasso_example_multi_CV.py rename to doc/learning_examples/multi_target/lasso_multi_CV.py index 7daf55c83..14d407608 100644 --- a/doc/learning_examples/multi_target/lasso_example_multi_CV.py +++ b/doc/learning_examples/multi_target/lasso_multi_CV.py @@ -5,11 +5,11 @@ import regreg.api as rr -from selection.tests.instance import gaussian_instance +from selectinf.tests.instance import gaussian_instance -from selection.learning.utils import full_model_inference, pivot_plot -from selection.learning.core import split_sampler, keras_fit -from selection.learning.Rutils import lasso_glmnet +from selectinf.learning.utils import full_model_inference, pivot_plot +from selectinf.learning.core import split_sampler, keras_fit +from selectinf.learning.Rutils import lasso_glmnet def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=3000): @@ -48,6 +48,8 @@ def meta_algorithm(X, XTXi, resid, sampler): # run selection algorithm + print('SNR', np.linalg.norm(X.dot(truth)) / np.linalg.norm(y-X.dot(truth))) + print('R2', 1 - np.linalg.norm(y-X.dot(truth))**2 / np.linalg.norm(y)**2) return full_model_inference(X, y, truth, @@ -66,7 +68,7 @@ def meta_algorithm(X, XTXi, resid, sampler): U = np.linspace(0, 1, 101) plt.clf() - for i in range(500): + for i in range(1000): df = simulate() csvfile = 'lasso_multi_CV.csv' outbase = csvfile[:-4] diff --git a/doc/learning_examples/multi_target/lasso_example_multi_bigger.py b/doc/learning_examples/multi_target/lasso_multi_CV_bigger.py similarity index 60% rename from doc/learning_examples/multi_target/lasso_example_multi_bigger.py rename to doc/learning_examples/multi_target/lasso_multi_CV_bigger.py index 19cabbf6c..4f43caa7e 100644 --- a/doc/learning_examples/multi_target/lasso_example_multi_bigger.py +++ b/doc/learning_examples/multi_target/lasso_multi_CV_bigger.py @@ -5,10 +5,11 @@ import regreg.api as rr -from selection.tests.instance import gaussian_instance +from selectinf.tests.instance import gaussian_instance -from selection.learning.utils import full_model_inference, pivot_plot -from selection.learning.core import split_sampler, keras_fit +from selectinf.learning.utils import full_model_inference, pivot_plot 
+from selectinf.learning.core import normal_sampler, keras_fit, gbm_fit_sk +from selectinf.learning.Rutils import lasso_glmnet def simulate(n=2000, p=1000, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=4000): @@ -29,31 +30,22 @@ def simulate(n=2000, p=1000, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=4000): S = X.T.dot(y) covS = dispersion * X.T.dot(X) smooth_sampler = normal_sampler(S, covS) - splitting_sampler = split_sampler(X * y[:, None], covS) - def meta_algorithm(XTX, XTXi, lam, sampler): + def meta_algorithm(X, XTXi, resid, sampler): - p = XTX.shape[0] - success = np.zeros(p) - - loss = rr.quadratic_loss((p,), Q=XTX) - pen = rr.l1norm(p, lagrange=lam) - - scale = 0. - noisy_S = sampler(scale=scale) - loss.quadratic = rr.identity_quadratic(0, 0, -noisy_S, 0) - problem = rr.simple_problem(loss, pen) - soln = problem.solve(max_its=100, tol=1.e-10) - success += soln != 0 - return set(np.nonzero(success)[0]) + S = sampler(scale=0.) # deterministic with scale=0 + ynew = X.dot(XTXi).dot(S) + resid # will be ok for n>p and non-degen X + G = lasso_glmnet(X, ynew, *[None]*4) + select = G.select() + print(select) + return set(list(select[0])) XTX = X.T.dot(X) XTXi = np.linalg.inv(XTX) resid = y - X.dot(XTXi.dot(X.T.dot(y))) dispersion = np.linalg.norm(resid)**2 / (n-p) - lam = 5. * np.sqrt(n) - selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, lam) + selection_algorithm = functools.partial(meta_algorithm, X, XTXi, resid) # run selection algorithm @@ -61,11 +53,11 @@ def meta_algorithm(XTX, XTXi, lam, sampler): y, truth, selection_algorithm, - splitting_sampler, + smooth_sampler, success_params=(1, 1), B=B, - fit_probability=logit_fit, - fit_args={'df':20}) + fit_probability=keras_fit, + fit_args={'epochs':10, 'sizes':[100]*5, 'dropout':0., 'activation':'relu'}) if __name__ == "__main__": @@ -76,9 +68,9 @@ def meta_algorithm(XTX, XTXi, lam, sampler): U = np.linspace(0, 1, 101) plt.clf() - for i in range(500): - df = simulate(B=4000) - csvfile = 'lasso_multi_bigger.csv' + for i in range(2000): + df = simulate(B=3000) + csvfile = __file__[:-3] + '.csv' outbase = csvfile[:-4] if df is not None and i > 0: @@ -90,4 +82,4 @@ def meta_algorithm(XTX, XTXi, lam, sampler): df.to_csv(csvfile, index=False) if len(df['pivot']) > 0: - pivot_ax, length_ax = pivot_plot(df, outbase) + pivot_plot(df, outbase) diff --git a/doc/learning_examples/multi_target/lasso_multi_CV_gbm.py b/doc/learning_examples/multi_target/lasso_multi_CV_gbm.py new file mode 100644 index 000000000..73e4f14a8 --- /dev/null +++ b/doc/learning_examples/multi_target/lasso_multi_CV_gbm.py @@ -0,0 +1,84 @@ +import functools + +import numpy as np +from scipy.stats import norm as ndist + +import regreg.api as rr + +from selectinf.tests.instance import gaussian_instance + +from selectinf.learning.utils import full_model_inference, pivot_plot +from selectinf.learning.core import split_sampler, gbm_fit_sk +from selectinf.learning.Rutils import lasso_glmnet + +def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=3000): + + # description of statistical problem + + X, y, truth = gaussian_instance(n=n, + p=p, + s=s, + equicorrelated=False, + rho=0.5, + sigma=sigma, + signal=signal, + random_signs=True, + scale=False)[:3] + + dispersion = sigma**2 + + S = X.T.dot(y) + covS = dispersion * X.T.dot(X) + splitting_sampler = split_sampler(X * y[:, None], covS) + + def meta_algorithm(X, XTXi, resid, sampler): + + S = sampler(scale=0.) 
# deterministic with scale=0 + ynew = X.dot(XTXi).dot(S) + resid # will be ok for n>p and non-degen X + G = lasso_glmnet(X, ynew, *[None]*4) + select = G.select() + return set(list(select[0])) + + XTX = X.T.dot(X) + XTXi = np.linalg.inv(XTX) + resid = y - X.dot(XTXi.dot(X.T.dot(y))) + dispersion = np.linalg.norm(resid)**2 / (n-p) + + selection_algorithm = functools.partial(meta_algorithm, X, XTXi, resid) + + # run selection algorithm + + return full_model_inference(X, + y, + truth, + selection_algorithm, + splitting_sampler, + success_params=(1, 1), + B=B, + fit_probability=gbm_fit_sk, + fit_args={'n_estimators':1000}) + +if __name__ == "__main__": + import statsmodels.api as sm + import matplotlib.pyplot as plt + import pandas as pd + + U = np.linspace(0, 1, 101) + plt.clf() + + for i in range(2000): + df = simulate() + csvfile = 'lasso_multi_CV_gbm.csv' + outbase = csvfile[:-4] + + if df is not None: + + try: + df = pd.concat([df, pd.read_csv(csvfile)]) + except FileNotFoundError: + pass + df.to_csv(csvfile, index=False) + + if len(df['pivot']) > 0: + pivot_plot(df, outbase) + diff --git a/doc/learning_examples/multi_target/lasso_multi_CV_split.py b/doc/learning_examples/multi_target/lasso_multi_CV_split.py new file mode 100644 index 000000000..dfa17a801 --- /dev/null +++ b/doc/learning_examples/multi_target/lasso_multi_CV_split.py @@ -0,0 +1,149 @@ +import functools + +import numpy as np +from scipy.stats import norm as ndist + +import regreg.api as rr + +from selectinf.tests.instance import gaussian_instance + +from selectinf.learning.utils import full_model_inference, pivot_plot +from selectinf.learning.core import split_sampler, keras_fit +from selectinf.learning.Rutils import lasso_glmnet +from rpy2.robjects import numpy2ri +import rpy2.robjects as rpy + +class lasso_glmnet_split(lasso_glmnet): + + def select(self, CV=True, seed=0): + + numpy2ri.activate() + + rpy.r.assign('X', self.X.copy()) + rpy.r.assign('Y', self.Y.copy()) + rpy.r('X = as.matrix(X)') + rpy.r('Y = as.numeric(Y)') + rpy.r('n = nrow(X)') + rpy.r('split_ = sample(1:n, n/2, replace=FALSE)') + rpy.r('Xsplit_ = X[split_,]') + rpy.r('Ysplit_ = Y[split_]') + rpy.r('set.seed(%d)' % seed) + rpy.r('cvG = cv.glmnet(Xsplit_, Ysplit_, intercept=FALSE, standardize=FALSE)') + rpy.r("L1 = cvG[['lambda.min']]") + rpy.r("L2 = cvG[['lambda.1se']]") + if CV: + rpy.r("L = L1") + else: + rpy.r("L = 0.99 * L2") + rpy.r("G = glmnet(X, Y, intercept=FALSE, standardize=FALSE)") + n, p = self.X.shape + L = rpy.r('L') + rpy.r('B = as.numeric(coef(G, s=L, exact=TRUE, x=X, y=Y))[-1]') + B = np.asarray(rpy.r('B')) + selected = (B != 0) + numpy2ri.deactivate() + if selected.sum(): + V = np.nonzero(selected)[0] + return V, V + else: + return [], [] + + +def generate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, **ignored): + + X, y, truth = gaussian_instance(n=n, + p=p, + s=s, + equicorrelated=False, + rho=0.5, + sigma=sigma, + signal=signal, + random_signs=True, + scale=False)[:3] + + return X, y, truth + +def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=3000): + + # description of statistical problem + + X, y, truth = generate(n=n, + p=p, + s=s, + equicorrelated=False, + rho=0.5, + sigma=sigma, + signal=signal, + random_signs=True, + scale=False)[:3] + + dispersion = sigma**2 + + S = X.T.dot(y) + covS = dispersion * X.T.dot(X) + splitting_sampler = split_sampler(X * y[:, None], covS) + + def meta_algorithm(X, XTXi, resid, sampler): + + S = sampler(scale=0.) 
# deterministic with scale=0 + ynew = X.dot(XTXi).dot(S) + resid # will be ok for n>p and non-degen X + G = lasso_glmnet_split(X, ynew, *[None]*4) + select = G.select() + return set(list(select[0])) + + XTX = X.T.dot(X) + XTXi = np.linalg.inv(XTX) + resid = y - X.dot(XTXi.dot(X.T.dot(y))) + dispersion = np.linalg.norm(resid)**2 / (n-p) + + selection_algorithm = functools.partial(meta_algorithm, X, XTXi, resid) + + # run selection algorithm + + return full_model_inference(X, + y, + truth, + selection_algorithm, + splitting_sampler, + success_params=(1, 1), + B=B, + fit_probability=keras_fit, + fit_args={'epochs':10, 'sizes':[100]*5, 'dropout':0., 'activation':'relu'}) + +if __name__ == "__main__": + import statsmodels.api as sm + import matplotlib.pyplot as plt + import pandas as pd + + U = np.linspace(0, 1, 101) + plt.clf() + + opts = dict(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=2000) + + R2 = [] + for _ in range(100): + + X, y, truth = generate(**opts) + R2.append((np.linalg.norm(y-X.dot(truth))**2, np.linalg.norm(y)**2)) + + R2 = np.array(R2) + R2mean = 1 - np.mean(R2[:,0]) / np.mean(R2[:,1]) + print('R2', R2mean) + + for i in range(5000): + df = simulate(**opts) + csvfile = __file__[:-3] + '.csv' + outbase = csvfile[:-4] + + if df is not None: + + try: + df = pd.concat([df, pd.read_csv(csvfile)]) + except FileNotFoundError: + pass + df.to_csv(csvfile, index=False) + + if len(df['pivot']) > 0: + f = pivot_plot(df, outbase)[1] + plt.close(f) + diff --git a/doc/learning_examples/multi_target/lasso_multi_bigger.py b/doc/learning_examples/multi_target/lasso_multi_bigger.py new file mode 100644 index 000000000..e7f86b13d --- /dev/null +++ b/doc/learning_examples/multi_target/lasso_multi_bigger.py @@ -0,0 +1,135 @@ +import functools + +import numpy as np +from scipy.stats import norm as ndist + +import regreg.api as rr + +from selectinf.tests.instance import gaussian_instance + +from selectinf.learning.utils import full_model_inference, pivot_plot +from selectinf.learning.core import normal_sampler, keras_fit, gbm_fit_sk +from selectinf.learning.Rutils import lasso_glmnet + +def generate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, **ignored): + + X, y, truth = gaussian_instance(n=n, + p=p, + s=s, + equicorrelated=False, + rho=0.5, + sigma=sigma, + signal=signal, + random_signs=True, + scale=False)[:3] + + return X, y, truth + +def simulate(n=2000, p=1000, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=4000): + + # description of statistical problem + + X, y, truth = generate(n=n, + p=p, + s=s, + equicorrelated=False, + rho=0.5, + sigma=sigma, + signal=signal, + random_signs=True, + scale=False)[:3] + + + # description of statistical problem + + X, y, truth = gaussian_instance(n=n, + p=p, + s=s, + equicorrelated=False, + rho=0.5, + sigma=sigma, + signal=signal, + random_signs=True, + scale=False)[:3] + + dispersion = sigma**2 + + S = X.T.dot(y) + covS = dispersion * X.T.dot(X) + smooth_sampler = normal_sampler(S, covS) + + def meta_algorithm(X, XTXi, resid, lam, sampler): + p = XTX.shape[0] + success = np.zeros(p) + + loss = rr.quadratic_loss((p,), Q=XTX) + pen = rr.l1norm(p, lagrange=lam) + + scale = 0. 
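# with scale = 0. the draw below is deterministic (the "deterministic with
# scale=0" convention used in the other examples): noisy_S is simply the
# statistic carried by the sampler, and the LASSO is solved at that statistic.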
+ noisy_S = sampler(scale=scale) + loss.quadratic = rr.identity_quadratic(0, 0, -noisy_S, 0) + problem = rr.simple_problem(loss, pen) + soln = problem.solve(max_its=100, tol=1.e-10) + success += soln != 0 + return set(np.nonzero(success)[0]) + + XTX = X.T.dot(X) + XTXi = np.linalg.inv(XTX) + resid = y - X.dot(XTXi.dot(X.T.dot(y))) + dispersion = np.linalg.norm(resid)**2 / (n-p) + + lam = 5. * np.sqrt(n) + selection_algorithm = functools.partial(meta_algorithm, X, XTXi, resid, lam) + + # run selection algorithm + + print('SNR', np.linalg.norm(X.dot(truth)) / np.linalg.norm(y-X.dot(truth))) + print('R2', 1 - np.linalg.norm(y-X.dot(truth))**2 / np.linalg.norm(y)**2) + return full_model_inference(X, + y, + truth, + selection_algorithm, + smooth_sampler, + success_params=(1, 1), + B=B, + fit_probability=keras_fit, + fit_args={'epochs':10, 'sizes':[100]*5, 'dropout':0., 'activation':'relu'}) + + +if __name__ == "__main__": + import statsmodels.api as sm + import matplotlib.pyplot as plt + import pandas as pd + + U = np.linspace(0, 1, 101) + plt.clf() + + opts = dict(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=2000) + + R2 = [] + for _ in range(100): + + X, y, truth = generate(**opts) + R2.append((np.linalg.norm(y-X.dot(truth))**2, np.linalg.norm(y)**2)) + + R2 = np.array(R2) + R2mean = 1 - np.mean(R2[:,0]) / np.mean(R2[:,1]) + print('R2', R2mean) + + for i in range(5000): + df = simulate(**opts) + csvfile = __file__[:-3] + '.csv' + outbase = csvfile[:-4] + + if df is not None: + + try: + df = pd.concat([df, pd.read_csv(csvfile)]) + except FileNotFoundError: + pass + df.to_csv(csvfile, index=False) + + if len(df['pivot']) > 0: + f = pivot_plot(df, outbase)[1] + plt.close(f) + diff --git a/doc/learning_examples/multi_target/lasso_multi_logit.py b/doc/learning_examples/multi_target/lasso_multi_logit.py new file mode 100644 index 000000000..94f9cd4d1 --- /dev/null +++ b/doc/learning_examples/multi_target/lasso_multi_logit.py @@ -0,0 +1,134 @@ +import functools + +import numpy as np +from scipy.stats import norm as ndist + +import regreg.api as rr + +from selectinf.tests.instance import gaussian_instance + +from selectinf.learning.utils import full_model_inference, pivot_plot +from selectinf.learning.core import normal_sampler, keras_fit, gbm_fit_sk +from selectinf.learning.Rutils import lasso_glmnet +from selectinf.learning.Rfitters import logit_fit + +def generate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, **ignored): + + X, y, truth = gaussian_instance(n=n, + p=p, + s=s, + equicorrelated=False, + rho=0.5, + sigma=sigma, + signal=signal, + random_signs=True, + scale=False)[:3] + + return X, y, truth + +def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=3000): + + # description of statistical problem + + X, y, truth = generate(n=n, + p=p, + s=s, + equicorrelated=False, + rho=0.5, + sigma=sigma, + signal=signal, + random_signs=True, + scale=False)[:3] + + + # description of statistical problem + + X, y, truth = gaussian_instance(n=n, + p=p, + s=s, + equicorrelated=False, + rho=0.5, + sigma=sigma, + signal=signal, + random_signs=True, + scale=False)[:3] + + dispersion = sigma**2 + + S = X.T.dot(y) + covS = dispersion * X.T.dot(X) + smooth_sampler = normal_sampler(S, covS) + + def meta_algorithm(X, XTXi, resid, lam, sampler): + p = XTX.shape[0] + success = np.zeros(p) + + loss = rr.quadratic_loss((p,), Q=XTX) + pen = rr.l1norm(p, lagrange=lam) + + scale = 0. 
+ noisy_S = sampler(scale=scale) + loss.quadratic = rr.identity_quadratic(0, 0, -noisy_S, 0) + problem = rr.simple_problem(loss, pen) + soln = problem.solve(max_its=100, tol=1.e-10) + success += soln != 0 + return set(np.nonzero(success)[0]) + + XTX = X.T.dot(X) + XTXi = np.linalg.inv(XTX) + resid = y - X.dot(XTXi.dot(X.T.dot(y))) + dispersion = np.linalg.norm(resid)**2 / (n-p) + + lam = 4. * np.sqrt(n) + selection_algorithm = functools.partial(meta_algorithm, X, XTXi, resid, lam) + + # run selection algorithm + + return full_model_inference(X, + y, + truth, + selection_algorithm, + smooth_sampler, + success_params=(1, 1), + B=B, + fit_probability=logit_fit, + fit_args={'df':20}) + + +if __name__ == "__main__": + import statsmodels.api as sm + import matplotlib.pyplot as plt + import pandas as pd + + U = np.linspace(0, 1, 101) + plt.clf() + + opts = dict(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=2000) + + R2 = [] + for _ in range(100): + + X, y, truth = generate(**opts) + R2.append((np.linalg.norm(y-X.dot(truth))**2, np.linalg.norm(y)**2)) + + R2 = np.array(R2) + R2mean = 1 - np.mean(R2[:,0]) / np.mean(R2[:,1]) + print('R2', R2mean) + + for i in range(5000): + df = simulate(**opts) + csvfile = __file__[:-3] + '.csv' + outbase = csvfile[:-4] + + if df is not None: + + try: + df = pd.concat([df, pd.read_csv(csvfile)]) + except FileNotFoundError: + pass + df.to_csv(csvfile, index=False) + + if len(df['pivot']) > 0: + f = pivot_plot(df, outbase)[1] + plt.close(f) + diff --git a/doc/learning_examples/multi_target/lee_multi.py b/doc/learning_examples/multi_target/lee_multi.py index d81ff4cb1..2bf5a4eee 100644 --- a/doc/learning_examples/multi_target/lee_multi.py +++ b/doc/learning_examples/multi_target/lee_multi.py @@ -5,12 +5,13 @@ import regreg.api as rr -from selection.tests.instance import gaussian_instance +from selectinf.tests.instance import gaussian_instance -from selection.learning.utils import (partial_model_inference, - pivot_plot, - lee_inference) -from selection.learning.core import normal_sampler, keras_fit +from selectinf.learning.utils import (partial_model_inference, + pivot_plot, + lee_inference) +from selectinf.learning.core import normal_sampler, keras_fit, gbm_fit_sk +from selectinf.learning.learners import sparse_mixture_learner def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=8000): @@ -63,11 +64,12 @@ def meta_algorithm(XTX, XTXi, lam, sampler): truth, selection_algorithm, smooth_sampler, - fit_probability=keras_fit, - fit_args={'epochs':30, 'sizes':[100]*5, 'dropout':0., 'activation':'relu'}, + fit_probability=gbm_fit_sk, + fit_args={'n_estimators':1000}, success_params=(1, 1), B=B, - alpha=alpha) + alpha=alpha, + learner_klass=sparse_mixture_learner) lee_df = lee_inference(X, y, diff --git a/doc/learning_examples/stability/stability_selection_harder.py b/doc/learning_examples/stability/stability_selection_harder.py index 2ac1a1903..f13a9006a 100644 --- a/doc/learning_examples/stability/stability_selection_harder.py +++ b/doc/learning_examples/stability/stability_selection_harder.py @@ -5,11 +5,11 @@ import regreg.api as rr -from selection.tests.instance import gaussian_instance +from selectinf.tests.instance import gaussian_instance -from selection.learning.utils import full_model_inference, pivot_plot -from selection.learning.core import split_sampler, keras_fit +from selectinf.learning.utils import full_model_inference, pivot_plot +from selectinf.learning.core import split_sampler, keras_fit from sklearn.linear_model import 
lasso_path @@ -83,9 +83,9 @@ def _alpha_grid(X, y, center, XTX): import matplotlib.pyplot as plt import pandas as pd - for i in range(500): + for i in range(2000): df = simulate(B=3000) - csvfile = 'stability_selection_harder.csv' + csvfile = __file__[:-3] + '.csv' outbase = csvfile[:-4] if df is not None and i > 0: @@ -97,6 +97,6 @@ def _alpha_grid(X, y, center, XTX): df.to_csv(csvfile, index=False) if len(df['pivot']) > 0: - pivot_ax, length_ax = pivot_plot(df, outbase) + pivot_plot(df, outbase) diff --git a/doc/learning_examples/stability/stability_selection_harder_5000.py b/doc/learning_examples/stability/stability_selection_harder_5000.py new file mode 100644 index 000000000..33943a72f --- /dev/null +++ b/doc/learning_examples/stability/stability_selection_harder_5000.py @@ -0,0 +1,102 @@ +import functools, uuid + +import numpy as np, pandas as pd +from scipy.stats import norm as ndist + +import regreg.api as rr + +from selectinf.tests.instance import gaussian_instance + + +from selectinf.learning.utils import full_model_inference, pivot_plot +from selectinf.learning.core import split_sampler, keras_fit + +from sklearn.linear_model import lasso_path + +def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=2000): + + # description of statistical problem + + X, y, truth = gaussian_instance(n=n, + p=p, + s=s, + equicorrelated=False, + rho=0.1, + sigma=sigma, + signal=signal, + random_signs=True, + scale=True)[:3] + + dispersion = sigma**2 + + S = X.T.dot(y) + covS = dispersion * X.T.dot(X) + splitting_sampler = split_sampler(X * y[:, None], covS) + + def meta_algorithm(XTX, XTXi, sampler): + + min_success = 6 + ntries = 10 + + def _alpha_grid(X, y, center, XTX): + n, p = X.shape + alphas, coefs, _ = lasso_path(X, y, Xy=center, precompute=XTX) + nselected = np.count_nonzero(coefs, axis=0) + return alphas[nselected < np.sqrt(0.8 * p)] + + alpha_grid = _alpha_grid(X, y, sampler(scale=0.), XTX) + success = np.zeros((p, alpha_grid.shape[0])) + + for _ in range(ntries): + scale = 1. 
# corresponds to sub-samples of 50% + noisy_S = sampler(scale=scale) + _, coefs, _ = lasso_path(X, y, Xy = noisy_S, precompute=XTX, alphas=alpha_grid) + success += np.abs(np.sign(coefs)) + + selected = np.apply_along_axis(lambda row: any(x>min_success for x in row), 1, success) + vars = set(np.nonzero(selected)[0]) + return vars + + XTX = X.T.dot(X) + XTXi = np.linalg.inv(XTX) + resid = y - X.dot(XTXi.dot(X.T.dot(y))) + dispersion = np.linalg.norm(resid)**2 / (n-p) + + selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi) + + # run selection algorithm + + + return full_model_inference(X, + y, + truth, + selection_algorithm, + splitting_sampler, + success_params=(1, 1), + B=B, + fit_probability=keras_fit, + fit_args={'epochs':10, 'sizes':[100]*5, 'dropout':0., 'activation':'relu'}) + + +if __name__ == "__main__": + import statsmodels.api as sm + import matplotlib.pyplot as plt + import pandas as pd + + for i in range(2000): + df = simulate(B=5000) + csvfile = __file__[:-3] + '.csv' + outbase = csvfile[:-4] + + if df is not None and i > 0: + + try: # concatenate to disk + df = pd.concat([df, pd.read_csv(csvfile)]) + except FileNotFoundError: + pass + df.to_csv(csvfile, index=False) + + if len(df['pivot']) > 0: + pivot_plot(df, outbase) + + diff --git a/doc/learning_examples/stability/stability_selection_harder_big.py b/doc/learning_examples/stability/stability_selection_harder_big.py index e22389e6a..9fd38d909 100644 --- a/doc/learning_examples/stability/stability_selection_harder_big.py +++ b/doc/learning_examples/stability/stability_selection_harder_big.py @@ -5,11 +5,11 @@ import regreg.api as rr -from selection.tests.instance import gaussian_instance +from selectinf.tests.instance import gaussian_instance -from selection.learning.utils import full_model_inference, pivot_plot -from selection.learning.core import split_sampler, keras_fit +from selectinf.learning.utils import full_model_inference, pivot_plot +from selectinf.learning.core import split_sampler, keras_fit from sklearn.linear_model import lasso_path @@ -83,9 +83,9 @@ def _alpha_grid(X, y, center, XTX): import matplotlib.pyplot as plt import pandas as pd - for i in range(500): + for i in range(2000): df = simulate(B=3000) - csvfile = 'stability_selection_harder_big.csv' + csvfile = __file__[:-3] + '.csv' outbase = csvfile[:-4] if df is not None and i > 0: @@ -97,6 +97,6 @@ def _alpha_grid(X, y, center, XTX): df.to_csv(csvfile, index=False) if len(df['pivot']) > 0: - pivot_ax, length_ax = pivot_plot(df, outbase) + pivot_plot(df, outbase) From c5052da67537fdf911248e2ee5d9c3ad3f98c3e4 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Fri, 21 Feb 2020 23:55:27 -0800 Subject: [PATCH 009/187] C code for update for Cox partial likelihood --- C-software | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/C-software b/C-software index 851279ffb..84de59b94 160000 --- a/C-software +++ b/C-software @@ -1 +1 @@ -Subproject commit 851279ffb326b145d00af45b87e7d857e3941ec9 +Subproject commit 84de59b94ecdb10805fa4f947abfacc8ca1bf6bf From 32502b38606e36149b8c4ba656e8f28044d8ed6a Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Sat, 22 Feb 2020 11:03:53 -0800 Subject: [PATCH 010/187] updated C software for cox, wrapper --- C-software | 2 +- .../multi_target/followup_multi.py | 54 ++--- selectinf/algorithms/cox_utils.pyx | 213 ++++++++++++++++++ 3 files changed, 233 insertions(+), 36 deletions(-) create mode 100644 selectinf/algorithms/cox_utils.pyx diff --git a/C-software b/C-software index 84de59b94..1307f8ce0 
160000 --- a/C-software +++ b/C-software @@ -1 +1 @@ -Subproject commit 84de59b94ecdb10805fa4f947abfacc8ca1bf6bf +Subproject commit 1307f8ce09995d99f1d1e2ecaba8e1eaef201b17 diff --git a/doc/learning_examples/multi_target/followup_multi.py b/doc/learning_examples/multi_target/followup_multi.py index aa16ded9f..0b506dae7 100644 --- a/doc/learning_examples/multi_target/followup_multi.py +++ b/doc/learning_examples/multi_target/followup_multi.py @@ -5,11 +5,13 @@ import regreg.api as rr -from selection.tests.instance import gaussian_instance +from selectinf.tests.instance import gaussian_instance -from selection.learning.utils import full_model_inference, pivot_plot, naive_full_model_inference -from selection.learning.core import normal_sampler, keras_fit -from selection.learning.Rutils import lasso_glmnet +from selectinf.learning.utils import (full_model_inference, + pivot_plot, + split_full_model_inference) +from selectinf.learning.core import normal_sampler, keras_fit +from selectinf.learning.Rutils import lasso_glmnet def simulate(n=400, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, seed=0, B=2000): @@ -33,10 +35,11 @@ def simulate(n=400, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, seed=0, B= covS = dispersion * X.T.dot(X) smooth_sampler = normal_sampler(S, covS) - def meta_algorithm(X, XTXi, resid, sampler): + idx = np.random.choice(np.arange(n), int(n)/2, replace=False) + + def meta_algorithm(X, XTXi, resid, idx, sampler): n, p = X.shape - idx = np.random.choice(np.arange(n), 200, replace=False) S = sampler(scale=0.) # deterministic with scale=0 ynew = X.dot(XTXi).dot(S) + resid # will be ok for n>p and non-degen X @@ -50,7 +53,7 @@ def meta_algorithm(X, XTXi, resid, sampler): resid = y - X.dot(XTXi.dot(X.T.dot(y))) dispersion = np.linalg.norm(resid)**2 / (n-p) - selection_algorithm = functools.partial(meta_algorithm, X, XTXi, resid) + selection_algorithm = functools.partial(meta_algorithm, X, XTXi, resid, idx) # run selection algorithm @@ -68,35 +71,15 @@ def meta_algorithm(X, XTXi, resid, sampler): if df is not None: observed_set = list(df['variable']) - true_target = truth[observed_set] - - np.random.seed(seed) - X2, _, _ = gaussian_instance(n=n, - p=p, - s=s, - equicorrelated=False, - rho=0.5, - sigma=sigma, - signal=signal, - random_signs=True, - center=False, - scale=False)[:3] - stage_1 = np.random.choice(np.arange(n), 200, replace=False) - stage_2 = sorted(set(range(n)).difference(stage_1)) - X2 = X2[stage_2] - y2 = X2.dot(truth) + sigma * np.random.standard_normal(X2.shape[0]) - - XTXi_2 = np.linalg.inv(X2.T.dot(X2)) - resid2 = y2 - X2.dot(XTXi_2.dot(X2.T.dot(y2))) - dispersion_2 = np.linalg.norm(resid2)**2 / (X2.shape[0] - X2.shape[1]) - - naive_df = naive_full_model_inference(X2, - y2, - dispersion_2, + split_df = split_full_model_inference(X, + y, + idx, + dispersion, + truth, observed_set, alpha=alpha) - df = pd.merge(df, naive_df, on='variable') + df = pd.merge(df, split_df, on='variable') return df if __name__ == "__main__": @@ -107,7 +90,7 @@ def meta_algorithm(X, XTXi, resid, sampler): iseed = int(np.fabs(np.random.standard_normal() * 1000)) for i in range(500): df = simulate(seed=i+iseed, B=2000) - csvfile = 'followup_multi.csv' + csvfile = __file__[:-3] + '.csv' outbase = csvfile[:-4] if df is not None and i > 0: @@ -119,6 +102,7 @@ def meta_algorithm(X, XTXi, resid, sampler): df.to_csv(csvfile, index=False) if len(df['pivot']) > 0: - pivot_plot(df, outbase) + f = pivot_plot(df, outbase) + f.close() diff --git a/selectinf/algorithms/cox_utils.pyx 
b/selectinf/algorithms/cox_utils.pyx new file mode 100644 index 000000000..317e87291 --- /dev/null +++ b/selectinf/algorithms/cox_utils.pyx @@ -0,0 +1,213 @@ +import warnings +import numpy as np, cython +cimport numpy as cnp + +DTYPE_float = np.float +ctypedef cnp.float_t DTYPE_float_t +DTYPE_int = np.int +ctypedef cnp.int_t DTYPE_int_t +ctypedef cnp.intp_t DTYPE_intp_t + +cdef extern from "cox_fns.h": + + void _update_cox_exp(double *linear_pred_ptr, # Linear term in objective + double *exp_accum_ptr, # inner accumulation vector + long *censoring_ptr, # censoring indicator + long *ordering_ptr, # 0-based ordering of times + long *rankmin_ptr, # 0-based ranking with min tie breaking + long ncase # how many subjects / times + ); + + void _update_cox_expZ(double *linear_pred_ptr, # Linear term in objective + double *right_vector_ptr, # Linear term in objective + double *expZ_accum_ptr, # inner accumulation vector + long *censoring_ptr, # censoring indicator + long *ordering_ptr, # 0-based ordering of times + long *rankmin_ptr, # 0-based ranking with min tie breaking + long ncase # how many subjects / times + ); + + void _update_outer_1st(double *linear_pred_ptr, # Linear term in objective + double *exp_accum_ptr, # inner accumulation vector + double *outer_accum_1st_ptr, # outer accumulation vector + long *censoring_ptr, # censoring indicator + long *ordering_ptr, # 0-based ordering of times + long *rankmin_ptr, # 0-based ranking with min tie breaking + long ncase # how many subjects / times + ); + + void _update_outer_2nd(double *linear_pred_ptr, # Linear term in objective + double *exp_accum_ptr, # inner accumulation vector Ze^{\eta} + double *expZ_accum_ptr, # inner accumulation vector e^{\eta} + double *outer_accum_2nd_ptr, # outer accumulation vector + long *censoring_ptr, # censoring indicator + long *ordering_ptr, # 0-based ordering of times + long *rankmin_ptr, # 0-based ranking with min tie breaking + long ncase # how many subjects / times + ); + + double _cox_objective(double *linear_pred_ptr, # Linear term in objective + double *inner_accum_ptr, # inner accumulation vector + double *outer_accum_1st_ptr, # outer accumulation vector + long *censoring_ptr, # censoring indicator + long *ordering_ptr, # 0-based ordering of times + long *rankmin_ptr, # 0-based ranking with min tie breaking + long *rankmax_ptr, # 0-based ranking with max tie breaking + long ncase # how many subjects / times + ); + + void _cox_gradient(double *gradient_ptr, # Where gradient is stored + double *linear_pred_ptr, # Linear term in objective + double *outer_accum_1st_ptr, # outer accumulation vector + long *censoring_ptr, # censoring indicator + long *ordering_ptr, # 0-based ordering of times + long *rankmin_ptr, # 0-based ranking with min tie breaking + long *rankmax_ptr, # 0-based ranking with max tie breaking + long ncase # how many subjects / times + ); + + void _cox_hessian(double *hessian_ptr, # Where hessian is stored + double *linear_pred_ptr, # Linear term in objective + double *outer_accum_1st_ptr, # outer accumulation vector used in outer prod "mean" + double *outer_accum_2nd_ptr, # outer accumulation vector used in "2nd" moment + long *censoring_ptr, # censoring indicator + long *ordering_ptr, # 0-based ordering of times + long *rankmax_ptr, # 0-based ranking with max tie breaking + long ncase # how many subjects / times + ); + +def cox_objective(cnp.ndarray[DTYPE_float_t, ndim=1] linear_pred, + cnp.ndarray[DTYPE_float_t, ndim=1] exp_accum, + cnp.ndarray[DTYPE_float_t, ndim=1] outer_1st_accum, + 
cnp.ndarray[DTYPE_int_t, ndim=1] censoring, + cnp.ndarray[DTYPE_int_t, ndim=1] ordering, + cnp.ndarray[DTYPE_int_t, ndim=1] rankmin, + cnp.ndarray[DTYPE_int_t, ndim=1] rankmax, + long ncase): + + _update_cox_exp(linear_pred.data, + exp_accum.data, + censoring.data, + ordering.data, + rankmin.data, + ncase) + + _update_outer_1st(linear_pred.data, + exp_accum.data, + outer_1st_accum.data, + censoring.data, + ordering.data, + rankmin.data, + ncase) + + return _cox_objective(linear_pred.data, + exp_accum.data, + outer_1st_accum.data, + censoring.data, + ordering.data, + rankmin.data, + rankmax.data, + ncase) + +def cox_gradient(cnp.ndarray[DTYPE_float_t, ndim=1] gradient, + cnp.ndarray[DTYPE_float_t, ndim=1] linear_pred, + cnp.ndarray[DTYPE_float_t, ndim=1] exp_accum, + cnp.ndarray[DTYPE_float_t, ndim=1] outer_1st_accum, + cnp.ndarray[DTYPE_int_t, ndim=1] censoring, + cnp.ndarray[DTYPE_int_t, ndim=1] ordering, + cnp.ndarray[DTYPE_int_t, ndim=1] rankmin, + cnp.ndarray[DTYPE_int_t, ndim=1] rankmax, + long ncase): + """ + Compute Cox partial likelihood gradient in place. + """ + + # this computes e^{\eta} and stores cumsum at rankmin + + _update_cox_exp(linear_pred.data, + exp_accum.data, + censoring.data, + ordering.data, + rankmin.data, + ncase) + + _update_outer_1st(linear_pred.data, + exp_accum.data, + outer_1st_accum.data, + censoring.data, + ordering.data, + rankmin.data, + ncase) + + _cox_gradient(gradient.data, + linear_pred.data, + outer_1st_accum.data, + censoring.data, + ordering.data, + rankmin.data, + rankmax.data, + ncase) + + return gradient + +def cox_hessian(cnp.ndarray[DTYPE_float_t, ndim=1] hessian, + cnp.ndarray[DTYPE_float_t, ndim=1] linear_pred, + cnp.ndarray[DTYPE_float_t, ndim=1] right_vector, + cnp.ndarray[DTYPE_float_t, ndim=1] exp_accum, + cnp.ndarray[DTYPE_float_t, ndim=1] expZ_accum, + cnp.ndarray[DTYPE_float_t, ndim=1] outer_1st_accum, + cnp.ndarray[DTYPE_float_t, ndim=1] outer_2nd_accum, + cnp.ndarray[DTYPE_int_t, ndim=1] censoring, + cnp.ndarray[DTYPE_int_t, ndim=1] ordering, + cnp.ndarray[DTYPE_int_t, ndim=1] rankmin, + cnp.ndarray[DTYPE_int_t, ndim=1] rankmax, + long ncase): + """ + Compute Cox partial likelihood gradient in place. 
+ """ + + # this computes e^{\eta} and stores cumsum at rankmin, stored in outer_accum_1st + + _update_cox_exp(linear_pred.data, + exp_accum.data, + censoring.data, + ordering.data, + rankmin.data, + ncase) + + _update_outer_1st(linear_pred.data, + exp_accum.data, + outer_1st_accum.data, + censoring.data, + ordering.data, + rankmin.data, + ncase) + + _update_cox_expZ(linear_pred.data, + right_vector.data, + exp_accum.data, + censoring.data, + ordering.data, + rankmin.data, + ncase) + + _update_outer_2nd(linear_pred.data, + exp_accum.data, + expZ_accum.data, + outer_2nd_accum.data, + censoring.data, + ordering.data, + rankmin.data, + ncase) + + _cox_hessian(hessian.data, + linear_pred.data, + outer_1st_accum.data, + outer_2nd_accum.data, + censoring.data, + ordering.data, + rankmax.data, + ncase) + + return hessian + From 3a41a6b3cf5c598d71addfafce527816177f0899 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Sat, 22 Feb 2020 11:06:55 -0800 Subject: [PATCH 011/187] minor fixes, added Cox code --- .../cross_inference/cross_inference.py | 4 +-- .../knockoffs/knockoff_followup.py | 33 ++++++++++--------- .../multi_target/followup_multi.py | 12 +++---- selectinf/info.py | 2 +- selectinf/learning/fitters.py | 17 ++++++---- selectinf/learning/utils.py | 2 +- setup.py | 9 ++++- 7 files changed, 44 insertions(+), 35 deletions(-) diff --git a/doc/learning_examples/cross_inference/cross_inference.py b/doc/learning_examples/cross_inference/cross_inference.py index 9383e69ee..90000e99e 100644 --- a/doc/learning_examples/cross_inference/cross_inference.py +++ b/doc/learning_examples/cross_inference/cross_inference.py @@ -1,7 +1,7 @@ import numpy as np -from selection.learning.core import cross_inference -from selection.learning.keras_fit import keras_fit +from selectinf.learning.core import cross_inference +from selectinf.learning.core import keras_fit data = np.load('lasso_multi_learning.npz') learning_data = (data['T'][:2000], data['Y'][:2000]) diff --git a/doc/learning_examples/knockoffs/knockoff_followup.py b/doc/learning_examples/knockoffs/knockoff_followup.py index 3978af5f3..9bbe1093c 100644 --- a/doc/learning_examples/knockoffs/knockoff_followup.py +++ b/doc/learning_examples/knockoffs/knockoff_followup.py @@ -5,15 +5,15 @@ import regreg.api as rr -from selection.tests.instance import gaussian_instance +from selectinf.tests.instance import gaussian_instance -from selection.learning.Rutils import lasso_glmnet -from selection.learning.utils import (full_model_inference, - pivot_plot, - naive_full_model_inference) -from selection.learning.core import split_sampler, keras_fit +from selectinf.learning.Rutils import lasso_glmnet +from selectinf.learning.utils import (full_model_inference, + pivot_plot, + split_full_model_inference) +from selectinf.learning.core import normal_sampler, keras_fit -def simulate(n=400, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, seed=0): +def simulate(n=400, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, seed=0, B=2000): # description of statistical problem @@ -39,7 +39,7 @@ def meta_algorithm(X, XTXi, resid, sampler): n, p = X.shape - idx = np.random.choice(np.arange(n), 200, replace=False) + idx = np.random.choice(np.arange(n), int(n/2), replace=False) S = sampler(scale=0.) 
# deterministic with scale=0 ynew = X.dot(XTXi).dot(S) + resid # will be ok for n>p and non-degen X @@ -66,12 +66,11 @@ def meta_algorithm(X, XTXi, resid, sampler): y, truth, selection_algorithm, - splitting_sampler, + smooth_sampler, success_params=(8, 10), B=B, fit_probability=keras_fit, - fit_args={'epochs':20, 'sizes':[100]*5, 'dropout':0., 'activation':'relu'}, - fit_args={'df':20}) + fit_args={'epochs':20, 'sizes':[100]*5, 'dropout':0., 'activation':'relu'}) if df is not None: @@ -89,7 +88,7 @@ def meta_algorithm(X, XTXi, resid, sampler): random_signs=True, center=False, scale=False)[:3] - stage_1 = np.random.choice(np.arange(n), 200, replace=False) + stage_1 = np.random.choice(np.arange(n), int(n/2), replace=False) stage_2 = sorted(set(range(n)).difference(stage_1)) X2 = X2[stage_2] y2 = X2.dot(truth) + sigma * np.random.standard_normal(X2.shape[0]) @@ -98,13 +97,15 @@ def meta_algorithm(X, XTXi, resid, sampler): resid2 = y2 - X2.dot(XTXi_2.dot(X2.T.dot(y2))) dispersion_2 = np.linalg.norm(resid2)**2 / (X2.shape[0] - X2.shape[1]) - naive_df = naive_full_model_inference(X2, + split_df = split_full_model_inference(X2, y2, + stage_1, dispersion_2, + truth, observed_set, alpha=alpha) - df = pd.merge(df, naive_df, on='variable') + df = pd.merge(df, split_df, on='variable') return df if __name__ == "__main__": @@ -113,8 +114,8 @@ def meta_algorithm(X, XTXi, resid, sampler): import pandas as pd iseed = int(np.fabs(np.random.standard_normal() * 1000)) - for i in range(500): - df = simulate(seed=i + iseed) + for i in range(5000): + df = simulate(seed=i + iseed, B=3000) csvfile = 'knockoff_followup.csv' outbase = csvfile[:-4] diff --git a/doc/learning_examples/multi_target/followup_multi.py b/doc/learning_examples/multi_target/followup_multi.py index 0b506dae7..95fe1208c 100644 --- a/doc/learning_examples/multi_target/followup_multi.py +++ b/doc/learning_examples/multi_target/followup_multi.py @@ -13,11 +13,10 @@ from selectinf.learning.core import normal_sampler, keras_fit from selectinf.learning.Rutils import lasso_glmnet -def simulate(n=400, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, seed=0, B=2000): +def simulate(n=1000, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=2000): # description of statistical problem - np.random.seed(seed) X, y, truth = gaussian_instance(n=n, p=p, s=s, @@ -35,7 +34,7 @@ def simulate(n=400, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, seed=0, B= covS = dispersion * X.T.dot(X) smooth_sampler = normal_sampler(S, covS) - idx = np.random.choice(np.arange(n), int(n)/2, replace=False) + idx = np.random.choice(np.arange(n), int(n/2), replace=False) def meta_algorithm(X, XTXi, resid, idx, sampler): @@ -87,9 +86,8 @@ def meta_algorithm(X, XTXi, resid, idx, sampler): import matplotlib.pyplot as plt import pandas as pd - iseed = int(np.fabs(np.random.standard_normal() * 1000)) for i in range(500): - df = simulate(seed=i+iseed, B=2000) + df = simulate(B=3000) csvfile = __file__[:-3] + '.csv' outbase = csvfile[:-4] @@ -102,7 +100,7 @@ def meta_algorithm(X, XTXi, resid, idx, sampler): df.to_csv(csvfile, index=False) if len(df['pivot']) > 0: - f = pivot_plot(df, outbase) - f.close() + f = pivot_plot(df, outbase)[1] + plt.close(f) diff --git a/selectinf/info.py b/selectinf/info.py index b228f8e56..1df639924 100644 --- a/selectinf/info.py +++ b/selectinf/info.py @@ -43,7 +43,7 @@ # versions NUMPY_MIN_VERSION='1.7.1' SCIPY_MIN_VERSION = '0.9' -CYTHON_MIN_VERSION = '0.21' +CYTHON_MIN_VERSION = '0.29.5' MPMATH_MIN_VERSION = "0.18" PYINTER_MIN_VERSION = "0.1.6" 
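A compressed sketch of the selection pattern these learning-example scripts implement may help orient the reader. The helper names below (`sampler`, `base_selector`) stand in for the `normal_sampler` and `lasso_glmnet` objects built in the scripts themselves, so treat this as an illustration rather than the scripts' exact code. It also shows why the recurring `int(n)/2` to `int(n/2)` edits matter: under Python 3 true division `int(n)/2` is a float, and recent NumPy releases reject a float sample count.

```python
import numpy as np

def toy_meta_algorithm(X, XTXi, resid, sampler, base_selector):
    """Sketch of the meta-algorithms used in these learning examples."""
    n, p = X.shape
    # int(n / 2), not int(n) / 2: np.random.choice needs an integer count
    idx = np.random.choice(np.arange(n), int(n / 2), replace=False)
    S = sampler(scale=0.)                    # scale=0 returns the observed sufficient statistic
    ynew = X.dot(XTXi).dot(S) + resid        # synthetic response consistent with S (n > p, X full rank)
    return base_selector(X[idx], ynew[idx])  # base selection rule sees only a random half of the cases
```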
SKLEARN_MIN_VERSION = '0.19' diff --git a/selectinf/learning/fitters.py b/selectinf/learning/fitters.py index c6edb396c..30bcf8e86 100644 --- a/selectinf/learning/fitters.py +++ b/selectinf/learning/fitters.py @@ -9,13 +9,16 @@ def gbm_fit_sk(T, Y, **params): fitfns = [] for j in range(Y.shape[1]): y = Y[:,j].astype(np.int) - clf = ensemble.GradientBoostingClassifier(**params) - clf.fit(T, y) - - def fit_fn(clf, t): - return clf.predict_proba(t)[:,1] - - fitfns.append(functools.partial(fit_fn, clf)) + if len(np.unique(y)) > 1: + clf = ensemble.GradientBoostingClassifier(**params) + clf.fit(T, y) + + def fit_fn(clf, t): + return clf.predict_proba(t)[:,1] + fit_fn = functools.partial(fit_fn, clf) + else: + fit_fn = lambda t: np.atleast_1d(np.ones(t.shape[0])) + fitfns.append(fit_fn) return fitfns diff --git a/selectinf/learning/utils.py b/selectinf/learning/utils.py index 4eeb77b77..d68bc5b6a 100644 --- a/selectinf/learning/utils.py +++ b/selectinf/learning/utils.py @@ -49,7 +49,7 @@ def full_model_inference(X, if how_many is None: how_many = len(observed_list) - observed_list = observed_list[:how_many] + observed_list = list(np.random.choice(observed_list, how_many, replace=False)) # find the target, based on the observed outcome diff --git a/setup.py b/setup.py index cf882987b..4b6a011f0 100755 --- a/setup.py +++ b/setup.py @@ -59,9 +59,16 @@ libraries=[], include_dirs=['C-software/src'])) +EXTS.append(Extension('selectinf.algorithms.cox_utils', + ['selectinf/algorithms/cox_utils.pyx', + 'C-software/src/cox_fns.c'], + libraries=[], + include_dirs=['C-software/src'])) + EXTS.append(Extension('selectinf.randomized.selective_MLE_utils', ['selectinf/randomized/selective_MLE_utils.pyx', - 'C-software/src/selective_mle.c'], + 'C-software/src/selective_mle.c', + 'C-software/src/cox_fns.c'], libraries=[], include_dirs=['C-software/src'])) From 5a89aa3e43764ee958d2ac896c0b9b03ebd76b3a Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Sat, 22 Feb 2020 13:40:49 -0800 Subject: [PATCH 012/187] updating C, cleanup cox --- C-software | 2 +- selectinf/algorithms/cox_utils.pyx | 21 ++++++++++++++++----- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/C-software b/C-software index 1307f8ce0..3f5d8f344 160000 --- a/C-software +++ b/C-software @@ -1 +1 @@ -Subproject commit 1307f8ce09995d99f1d1e2ecaba8e1eaef201b17 +Subproject commit 3f5d8f3447ebf4670c6f12cc0bfb970b1e1872d5 diff --git a/selectinf/algorithms/cox_utils.pyx b/selectinf/algorithms/cox_utils.pyx index 317e87291..63f6c2856 100644 --- a/selectinf/algorithms/cox_utils.pyx +++ b/selectinf/algorithms/cox_utils.pyx @@ -11,6 +11,7 @@ ctypedef cnp.intp_t DTYPE_intp_t cdef extern from "cox_fns.h": void _update_cox_exp(double *linear_pred_ptr, # Linear term in objective + double *exp_ptr, # stores exp(eta) double *exp_accum_ptr, # inner accumulation vector long *censoring_ptr, # censoring indicator long *ordering_ptr, # 0-based ordering of times @@ -20,6 +21,7 @@ cdef extern from "cox_fns.h": void _update_cox_expZ(double *linear_pred_ptr, # Linear term in objective double *right_vector_ptr, # Linear term in objective + double *exp_ptr, # stores exp(eta) double *expZ_accum_ptr, # inner accumulation vector long *censoring_ptr, # censoring indicator long *ordering_ptr, # 0-based ordering of times @@ -57,7 +59,7 @@ cdef extern from "cox_fns.h": ); void _cox_gradient(double *gradient_ptr, # Where gradient is stored - double *linear_pred_ptr, # Linear term in objective + double *exp_ptr, # stores exp(eta) double *outer_accum_1st_ptr, # outer 
accumulation vector long *censoring_ptr, # censoring indicator long *ordering_ptr, # 0-based ordering of times @@ -67,7 +69,8 @@ cdef extern from "cox_fns.h": ); void _cox_hessian(double *hessian_ptr, # Where hessian is stored - double *linear_pred_ptr, # Linear term in objective + double *exp_ptr, # stores exp(eta) + double *right_vector_ptr, # Right vector in Hessian double *outer_accum_1st_ptr, # outer accumulation vector used in outer prod "mean" double *outer_accum_2nd_ptr, # outer accumulation vector used in "2nd" moment long *censoring_ptr, # censoring indicator @@ -77,6 +80,7 @@ cdef extern from "cox_fns.h": ); def cox_objective(cnp.ndarray[DTYPE_float_t, ndim=1] linear_pred, + cnp.ndarray[DTYPE_float_t, ndim=1] exp_buffer, cnp.ndarray[DTYPE_float_t, ndim=1] exp_accum, cnp.ndarray[DTYPE_float_t, ndim=1] outer_1st_accum, cnp.ndarray[DTYPE_int_t, ndim=1] censoring, @@ -86,6 +90,7 @@ def cox_objective(cnp.ndarray[DTYPE_float_t, ndim=1] linear_pred, long ncase): _update_cox_exp(linear_pred.data, + exp_buffer.data, exp_accum.data, censoring.data, ordering.data, @@ -111,6 +116,7 @@ def cox_objective(cnp.ndarray[DTYPE_float_t, ndim=1] linear_pred, def cox_gradient(cnp.ndarray[DTYPE_float_t, ndim=1] gradient, cnp.ndarray[DTYPE_float_t, ndim=1] linear_pred, + cnp.ndarray[DTYPE_float_t, ndim=1] exp_buffer, cnp.ndarray[DTYPE_float_t, ndim=1] exp_accum, cnp.ndarray[DTYPE_float_t, ndim=1] outer_1st_accum, cnp.ndarray[DTYPE_int_t, ndim=1] censoring, @@ -125,6 +131,7 @@ def cox_gradient(cnp.ndarray[DTYPE_float_t, ndim=1] gradient, # this computes e^{\eta} and stores cumsum at rankmin _update_cox_exp(linear_pred.data, + exp_buffer.data, exp_accum.data, censoring.data, ordering.data, @@ -140,7 +147,7 @@ def cox_gradient(cnp.ndarray[DTYPE_float_t, ndim=1] gradient, ncase) _cox_gradient(gradient.data, - linear_pred.data, + exp_buffer.data, outer_1st_accum.data, censoring.data, ordering.data, @@ -153,6 +160,7 @@ def cox_gradient(cnp.ndarray[DTYPE_float_t, ndim=1] gradient, def cox_hessian(cnp.ndarray[DTYPE_float_t, ndim=1] hessian, cnp.ndarray[DTYPE_float_t, ndim=1] linear_pred, cnp.ndarray[DTYPE_float_t, ndim=1] right_vector, + cnp.ndarray[DTYPE_float_t, ndim=1] exp_buffer, cnp.ndarray[DTYPE_float_t, ndim=1] exp_accum, cnp.ndarray[DTYPE_float_t, ndim=1] expZ_accum, cnp.ndarray[DTYPE_float_t, ndim=1] outer_1st_accum, @@ -169,6 +177,7 @@ def cox_hessian(cnp.ndarray[DTYPE_float_t, ndim=1] hessian, # this computes e^{\eta} and stores cumsum at rankmin, stored in outer_accum_1st _update_cox_exp(linear_pred.data, + exp_buffer.data, exp_accum.data, censoring.data, ordering.data, @@ -185,7 +194,8 @@ def cox_hessian(cnp.ndarray[DTYPE_float_t, ndim=1] hessian, _update_cox_expZ(linear_pred.data, right_vector.data, - exp_accum.data, + exp_buffer.data, + expZ_accum.data, censoring.data, ordering.data, rankmin.data, @@ -201,7 +211,8 @@ def cox_hessian(cnp.ndarray[DTYPE_float_t, ndim=1] hessian, ncase) _cox_hessian(hessian.data, - linear_pred.data, + exp_buffer.data, + right_vector.data, outer_1st_accum.data, outer_2nd_accum.data, censoring.data, From 74098c72c1912ca9ce704c4caeaf044e2716d278 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Sat, 22 Feb 2020 14:59:24 -0800 Subject: [PATCH 013/187] added case weights but results don't quite agree with R --- selectinf/algorithms/cox_utils.pyx | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/selectinf/algorithms/cox_utils.pyx b/selectinf/algorithms/cox_utils.pyx index 63f6c2856..01244835b 100644 --- 
a/selectinf/algorithms/cox_utils.pyx +++ b/selectinf/algorithms/cox_utils.pyx @@ -13,6 +13,7 @@ cdef extern from "cox_fns.h": void _update_cox_exp(double *linear_pred_ptr, # Linear term in objective double *exp_ptr, # stores exp(eta) double *exp_accum_ptr, # inner accumulation vector + double *case_weight_ptr, # case weights long *censoring_ptr, # censoring indicator long *ordering_ptr, # 0-based ordering of times long *rankmin_ptr, # 0-based ranking with min tie breaking @@ -21,8 +22,9 @@ cdef extern from "cox_fns.h": void _update_cox_expZ(double *linear_pred_ptr, # Linear term in objective double *right_vector_ptr, # Linear term in objective - double *exp_ptr, # stores exp(eta) + double *exp_ptr, # stores exp(eta) double *expZ_accum_ptr, # inner accumulation vector + double *case_weight_ptr, # case weights long *censoring_ptr, # censoring indicator long *ordering_ptr, # 0-based ordering of times long *rankmin_ptr, # 0-based ranking with min tie breaking @@ -32,6 +34,7 @@ cdef extern from "cox_fns.h": void _update_outer_1st(double *linear_pred_ptr, # Linear term in objective double *exp_accum_ptr, # inner accumulation vector double *outer_accum_1st_ptr, # outer accumulation vector + double *case_weight_ptr, # case weights long *censoring_ptr, # censoring indicator long *ordering_ptr, # 0-based ordering of times long *rankmin_ptr, # 0-based ranking with min tie breaking @@ -42,6 +45,7 @@ cdef extern from "cox_fns.h": double *exp_accum_ptr, # inner accumulation vector Ze^{\eta} double *expZ_accum_ptr, # inner accumulation vector e^{\eta} double *outer_accum_2nd_ptr, # outer accumulation vector + double *case_weight_ptr, # case weights long *censoring_ptr, # censoring indicator long *ordering_ptr, # 0-based ordering of times long *rankmin_ptr, # 0-based ranking with min tie breaking @@ -51,6 +55,7 @@ cdef extern from "cox_fns.h": double _cox_objective(double *linear_pred_ptr, # Linear term in objective double *inner_accum_ptr, # inner accumulation vector double *outer_accum_1st_ptr, # outer accumulation vector + double *case_weight_ptr, # case weights long *censoring_ptr, # censoring indicator long *ordering_ptr, # 0-based ordering of times long *rankmin_ptr, # 0-based ranking with min tie breaking @@ -61,6 +66,7 @@ cdef extern from "cox_fns.h": void _cox_gradient(double *gradient_ptr, # Where gradient is stored double *exp_ptr, # stores exp(eta) double *outer_accum_1st_ptr, # outer accumulation vector + double *case_weight_ptr, # case weights long *censoring_ptr, # censoring indicator long *ordering_ptr, # 0-based ordering of times long *rankmin_ptr, # 0-based ranking with min tie breaking @@ -73,6 +79,7 @@ cdef extern from "cox_fns.h": double *right_vector_ptr, # Right vector in Hessian double *outer_accum_1st_ptr, # outer accumulation vector used in outer prod "mean" double *outer_accum_2nd_ptr, # outer accumulation vector used in "2nd" moment + double *case_weight_ptr, # case weights long *censoring_ptr, # censoring indicator long *ordering_ptr, # 0-based ordering of times long *rankmax_ptr, # 0-based ranking with max tie breaking @@ -83,6 +90,7 @@ def cox_objective(cnp.ndarray[DTYPE_float_t, ndim=1] linear_pred, cnp.ndarray[DTYPE_float_t, ndim=1] exp_buffer, cnp.ndarray[DTYPE_float_t, ndim=1] exp_accum, cnp.ndarray[DTYPE_float_t, ndim=1] outer_1st_accum, + cnp.ndarray[DTYPE_float_t, ndim=1] case_weight, cnp.ndarray[DTYPE_int_t, ndim=1] censoring, cnp.ndarray[DTYPE_int_t, ndim=1] ordering, cnp.ndarray[DTYPE_int_t, ndim=1] rankmin, @@ -92,6 +100,7 @@ def 
cox_objective(cnp.ndarray[DTYPE_float_t, ndim=1] linear_pred, _update_cox_exp(linear_pred.data, exp_buffer.data, exp_accum.data, + case_weight.data, censoring.data, ordering.data, rankmin.data, @@ -100,6 +109,7 @@ def cox_objective(cnp.ndarray[DTYPE_float_t, ndim=1] linear_pred, _update_outer_1st(linear_pred.data, exp_accum.data, outer_1st_accum.data, + case_weight.data, censoring.data, ordering.data, rankmin.data, @@ -108,6 +118,7 @@ def cox_objective(cnp.ndarray[DTYPE_float_t, ndim=1] linear_pred, return _cox_objective(linear_pred.data, exp_accum.data, outer_1st_accum.data, + case_weight.data, censoring.data, ordering.data, rankmin.data, @@ -119,6 +130,7 @@ def cox_gradient(cnp.ndarray[DTYPE_float_t, ndim=1] gradient, cnp.ndarray[DTYPE_float_t, ndim=1] exp_buffer, cnp.ndarray[DTYPE_float_t, ndim=1] exp_accum, cnp.ndarray[DTYPE_float_t, ndim=1] outer_1st_accum, + cnp.ndarray[DTYPE_float_t, ndim=1] case_weight, cnp.ndarray[DTYPE_int_t, ndim=1] censoring, cnp.ndarray[DTYPE_int_t, ndim=1] ordering, cnp.ndarray[DTYPE_int_t, ndim=1] rankmin, @@ -133,6 +145,7 @@ def cox_gradient(cnp.ndarray[DTYPE_float_t, ndim=1] gradient, _update_cox_exp(linear_pred.data, exp_buffer.data, exp_accum.data, + case_weight.data, censoring.data, ordering.data, rankmin.data, @@ -141,6 +154,7 @@ def cox_gradient(cnp.ndarray[DTYPE_float_t, ndim=1] gradient, _update_outer_1st(linear_pred.data, exp_accum.data, outer_1st_accum.data, + case_weight.data, censoring.data, ordering.data, rankmin.data, @@ -149,6 +163,7 @@ def cox_gradient(cnp.ndarray[DTYPE_float_t, ndim=1] gradient, _cox_gradient(gradient.data, exp_buffer.data, outer_1st_accum.data, + case_weight.data, censoring.data, ordering.data, rankmin.data, @@ -165,6 +180,7 @@ def cox_hessian(cnp.ndarray[DTYPE_float_t, ndim=1] hessian, cnp.ndarray[DTYPE_float_t, ndim=1] expZ_accum, cnp.ndarray[DTYPE_float_t, ndim=1] outer_1st_accum, cnp.ndarray[DTYPE_float_t, ndim=1] outer_2nd_accum, + cnp.ndarray[DTYPE_float_t, ndim=1] case_weight, cnp.ndarray[DTYPE_int_t, ndim=1] censoring, cnp.ndarray[DTYPE_int_t, ndim=1] ordering, cnp.ndarray[DTYPE_int_t, ndim=1] rankmin, @@ -179,6 +195,7 @@ def cox_hessian(cnp.ndarray[DTYPE_float_t, ndim=1] hessian, _update_cox_exp(linear_pred.data, exp_buffer.data, exp_accum.data, + case_weight.data, censoring.data, ordering.data, rankmin.data, @@ -187,6 +204,7 @@ def cox_hessian(cnp.ndarray[DTYPE_float_t, ndim=1] hessian, _update_outer_1st(linear_pred.data, exp_accum.data, outer_1st_accum.data, + case_weight.data, censoring.data, ordering.data, rankmin.data, @@ -196,6 +214,7 @@ def cox_hessian(cnp.ndarray[DTYPE_float_t, ndim=1] hessian, right_vector.data, exp_buffer.data, expZ_accum.data, + case_weight.data, censoring.data, ordering.data, rankmin.data, @@ -205,6 +224,7 @@ def cox_hessian(cnp.ndarray[DTYPE_float_t, ndim=1] hessian, exp_accum.data, expZ_accum.data, outer_2nd_accum.data, + case_weight.data, censoring.data, ordering.data, rankmin.data, @@ -215,6 +235,7 @@ def cox_hessian(cnp.ndarray[DTYPE_float_t, ndim=1] hessian, right_vector.data, outer_1st_accum.data, outer_2nd_accum.data, + case_weight.data, censoring.data, ordering.data, rankmax.data, From 94e2c6a9033d923f950486ad9a783d54b409de76 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Sat, 22 Feb 2020 14:59:35 -0800 Subject: [PATCH 014/187] updated cox code --- C-software | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/C-software b/C-software index 3f5d8f344..8c36cc18b 160000 --- a/C-software +++ b/C-software @@ -1 +1 @@ -Subproject commit 
3f5d8f3447ebf4670c6f12cc0bfb970b1e1872d5 +Subproject commit 8c36cc18b1c78c139d8cba4ecbb8875eb8275b20 From ffbc2e1c75a6fbc20857b45eb969cbe7680228b1 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 1 Apr 2020 20:58:48 -0700 Subject: [PATCH 015/187] updating C software --- C-software | 2 +- selectinf/sampling/sequential.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/C-software b/C-software index 8c36cc18b..7a3d663fe 160000 --- a/C-software +++ b/C-software @@ -1 +1 @@ -Subproject commit 8c36cc18b1c78c139d8cba4ecbb8875eb8275b20 +Subproject commit 7a3d663feadaf6c61400359fe8fe95a61099b645 diff --git a/selectinf/sampling/sequential.py b/selectinf/sampling/sequential.py index 06a018895..450ae81c8 100644 --- a/selectinf/sampling/sequential.py +++ b/selectinf/sampling/sequential.py @@ -10,7 +10,7 @@ def sample(white_constraint, nsample, proposal_sigma=0.2, - temps=np.linspace(0, 50, 51.)): + temps=np.linspace(0, 50., 51)): """ Build up an approximately constrained Gaussian based on relaxations of the constraint. From b35e9352a158a9f30db813b9e7b6a183933edce4 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 1 Apr 2020 21:00:28 -0700 Subject: [PATCH 016/187] fixing imports --- selectinf/algorithms/lasso.py | 42 +++++++++---------- selectinf/algorithms/tests/test_covtest.py | 1 - .../algorithms/tests/test_debiased_lasso.py | 1 - selectinf/algorithms/tests/test_lasso.py | 17 ++++---- selectinf/algorithms/tests/test_softmax.py | 3 +- selectinf/algorithms/tests/test_sqrt_lasso.py | 3 +- .../constraints/tests/test_quadratic_tests.py | 1 - selectinf/randomized/tests/test_BH.py | 3 +- .../sampling/tests/test_sample_sphere.py | 2 +- selectinf/sampling/tests/test_sequential.py | 5 +-- selectinf/tests/decorators.py | 7 +--- selectinf/truncated/tests/test_truncated.py | 5 +-- 12 files changed, 39 insertions(+), 51 deletions(-) diff --git a/selectinf/algorithms/lasso.py b/selectinf/algorithms/lasso.py index fa35e5ed2..f885eb964 100644 --- a/selectinf/algorithms/lasso.py +++ b/selectinf/algorithms/lasso.py @@ -21,7 +21,6 @@ from regreg.api import (glm, weighted_l1norm, simple_problem, - coxph as coxph_obj, smooth_sum, squared_error, identity_quadratic, @@ -470,13 +469,13 @@ def logistic(klass, covariance_estimator=covariance_estimator) @classmethod - def coxph(klass, - X, - times, - status, - feature_weights, - covariance_estimator=None, - quadratic=None): + def cox(klass, + X, + times, + status, + feature_weights, + covariance_estimator=None, + quadratic=None): r""" Cox proportional hazards LASSO with feature weights. Objective function is @@ -521,7 +520,7 @@ def coxph(klass, coordinates of the gradient of the likelihood at the unpenalized estimator. 
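This patch renames the Cox classmethods from `coxph` to `cox` and routes them through regreg's `glm.cox` loss. A minimal usage sketch, modeled on the updated tests later in this patch (the data are synthetic and the penalty value is arbitrary):

```python
import numpy as np
from selectinf.algorithms.lasso import lasso

n, p = 100, 5
X = np.random.standard_normal((n, p))
times = np.random.standard_exponential(n)        # follow-up times
status = np.random.binomial(1, 0.5, size=(n,))   # status / censoring indicator

L = lasso.cox(X, times, status, 0.1)             # fourth argument: feature_weights (scalar or length-p)
soln = L.fit()                                   # penalized coefficient estimate
```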
""" - loglike = coxph_obj(X, times, status, quadratic=quadratic) + loglike = glm.cox(X, times, status, quadratic=quadratic) return klass(loglike, feature_weights, covariance_estimator=covariance_estimator) @@ -1003,14 +1002,14 @@ def poisson(klass, return klass(loglike1, loglike2, loglike, feature_weights) @classmethod - def coxph(klass, - X, - times, - status, - feature_weights, - split_frac=0.9, - sigma=1., - stage_one=None): + def cox(klass, + X, + times, + status, + feature_weights, + split_frac=0.9, + sigma=1., + stage_one=None): n, p = X.shape if stage_one is None: @@ -1025,9 +1024,9 @@ def coxph(klass, times1, X1, status1 = times[stage_one], X[stage_one], status[stage_one] times2, X2, status2 = times[stage_two], X[stage_two], status[stage_two] - loglike = coxph_obj(X, times, status) - loglike1 = coxph_obj(X1, times1, status1) - loglike2 = coxph_obj(X2, times2, status2) + loglike = glm.cox(X, times, status) + loglike1 = glm.cox(X1, times1, status1) + loglike2 = glm.cox(X2, times2, status2) return klass(loglike1, loglike2, loglike, feature_weights) @@ -1878,7 +1877,8 @@ def fit(self, # Needed for finding truncation intervals - self._Qbeta_bar = X.T.dot(W * X.dot(lasso_solution)) - self.loglike.smooth_objective(lasso_solution, 'grad') + self._Qbeta_bar = (X.T.dot(W * X.dot(lasso_solution)) - + self.loglike.smooth_objective(lasso_solution, 'grad')) self._W = W if n > p and self.approximate_inverse is None: diff --git a/selectinf/algorithms/tests/test_covtest.py b/selectinf/algorithms/tests/test_covtest.py index 9f0a2c2be..f80981659 100644 --- a/selectinf/algorithms/tests/test_covtest.py +++ b/selectinf/algorithms/tests/test_covtest.py @@ -1,7 +1,6 @@ import itertools import numpy as np -import numpy.testing.decorators as dec from ...tests.instance import gaussian_instance from ...tests.flags import SET_SEED, SMALL_SAMPLES diff --git a/selectinf/algorithms/tests/test_debiased_lasso.py b/selectinf/algorithms/tests/test_debiased_lasso.py index 51eb94f94..161cb5196 100644 --- a/selectinf/algorithms/tests/test_debiased_lasso.py +++ b/selectinf/algorithms/tests/test_debiased_lasso.py @@ -1,6 +1,5 @@ import numpy as np import nose.tools as nt -import numpy.testing.decorators as dec from ...tests.instance import gaussian_instance as instance diff --git a/selectinf/algorithms/tests/test_lasso.py b/selectinf/algorithms/tests/test_lasso.py index 172535b10..3b1a3186e 100644 --- a/selectinf/algorithms/tests/test_lasso.py +++ b/selectinf/algorithms/tests/test_lasso.py @@ -1,6 +1,5 @@ import numpy as np, pandas as pd import nose.tools as nt -import numpy.testing.decorators as dec from itertools import product from ...tests.flags import SMALL_SAMPLES @@ -143,7 +142,7 @@ def test_poisson(): return L, C, P @set_seed_iftrue(True) -@dec.skipif(not statsmodels_available, "needs statsmodels") +@np.testing.dec.skipif(not statsmodels_available, "needs statsmodels") def test_coxph(): Q = rr.identity_quadratic(0.01, 0, np.ones(5), 0) @@ -151,10 +150,10 @@ def test_coxph(): T = np.random.standard_exponential(100) S = np.random.binomial(1, 0.5, size=(100,)) - L = lasso.coxph(X, T, S, 0.1, quadratic=Q) + L = lasso.cox(X, T, S, 0.1, quadratic=Q) L.fit() - L = lasso.coxph(X, T, S, 0.1, quadratic=Q) + L = lasso.cox(X, T, S, 0.1, quadratic=Q) L.fit() C = L.constraints @@ -450,7 +449,7 @@ def test_data_carving_poisson(n=500, @wait_for_return_value() @set_seed_iftrue(True) -@dec.skipif(not statsmodels_available, "needs statsmodels") +@np.testing.dec.skipif(not statsmodels_available, "needs statsmodels") 
@set_sampling_params_iftrue(SMALL_SAMPLES, ndraw=10, burnin=10) def test_data_carving_coxph(n=400, p=20, @@ -478,14 +477,14 @@ def test_data_carving_coxph(n=400, lam_theor = 10. * np.ones(p) lam_theor[0] = 0. - DC = data_carving.coxph(X, T, S, feature_weights=lam_theor, - stage_one=stage_one) + DC = data_carving.cox(X, T, S, feature_weights=lam_theor, + stage_one=stage_one) DC.fit() if len(DC.active) < n - int(n*split_frac): - DS = data_splitting.coxph(X, T, S, feature_weights=lam_theor, - stage_one=stage_one) + DS = data_splitting.cox(X, T, S, feature_weights=lam_theor, + stage_one=stage_one) DS.fit(use_full_cov=True) data_split = True else: diff --git a/selectinf/algorithms/tests/test_softmax.py b/selectinf/algorithms/tests/test_softmax.py index 329f847d5..1f6e64664 100644 --- a/selectinf/algorithms/tests/test_softmax.py +++ b/selectinf/algorithms/tests/test_softmax.py @@ -1,9 +1,8 @@ import numpy as np -import numpy.testing.decorators as dec from itertools import product from ..softmax import softmax_objective -@dec.skipif(True, "need some tests for softmax objective") +@np.testing.dec.skipif(True, "need some tests for softmax objective") def test_softmax(): raise ValueError('need some tests for softmax objective') diff --git a/selectinf/algorithms/tests/test_sqrt_lasso.py b/selectinf/algorithms/tests/test_sqrt_lasso.py index 0d05495d1..86edb6078 100644 --- a/selectinf/algorithms/tests/test_sqrt_lasso.py +++ b/selectinf/algorithms/tests/test_sqrt_lasso.py @@ -1,7 +1,6 @@ from __future__ import division import numpy as np -import numpy.testing.decorators as dec import nose.tools as nt import regreg.api as rr @@ -23,7 +22,7 @@ @wait_for_return_value() @set_sampling_params_iftrue(SMALL_SAMPLES, nsim=10, burnin=10, ndraw=10) -@dec.slow +@np.testing.dec.slow def test_goodness_of_fit(n=20, p=25, s=10, sigma=20., nsim=10, burnin=2000, ndraw=8000): P = [] diff --git a/selectinf/constraints/tests/test_quadratic_tests.py b/selectinf/constraints/tests/test_quadratic_tests.py index 5ea4e2767..1a1698f1a 100644 --- a/selectinf/constraints/tests/test_quadratic_tests.py +++ b/selectinf/constraints/tests/test_quadratic_tests.py @@ -1,7 +1,6 @@ import numpy as np from scipy.stats import chi import nose.tools as nt -import numpy.testing.decorators as dec from ...distributions import chisq from ...tests.decorators import set_sampling_params_iftrue, set_seed_iftrue, rpy_test_safe diff --git a/selectinf/randomized/tests/test_BH.py b/selectinf/randomized/tests/test_BH.py index a6fe5851f..07192cfcb 100644 --- a/selectinf/randomized/tests/test_BH.py +++ b/selectinf/randomized/tests/test_BH.py @@ -1,5 +1,4 @@ import numpy as np -import numpy.testing.decorators as dec from scipy.stats import norm as ndist @@ -42,7 +41,7 @@ def BH_cutoff(): np.testing.assert_allclose(sorted(BHfilter(2 * ndist.sf(np.fabs(Z)), q=0.2)), sorted(stepup_selection(Z, BH_cutoffs)[1])) -@dec.skipif(True, "independent estimator test not working") +@np.testing.dec.skipif(True, "independent estimator test not working") def test_independent_estimator(n=100, n1=50, q=0.2, signal=3, p=100): Z = np.random.standard_normal((n, p)) diff --git a/selectinf/sampling/tests/test_sample_sphere.py b/selectinf/sampling/tests/test_sample_sphere.py index e1be9724c..07d858b0c 100644 --- a/selectinf/sampling/tests/test_sample_sphere.py +++ b/selectinf/sampling/tests/test_sample_sphere.py @@ -90,7 +90,7 @@ def test_sample_sphere(burnin=1000, s2 = AC.sample_from_sphere(con, initial, ndraw=ndraw, burnin=burnin) return s1, s2 -@dec.slow +@np.testing.dec.slow 
@set_seed_iftrue(SET_SEED, 20) @set_sampling_params_iftrue(SMALL_SAMPLES, nsim=10, ndraw=10, burnin=10) def test_distribution_sphere(n=15, p=10, sigma=1., diff --git a/selectinf/sampling/tests/test_sequential.py b/selectinf/sampling/tests/test_sequential.py index b4634bfd3..a4f34a2b6 100644 --- a/selectinf/sampling/tests/test_sequential.py +++ b/selectinf/sampling/tests/test_sequential.py @@ -1,5 +1,4 @@ import numpy as np -import numpy.testing.decorators as dec from scipy.stats import norm as ndist from ...constraints.affine import constraints @@ -7,13 +6,13 @@ from ...tests.decorators import set_sampling_params_iftrue, set_seed_iftrue from ...tests.flags import SMALL_SAMPLES, SET_SEED -@dec.slow +@np.testing.dec.slow @set_seed_iftrue(SET_SEED) @set_sampling_params_iftrue(SMALL_SAMPLES, ndraw=10, nsim=10) def test_sequentially_constrained(ndraw=100, nsim=50): S = -np.identity(10)[:3] b = -6 * np.ones(3) C = constraints(S, b) - W = sample(C, nsim, temps=np.linspace(0, 200, 1001)) + W = sample(C, nsim, temps=np.linspace(0, 200., 1001)) U = np.linspace(0, 1, 101) diff --git a/selectinf/tests/decorators.py b/selectinf/tests/decorators.py index 37407e65c..125a048ad 100644 --- a/selectinf/tests/decorators.py +++ b/selectinf/tests/decorators.py @@ -5,10 +5,7 @@ import nose import nose.tools -try: - from numpy.testing.decorators import SkipTest -except (ImportError, AttributeError): - from numpy.testing import SkipTest +from numpy.testing import SkipTest def set_seed_iftrue(condition, seed=10): """ @@ -209,7 +206,7 @@ def modified_gen(*args, **kwargs): for x in f(*args, **kwargs_cp): yield x else: - raise np.testing.decorators.SkipTest(get_msg(f, msg)) + raise SkipTest(get_msg(f, msg)) # Choose the right modified to use when building the actual decorator. 
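The common thread in these test-suite edits: `numpy.testing.decorators` stopped being importable as a standalone submodule in recent NumPy releases, so the tests switch to the `np.testing.dec` alias (or `from numpy.testing import dec`) and take `SkipTest` directly from `numpy.testing`; the `np.linspace` calls are likewise corrected so the number-of-points argument is an integer, which newer NumPy insists on. A small sketch of the import pattern the patches converge on (the test body is a throwaway placeholder):

```python
import numpy as np
from numpy.testing import dec, SkipTest   # replaces `import numpy.testing.decorators as dec`

temps = np.linspace(0, 200., 1001)        # num must be an int; the float belongs on the endpoint

@dec.skipif(True, "placeholder: always skipped")
def test_placeholder():
    raise SkipTest("never reached")
```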
if nose.util.isgenerator(f): diff --git a/selectinf/truncated/tests/test_truncated.py b/selectinf/truncated/tests/test_truncated.py index a1adfa19e..b5ddaaeb1 100644 --- a/selectinf/truncated/tests/test_truncated.py +++ b/selectinf/truncated/tests/test_truncated.py @@ -1,7 +1,6 @@ from __future__ import print_function import nose.tools as nt import numpy as np -import numpy.testing.decorators as dec from ..gaussian import truncated_gaussian, truncated_gaussian_old from ...tests.decorators import set_sampling_params_iftrue, set_seed_iftrue @@ -25,7 +24,7 @@ def test_sigma(): np.around(np.array(tg2.equal_tailed_interval(Z,0.05)), 4)) @set_seed_iftrue(SET_SEED) -@dec.skipif(True, 'checking coverage: this is random with highish failure rate') +@np.testing.dec.skipif(True, 'checking coverage: this is random with highish failure rate') @set_sampling_params_iftrue(SMALL_SAMPLES, nsim=100) def test_equal_tailed_coverage(nsim=1000): @@ -44,7 +43,7 @@ def test_equal_tailed_coverage(nsim=1000): nt.assert_true(np.fabs(coverage - (1-alpha)*nsim) < 2*SE) @set_seed_iftrue(SET_SEED) -@dec.skipif(True, 'really slow') +@np.testing.dec.skipif(True, 'really slow') @set_sampling_params_iftrue(SMALL_SAMPLES, nsim=100) def test_UMAU_coverage(nsim=1000): From 0b7d566bf3a41654d6081eeb8926f42b5ef05f58 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 1 Apr 2020 21:16:42 -0700 Subject: [PATCH 017/187] some regreg changes to incorporate, more np decorators --- selectinf/algorithms/sqrt_lasso.py | 3 ++- selectinf/algorithms/tests/test_compareR.py | 5 +++-- selectinf/constraints/tests/test_affine.py | 3 ++- selectinf/randomized/tests/test_group_lasso.py | 2 +- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/selectinf/algorithms/sqrt_lasso.py b/selectinf/algorithms/sqrt_lasso.py index e29409892..8bb03c5ef 100644 --- a/selectinf/algorithms/sqrt_lasso.py +++ b/selectinf/algorithms/sqrt_lasso.py @@ -10,6 +10,7 @@ # regreg http://github.com/regreg import regreg.api as rr +from regreg.atoms.mixed_lasso import NONNEGATIVE import regreg.affine as ra from regreg.smooth.glm import gaussian_loglike from regreg.affine import astransform @@ -427,7 +428,7 @@ def solve_sqrt_lasso_skinny(X, Y, weights=None, initial=None, quadratic=None, so weights = lam * np.ones((p,)) weight_dict = dict(zip(np.arange(p), 2 * weights)) - penalty = rr.mixed_lasso(list(np.arange(p)) + [rr.NONNEGATIVE], lagrange=1., + penalty = rr.mixed_lasso(list(np.arange(p)) + [NONNEGATIVE], lagrange=1., weights=weight_dict) loss = sqlasso_objective_skinny(X, Y) diff --git a/selectinf/algorithms/tests/test_compareR.py b/selectinf/algorithms/tests/test_compareR.py index d7ef21b76..51ba177cf 100644 --- a/selectinf/algorithms/tests/test_compareR.py +++ b/selectinf/algorithms/tests/test_compareR.py @@ -3,6 +3,7 @@ import numpy as np, pandas as pd import regreg.api as rr import nose.tools as nt +from numpy.testing import dec try: import rpy2.robjects as rpy @@ -22,7 +23,7 @@ from ...randomized.lasso import lasso as rlasso, selected_targets, full_targets, debiased_targets from ...tests.instance import gaussian_instance, logistic_instance -@np.testing.dec.skipif(not rpy2_available, msg="rpy2 not available, skipping test") +@dec.skipif(not rpy2_available, msg="rpy2 not available, skipping test") def test_fixed_lambda(): """ Check that Gaussian LASSO results agree with R @@ -240,7 +241,7 @@ def test_coxph(): beta_hat = np.asarray(rpy.r('beta_hat')) x = np.asarray(rpy.r('x')) - L = lasso.coxph(x, tim, status, 1.5) + L = lasso.cox(x, tim, status, 1.5) beta2 
= L.fit() G1 = L.loglike.gradient(beta_hat) diff --git a/selectinf/constraints/tests/test_affine.py b/selectinf/constraints/tests/test_affine.py index 82cfad9db..dca0f70a5 100644 --- a/selectinf/constraints/tests/test_affine.py +++ b/selectinf/constraints/tests/test_affine.py @@ -2,6 +2,7 @@ import nose import numpy as np +from numpy.testing import dec from scipy.stats import chi import nose.tools as nt @@ -168,7 +169,7 @@ def test_sampling(): np.outer(V.mean(0), V.mean(0)) - S) < 0.01) @set_seed_iftrue(SET_SEED) -@np.testing.decorators.skipif(True, msg="optimal tilt undefined -- need to implement softmax version") +@dec.skipif(True, msg="optimal tilt undefined -- need to implement softmax version") def test_optimal_tilt(): A = np.vstack(-np.identity(4)) diff --git a/selectinf/randomized/tests/test_group_lasso.py b/selectinf/randomized/tests/test_group_lasso.py index 9cc866cf6..0f1380ffb 100644 --- a/selectinf/randomized/tests/test_group_lasso.py +++ b/selectinf/randomized/tests/test_group_lasso.py @@ -268,7 +268,7 @@ def test_mixed(n=400, which += which_group return pval[beta[which] == 0], pval[beta[which] != 0] -@set_seed_iftrue(SET_SEED) +@set_seed_iftrue(True) def test_all_targets(n=100, p=20, signal_fac=1.5, s=5, sigma=3, rho=0.4): for target in ['full', 'selected', 'debiased']: test_group_lasso(n=n, From 9b6a065f8d32560883b969ca9b4191c2432b2cd6 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 1 Apr 2020 21:22:02 -0700 Subject: [PATCH 018/187] one more missing import --- selectinf/sampling/tests/test_sample_sphere.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/selectinf/sampling/tests/test_sample_sphere.py b/selectinf/sampling/tests/test_sample_sphere.py index 07d858b0c..cef1b08ec 100644 --- a/selectinf/sampling/tests/test_sample_sphere.py +++ b/selectinf/sampling/tests/test_sample_sphere.py @@ -2,7 +2,7 @@ import nose import nose.tools as nt import numpy as np -import numpy.testing.decorators as dec +from numpy.testing import dec from scipy.stats import chi import nose.tools as nt From 82bc49170de606ecb20120b6d91e8c0942e73758 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 1 Apr 2020 21:31:20 -0700 Subject: [PATCH 019/187] fix to travis yaml --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 7b9c78817..958fc13bb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -93,7 +93,7 @@ install: - if [ "$RUN_R_TESTS" ]; then sudo apt-get install -y r-base r-base-dev r-cran-devtools r-cran-rcpp; pip install rpy2 statsmodels -c constraints.txt ; - Rscript -e "library(Rcpp); Rcpp::compileAttributes('selectiveInference')"; + # Rscript -e "library(Rcpp); Rcpp::compileAttributes('selectiveInference')"; -- dont need this line sudo Rscript -e "install.packages(c('glmnet', 'intervals', 'adaptMCMC', 'SLOPE', 'knockoff'), repos='http://cloud.r-project.org')"; git clone https://github.com/jonathan-taylor/R-selective.git; cd R-selective; From 357cb4c5908638044b6c092d888ca5e635697f9d Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 1 Apr 2020 21:35:45 -0700 Subject: [PATCH 020/187] comment causing problem in travis --- .travis.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 958fc13bb..63d030948 100644 --- a/.travis.yml +++ b/.travis.yml @@ -93,7 +93,6 @@ install: - if [ "$RUN_R_TESTS" ]; then sudo apt-get install -y r-base r-base-dev r-cran-devtools r-cran-rcpp; pip install rpy2 statsmodels -c constraints.txt ; - # Rscript -e "library(Rcpp); 
Rcpp::compileAttributes('selectiveInference')"; -- dont need this line sudo Rscript -e "install.packages(c('glmnet', 'intervals', 'adaptMCMC', 'SLOPE', 'knockoff'), repos='http://cloud.r-project.org')"; git clone https://github.com/jonathan-taylor/R-selective.git; cd R-selective; From 38eb9983e9d834af96775c092e24604dc6ca5e92 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 1 Apr 2020 22:16:36 -0700 Subject: [PATCH 021/187] older version of glmnet for older version of R --- .travis.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 63d030948..e24d2afe0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -93,7 +93,9 @@ install: - if [ "$RUN_R_TESTS" ]; then sudo apt-get install -y r-base r-base-dev r-cran-devtools r-cran-rcpp; pip install rpy2 statsmodels -c constraints.txt ; - sudo Rscript -e "install.packages(c('glmnet', 'intervals', 'adaptMCMC', 'SLOPE', 'knockoff'), repos='http://cloud.r-project.org')"; + sudo Rscript -e "install.packages(c('devtools', 'intervals', 'adaptMCMC', 'SLOPE'), repos='http://cloud.r-project.org')"; + sudo Rscript -e "require(devtools); install_version('glmnet', version='2.0.18', repos='http://cloud.r-project.org')"; + sudo Rscript -e "install.packages('knockoff', repos='http://cloud.r-project.org')"; git clone https://github.com/jonathan-taylor/R-selective.git; cd R-selective; git submodule init; From 798718a03e8fd8b1b3841450dbe4fd75a391ed60 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 1 Apr 2020 22:19:09 -0700 Subject: [PATCH 022/187] forcing to be an ndarray --- selectinf/tests/instance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/selectinf/tests/instance.py b/selectinf/tests/instance.py index 8c096b9ab..15826a148 100644 --- a/selectinf/tests/instance.py +++ b/selectinf/tests/instance.py @@ -364,7 +364,7 @@ def HIV_NRTI(drug='3TC', NRTI_specific = NRTI.from_records(np.array(NRTI_specific).T, columns=NRTI_muts) X_NRTI = np.array(NRTI_specific, np.float) - Y = NRTI[drug] # shorthand + Y = np.asarray(NRTI[drug]) # shorthand keep = ~np.isnan(Y).astype(np.bool) X_NRTI = X_NRTI[np.nonzero(keep)]; Y=Y[keep] Y = np.array(np.log(Y), np.float); From f552441122d01f3fd37fbe7d51a0e97e275b971e Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 1 Apr 2020 22:33:06 -0700 Subject: [PATCH 023/187] fixing version of glmnet --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index e24d2afe0..cc6a7ba64 100644 --- a/.travis.yml +++ b/.travis.yml @@ -94,7 +94,7 @@ install: sudo apt-get install -y r-base r-base-dev r-cran-devtools r-cran-rcpp; pip install rpy2 statsmodels -c constraints.txt ; sudo Rscript -e "install.packages(c('devtools', 'intervals', 'adaptMCMC', 'SLOPE'), repos='http://cloud.r-project.org')"; - sudo Rscript -e "require(devtools); install_version('glmnet', version='2.0.18', repos='http://cloud.r-project.org')"; + sudo Rscript -e "require(devtools); install_version('glmnet', version='2.0-18', repos='http://cloud.r-project.org')"; sudo Rscript -e "install.packages('knockoff', repos='http://cloud.r-project.org')"; git clone https://github.com/jonathan-taylor/R-selective.git; cd R-selective; From 048b18c916a2f3583b40da77dd55bbb316d49b4e Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 1 Apr 2020 23:01:00 -0700 Subject: [PATCH 024/187] try doc build with 3.6 --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index cc6a7ba64..dd7a5620c 100644 --- 
a/.travis.yml +++ b/.travis.yml @@ -70,7 +70,7 @@ matrix: env: - INSTALL_TYPE=requirements - DEPENDS= - - python: 3.6 + - python: 3.5 sudo: true env: - DOC_BUILD=1 From 2808d308d820cb4aef6b82e195286a6e38016734 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 1 Apr 2020 23:31:13 -0700 Subject: [PATCH 025/187] removing doc build for now --- .travis.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index dd7a5620c..24d8ebb83 100644 --- a/.travis.yml +++ b/.travis.yml @@ -70,10 +70,6 @@ matrix: env: - INSTALL_TYPE=requirements - DEPENDS= - - python: 3.5 - sudo: true - env: - - DOC_BUILD=1 before_install: - source travis-tools/utils.sh From 44bdd2b75bde1e12a1a4b0937915cc1fde88073d Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Thu, 2 Apr 2020 00:37:35 -0700 Subject: [PATCH 026/187] py35 build on appveyor failing for pandas / cython issue --- appveyor.yml | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 86ae986cd..6e121c1ee 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -28,9 +28,6 @@ environment: - PYTHON: C:\Python36-x64 NP_BUILD_DEP: "1.13.3" NP_TEST_DEP: "1.13.3" - - PYTHON: C:\Python35-x64 - NP_BUILD_DEP: "1.13.3" - NP_TEST_DEP: "1.13.3" - PYTHON: C:\Python37 NP_BUILD_DEP: "1.14.5" @@ -39,9 +36,14 @@ environment: - PYTHON: C:\Python36 NP_BUILD_DEP: "1.13.3" NP_TEST_DEP: "1.13.3" - - PYTHON: C:\Python35 - NP_BUILD_DEP: "1.13.3" - NP_TEST_DEP: "1.13.3" + + # problem with pandas + cython for py35 + # - PYTHON: C:\Python35-x64 + # NP_BUILD_DEP: "1.13.3" + # NP_TEST_DEP: "1.13.3" + # - PYTHON: C:\Python35 + # NP_BUILD_DEP: "1.13.3" + # NP_TEST_DEP: "1.13.3" install: - cmd: echo "Using cmd" From 245411ab564b9a4b7b4e6dbec95caef855c06cd7 Mon Sep 17 00:00:00 2001 From: snigdhagit Date: Thu, 16 Apr 2020 15:01:56 -0400 Subject: [PATCH 027/187] added class for posterior sampling --- selectinf/algorithms/api.py | 8 +- selectinf/algorithms/sqrt_lasso.py | 6 +- selectinf/constraints/affine.py | 12 +- selectinf/randomized/lasso.py | 13 +- selectinf/randomized/posterior_inference.py | 137 +++++++++++++++++++ selectinf/randomized/query.py | 2 +- selectinf/randomized/tests/test_lasso.py | 14 +- selectinf/randomized/tests/test_posterior.py | 70 ++++++++++ selectinf/sampling/api.py | 2 +- 9 files changed, 237 insertions(+), 27 deletions(-) create mode 100644 selectinf/randomized/posterior_inference.py create mode 100644 selectinf/randomized/tests/test_posterior.py diff --git a/selectinf/algorithms/api.py b/selectinf/algorithms/api.py index f15caa897..786bb2f5e 100644 --- a/selectinf/algorithms/api.py +++ b/selectinf/algorithms/api.py @@ -1,13 +1,13 @@ -from .lasso import (lasso, +from .lasso import (lasso, ROSI, data_carving as data_carving_lasso, additive_noise as additive_noise_lasso) -from .sqrt_lasso import (choose_lambda as choose_lambda_sqrt_lasso, +from .sqrt_lasso import (choose_lambda as choose_lambda_sqrt_lasso, solve_sqrt_lasso) -from .forward_step import (forward_step, +from .forward_step import (forward_step, info_crit_stop) -from .covtest import (covtest, +from .covtest import (covtest, selected_covtest) diff --git a/selectinf/algorithms/sqrt_lasso.py b/selectinf/algorithms/sqrt_lasso.py index 8bb03c5ef..36512c082 100644 --- a/selectinf/algorithms/sqrt_lasso.py +++ b/selectinf/algorithms/sqrt_lasso.py @@ -15,10 +15,10 @@ from regreg.smooth.glm import gaussian_loglike from regreg.affine import astransform -from ..constraints.affine import (constraints as affine_constraints, +from 
selectinf.constraints.affine import (constraints as affine_constraints, sample_from_sphere) -from ..distributions.discrete_multiparameter import multiparameter_family -from ..distributions.discrete_family import discrete_family +from selectinf.distributions.discrete_multiparameter import multiparameter_family +from selectinf.distributions.discrete_family import discrete_family class sqlasso_objective(rr.smooth_atom): """ diff --git a/selectinf/constraints/affine.py b/selectinf/constraints/affine.py index da4f30817..8be98688b 100644 --- a/selectinf/constraints/affine.py +++ b/selectinf/constraints/affine.py @@ -17,18 +17,18 @@ import numpy as np -from ..distributions.pvalue import truncnorm_cdf, norm_interval -from ..truncated.gaussian import truncated_gaussian, truncated_gaussian_old -from ..sampling.api import (sample_truncnorm_white, +from selectinf.distributions.pvalue import truncnorm_cdf, norm_interval +from selectinf.truncated.gaussian import truncated_gaussian, truncated_gaussian_old +from selectinf.sampling.api import (sample_truncnorm_white, sample_truncnorm_white_sphere, sample_truncnorm_white_ball) -from ..distributions.chain import (reversible_markov_chain, +from selectinf.distributions.chain import (reversible_markov_chain, parallel_test, serial_test) -from .estimation import optimal_tilt +from selectinf.constraints.estimation import optimal_tilt -from ..distributions.discrete_family import discrete_family +from selectinf.distributions.discrete_family import discrete_family from mpmath import mp WARNINGS = False diff --git a/selectinf/randomized/lasso.py b/selectinf/randomized/lasso.py index 9c73512ca..c9a5ec466 100644 --- a/selectinf/randomized/lasso.py +++ b/selectinf/randomized/lasso.py @@ -7,13 +7,13 @@ import regreg.api as rr -from ..algorithms.sqrt_lasso import solve_sqrt_lasso, choose_lambda +from selectinf.algorithms.sqrt_lasso import solve_sqrt_lasso, choose_lambda -from .query import gaussian_query +from selectinf.randomized.query import gaussian_query -from .randomization import randomization -from ..base import restricted_estimator -from ..algorithms.debiased_lasso import (debiasing_matrix, +from selectinf.randomized.randomization import randomization +from selectinf.base import restricted_estimator +from selectinf.algorithms.debiased_lasso import (debiasing_matrix, pseudoinverse_debiasing_matrix) #### High dimensional version @@ -224,6 +224,9 @@ def signed_basis_vector(p, j, s): if num_opt_var > 0: self._setup_sampler(*self._setup_sampler_data) + self.A_scaling = A_scaling + self.b_scaling = b_scaling + return active_signs def _solve_randomized_problem(self, diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py new file mode 100644 index 000000000..fa4f2bd1d --- /dev/null +++ b/selectinf/randomized/posterior_inference.py @@ -0,0 +1,137 @@ +from __future__ import division, print_function +import numpy as np, sys + +from selectinf.randomized.selective_MLE_utils import solve_barrier_affine as solve_barrier_affine_C +from scipy.stats import norm as ndist + +class posterior_inference_lasso(): + + def __init__(self, + observed_target, + cov_target, + cov_target_score, + feasible_point, + cond_mean, + cond_cov, + logdens_linear, + linear_part, + offset, + initial_estimate): + + self.ntarget = cov_target.shape[0] + self.nopt = cond_cov.shape[0] + + self.cond_precision = np.linalg.inv(cond_cov) + self.prec_target = np.linalg.inv(cov_target) + + self.observed_target = observed_target + self.cov_target_score = cov_target_score 
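A brief orientation to the class being added here; the symbols only rename quantities the code computes ($\Lambda$, $c$, $\Sigma_o$ are `linear_coef`, `offset_coef`, `cov_marginal` from `set_marginal_parameters`, $\hat\beta$ is `observed_target`, $\Sigma_T$ is `cov_target`). Conditional on selection, the target and the optimization variables of the randomized LASSO are modeled as jointly Gaussian; eliminating the optimization variables leaves, for a target parameter $\beta$, the law $N(\Lambda\beta + c, \Sigma_o)$ restricted to the affine selection region $\{o : Ao \le b\}$ (`linear_part`, `offset`). Up to constants free of $\beta$, the selection-adjusted log-likelihood evaluated in `log_posterior` is then

$$ -\tfrac{1}{2}(\hat\beta - \beta)^T \Sigma_T^{-1}(\hat\beta - \beta) \;-\; \log \int_{\{Ao \le b\}} \phi(o;\, \Lambda\beta + c,\, \Sigma_o)\, do, $$

with the integral approximated by the barrier-smoothed convex program `solve_barrier_affine`, whose solution also yields the extra gradient term consumed by the Langevin sampler defined further down.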
+ self.logdens_linear = logdens_linear + + self.feasible_point = feasible_point + self.cond_mean = cond_mean + self.linear_part = linear_part + self.offset = offset + + self.initial_estimate = initial_estimate + + self.set_marginal_parameters() + + def set_marginal_parameters(self): + + target_linear = -self.logdens_linear.dot(self.cov_target_score.T.dot(self.prec_target)) + + implied_precision = np.zeros((self.ntarget + self.nopt, self.ntarget + self.nopt)) + implied_precision[:self.ntarget, :self.ntarget] = (self.prec_target + target_linear.T.dot(self.cond_precision.dot(target_linear))) + implied_precision[:self.ntarget, self.ntarget:] = -target_linear.T.dot(self.cond_precision) + implied_precision[self.ntarget:, :self.ntarget] = (-target_linear.T.dot(self.cond_precision)).T + implied_precision[self.ntarget:, self.ntarget:] = self.cond_precision + + implied_cov = np.linalg.inv(implied_precision) + self.linear_coef = implied_cov[self.ntarget:, :self.ntarget].dot(self.prec_target) + + target_offset = self.cond_mean - target_linear.dot(self.observed_target) + M = implied_cov[self.ntarget:, self.ntarget:].dot(self.cond_precision.dot(target_offset)) + N = -target_linear.T.dot(self.cond_precision).dot(target_offset) + self.offset_coef = implied_cov[self.ntarget:, :self.ntarget].dot(N) + M + + self.cov_marginal = implied_cov[self.ntarget:, self.ntarget:] + + def prior(self, target_parameter, prior_var=100.): + + grad_prior = -target_parameter/prior_var + log_prior = -np.linalg.norm(target_parameter)/(2.*prior_var) + return grad_prior, log_prior + + def log_posterior(self, target_parameter, solve_args={'tol':1.e-12}): + + mean_marginal = self.linear_coef.dot(target_parameter) + self.offset_coef + prec_marginal = np.linalg.inv(self.cov_marginal) + conjugate_marginal = prec_marginal.dot(mean_marginal) + + solver = solve_barrier_affine_C + + val, soln, hess = solver(conjugate_marginal, + prec_marginal, + self.feasible_point, + self.linear_part, + self.offset, + **solve_args) + + log_normalizer = -val - mean_marginal.T.dot(prec_marginal).dot(mean_marginal)/2 + + log_lik = -((self.observed_target - target_parameter).T.dot(self.prec_target).dot(self.observed_target - target_parameter)) / 2.\ + - log_normalizer + + grad_lik = self.prec_target.dot(self.observed_target) - self.prec_target.dot(target_parameter) + \ + -self.linear_coef.T.dot(prec_marginal.dot(soln)- conjugate_marginal) + + grad_prior, log_prior = self.prior(target_parameter) + return grad_lik + grad_prior, log_lik + log_prior + + def posterior_sampler(self, nsample= 2000, nburnin=100, step=1.): + + state = self.initial_estimate + stepsize = 1. 
/ (step * self.ntarget) + + sampler = langevin(state, self.log_posterior, stepsize) + samples = np.zeros((nsample, self.ntarget)) + + for i in range(nsample): + sampler.next() + sys.stderr.write("sample number: " + str(i) + "sample: " + str(sampler.state.copy())+ "\n") + samples[i, :] = sampler.state.copy() + return samples[nburnin:, :] + +class langevin(object): + + def __init__(self, + initial_condition, + gradient_map, + stepsize): + + (self.state, + self.gradient_map, + self.stepsize) = (np.copy(initial_condition), + gradient_map, + stepsize) + self._shape = self.state.shape[0] + self._sqrt_step = np.sqrt(self.stepsize) + self._noise = ndist(loc=0,scale=1) + self.sample = np.copy(initial_condition) + + def __iter__(self): + return self + + def next(self): + while True: + grad_posterior = self.gradient_map(self.state) + candidate = (self.state + self.stepsize * grad_posterior[0] + + np.sqrt(2.)* self._noise.rvs(self._shape) * self._sqrt_step) + + if not np.all(np.isfinite(self.gradient_map(candidate)[0])): + self.stepsize *= 0.5 + self._sqrt_step = np.sqrt(self.stepsize) + else: + self.state[:] = candidate + break diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index b697afd85..eae063fcc 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -291,7 +291,7 @@ def log_density(logdens_linear, offset, cond_prec, opt, score): opt_offset, cond_precision) - self.cond_mean, self.cond_cov = cond_mean, cond_cov + self.cond_mean, self.cond_cov, self.logdens_linear = cond_mean, cond_cov, logdens_linear affine_con = constraints(A_scaling, b_scaling, diff --git a/selectinf/randomized/tests/test_lasso.py b/selectinf/randomized/tests/test_lasso.py index 13dae3769..4ee6a5291 100644 --- a/selectinf/randomized/tests/test_lasso.py +++ b/selectinf/randomized/tests/test_lasso.py @@ -5,13 +5,13 @@ import regreg.api as rr -from ..lasso import lasso, selected_targets, full_targets, debiased_targets -from ...tests.instance import gaussian_instance -from ...tests.flags import SET_SEED -from ...tests.decorators import set_sampling_params_iftrue, set_seed_iftrue -from ...algorithms.sqrt_lasso import choose_lambda, solve_sqrt_lasso -from ..randomization import randomization -from ...tests.decorators import rpy_test_safe +from selectinf.randomized.lasso import lasso, selected_targets, full_targets, debiased_targets +from selectinf.tests.instance import gaussian_instance +from selectinf.tests.flags import SET_SEED +from selectinf.tests.decorators import set_sampling_params_iftrue, set_seed_iftrue +from selectinf.algorithms.sqrt_lasso import choose_lambda, solve_sqrt_lasso +from selectinf.randomized.randomization import randomization +from selectinf.tests.decorators import rpy_test_safe def test_highdim_lasso(n=500, p=200, diff --git a/selectinf/randomized/tests/test_posterior.py b/selectinf/randomized/tests/test_posterior.py new file mode 100644 index 000000000..a0a7ffb10 --- /dev/null +++ b/selectinf/randomized/tests/test_posterior.py @@ -0,0 +1,70 @@ +import numpy as np +from selectinf.tests.instance import gaussian_instance +from selectinf.randomized.lasso import lasso, selected_targets +from selectinf.randomized.posterior_inference import posterior_inference_lasso + +def test_sampler(n=500, + p=100, + signal_fac=1., + s=5, + sigma=3., + rho=0.4, + randomizer_scale=1.): + + inst, const = gaussian_instance, lasso.gaussian + signal = np.sqrt(signal_fac * 2 * np.log(p)) + + X, Y, beta = inst(n=n, + p=p, + signal=signal, + s=s, + equicorrelated=False, + rho=rho, + 
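The `langevin` helper above implements unadjusted Langevin dynamics: with step size $h$ and log-posterior gradient $\nabla \log \pi$, each move is $x_{t+1} = x_t + h\,\nabla \log \pi(x_t) + \sqrt{2h}\,\xi_t$ with $\xi_t \sim N(0, I)$, and the step size is halved whenever the gradient at a proposed point is not finite. A toy illustration on a standard normal target, purely for intuition (none of these names come from the package):

```python
import numpy as np

def toy_langevin(grad_log_pi, x0, h=0.01, nstep=2000):
    # unadjusted Langevin: x <- x + h * grad_log_pi(x) + sqrt(2h) * N(0, I)
    x = np.array(x0, dtype=float)
    for _ in range(nstep):
        x = x + h * grad_log_pi(x) + np.sqrt(2 * h) * np.random.standard_normal(x.shape)
    return x

# target N(0, 1): grad log pi(x) = -x; the final state is an approximate draw
draw = toy_langevin(lambda x: -x, np.zeros(1))
```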
sigma=sigma, + random_signs=True)[:3] + + n, p = X.shape + + sigma_ = np.std(Y) + W = np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ + + conv = const(X, + Y, + W, + randomizer_scale=randomizer_scale * sigma_) + + signs = conv.fit() + nonzero = signs != 0 + + beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) + dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) + + (observed_target, + cov_target, + cov_target_score, + alternatives) = selected_targets(conv.loglike, + conv._W, + nonzero, + dispersion=dispersion) + + posterior_inf = posterior_inference_lasso(observed_target, + cov_target, + cov_target_score, + conv.observed_opt_state, + conv.cond_mean, + conv.cond_cov, + conv.logdens_linear, + conv.A_scaling, + conv.b_scaling, + observed_target) + + samples = posterior_inf.posterior_sampler(nsample=2000, nburnin=200, step=1.) + lci = np.percentile(samples, 5, axis=0) + uci = np.percentile(samples, 95, axis=0) + coverage = (lci < beta_target) * (uci > beta_target) + length = uci - lci + + print("check ", coverage, length) + + +test_sampler() diff --git a/selectinf/sampling/api.py b/selectinf/sampling/api.py index edc376c5b..6d7c63c3a 100644 --- a/selectinf/sampling/api.py +++ b/selectinf/sampling/api.py @@ -1,4 +1,4 @@ from .langevin import projected_langevin -from .truncnorm import (sample_truncnorm_white, +from .truncnorm import (sample_truncnorm_white, sample_truncnorm_white_sphere, sample_truncnorm_white_ball) From 0f5e4d6f020bb64ba70307e577af4e0fc36f205f Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Thu, 16 Apr 2020 12:41:03 -0700 Subject: [PATCH 028/187] reverting imports to original form, attributes can be found in other parts of lasso object for posterior --- selectinf/algorithms/api.py | 8 ++++---- selectinf/algorithms/sqrt_lasso.py | 6 +++--- selectinf/constraints/affine.py | 12 ++++++------ selectinf/randomized/lasso.py | 13 +++++-------- selectinf/randomized/query.py | 2 +- selectinf/randomized/tests/test_lasso.py | 14 +++++++------- selectinf/randomized/tests/test_posterior.py | 10 +++++++--- selectinf/sampling/api.py | 2 +- 8 files changed, 34 insertions(+), 33 deletions(-) diff --git a/selectinf/algorithms/api.py b/selectinf/algorithms/api.py index 786bb2f5e..f15caa897 100644 --- a/selectinf/algorithms/api.py +++ b/selectinf/algorithms/api.py @@ -1,13 +1,13 @@ -from .lasso import (lasso, +from .lasso import (lasso, ROSI, data_carving as data_carving_lasso, additive_noise as additive_noise_lasso) -from .sqrt_lasso import (choose_lambda as choose_lambda_sqrt_lasso, +from .sqrt_lasso import (choose_lambda as choose_lambda_sqrt_lasso, solve_sqrt_lasso) -from .forward_step import (forward_step, +from .forward_step import (forward_step, info_crit_stop) -from .covtest import (covtest, +from .covtest import (covtest, selected_covtest) diff --git a/selectinf/algorithms/sqrt_lasso.py b/selectinf/algorithms/sqrt_lasso.py index 36512c082..8bb03c5ef 100644 --- a/selectinf/algorithms/sqrt_lasso.py +++ b/selectinf/algorithms/sqrt_lasso.py @@ -15,10 +15,10 @@ from regreg.smooth.glm import gaussian_loglike from regreg.affine import astransform -from selectinf.constraints.affine import (constraints as affine_constraints, +from ..constraints.affine import (constraints as affine_constraints, sample_from_sphere) -from selectinf.distributions.discrete_multiparameter import multiparameter_family -from selectinf.distributions.discrete_family import discrete_family +from ..distributions.discrete_multiparameter import multiparameter_family +from 
..distributions.discrete_family import discrete_family class sqlasso_objective(rr.smooth_atom): """ diff --git a/selectinf/constraints/affine.py b/selectinf/constraints/affine.py index 8be98688b..da4f30817 100644 --- a/selectinf/constraints/affine.py +++ b/selectinf/constraints/affine.py @@ -17,18 +17,18 @@ import numpy as np -from selectinf.distributions.pvalue import truncnorm_cdf, norm_interval -from selectinf.truncated.gaussian import truncated_gaussian, truncated_gaussian_old -from selectinf.sampling.api import (sample_truncnorm_white, +from ..distributions.pvalue import truncnorm_cdf, norm_interval +from ..truncated.gaussian import truncated_gaussian, truncated_gaussian_old +from ..sampling.api import (sample_truncnorm_white, sample_truncnorm_white_sphere, sample_truncnorm_white_ball) -from selectinf.distributions.chain import (reversible_markov_chain, +from ..distributions.chain import (reversible_markov_chain, parallel_test, serial_test) -from selectinf.constraints.estimation import optimal_tilt +from .estimation import optimal_tilt -from selectinf.distributions.discrete_family import discrete_family +from ..distributions.discrete_family import discrete_family from mpmath import mp WARNINGS = False diff --git a/selectinf/randomized/lasso.py b/selectinf/randomized/lasso.py index c9a5ec466..9c73512ca 100644 --- a/selectinf/randomized/lasso.py +++ b/selectinf/randomized/lasso.py @@ -7,13 +7,13 @@ import regreg.api as rr -from selectinf.algorithms.sqrt_lasso import solve_sqrt_lasso, choose_lambda +from ..algorithms.sqrt_lasso import solve_sqrt_lasso, choose_lambda -from selectinf.randomized.query import gaussian_query +from .query import gaussian_query -from selectinf.randomized.randomization import randomization -from selectinf.base import restricted_estimator -from selectinf.algorithms.debiased_lasso import (debiasing_matrix, +from .randomization import randomization +from ..base import restricted_estimator +from ..algorithms.debiased_lasso import (debiasing_matrix, pseudoinverse_debiasing_matrix) #### High dimensional version @@ -224,9 +224,6 @@ def signed_basis_vector(p, j, s): if num_opt_var > 0: self._setup_sampler(*self._setup_sampler_data) - self.A_scaling = A_scaling - self.b_scaling = b_scaling - return active_signs def _solve_randomized_problem(self, diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index eae063fcc..b697afd85 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -291,7 +291,7 @@ def log_density(logdens_linear, offset, cond_prec, opt, score): opt_offset, cond_precision) - self.cond_mean, self.cond_cov, self.logdens_linear = cond_mean, cond_cov, logdens_linear + self.cond_mean, self.cond_cov = cond_mean, cond_cov affine_con = constraints(A_scaling, b_scaling, diff --git a/selectinf/randomized/tests/test_lasso.py b/selectinf/randomized/tests/test_lasso.py index 4ee6a5291..13dae3769 100644 --- a/selectinf/randomized/tests/test_lasso.py +++ b/selectinf/randomized/tests/test_lasso.py @@ -5,13 +5,13 @@ import regreg.api as rr -from selectinf.randomized.lasso import lasso, selected_targets, full_targets, debiased_targets -from selectinf.tests.instance import gaussian_instance -from selectinf.tests.flags import SET_SEED -from selectinf.tests.decorators import set_sampling_params_iftrue, set_seed_iftrue -from selectinf.algorithms.sqrt_lasso import choose_lambda, solve_sqrt_lasso -from selectinf.randomized.randomization import randomization -from selectinf.tests.decorators import rpy_test_safe +from ..lasso import 
lasso, selected_targets, full_targets, debiased_targets +from ...tests.instance import gaussian_instance +from ...tests.flags import SET_SEED +from ...tests.decorators import set_sampling_params_iftrue, set_seed_iftrue +from ...algorithms.sqrt_lasso import choose_lambda, solve_sqrt_lasso +from ..randomization import randomization +from ...tests.decorators import rpy_test_safe def test_highdim_lasso(n=500, p=200, diff --git a/selectinf/randomized/tests/test_posterior.py b/selectinf/randomized/tests/test_posterior.py index a0a7ffb10..8a028d3a7 100644 --- a/selectinf/randomized/tests/test_posterior.py +++ b/selectinf/randomized/tests/test_posterior.py @@ -47,15 +47,19 @@ def test_sampler(n=500, nonzero, dispersion=dispersion) + A_scaling = conv.sampler.affine_con.linear_part + b_scaling = conv.sampler.affine_con.offset + logdens_linear = conv.sampler.logdens_transform[0] + posterior_inf = posterior_inference_lasso(observed_target, cov_target, cov_target_score, conv.observed_opt_state, conv.cond_mean, conv.cond_cov, - conv.logdens_linear, - conv.A_scaling, - conv.b_scaling, + logdens_linear, + A_scaling, + b_scaling, observed_target) samples = posterior_inf.posterior_sampler(nsample=2000, nburnin=200, step=1.) diff --git a/selectinf/sampling/api.py b/selectinf/sampling/api.py index 6d7c63c3a..edc376c5b 100644 --- a/selectinf/sampling/api.py +++ b/selectinf/sampling/api.py @@ -1,4 +1,4 @@ from .langevin import projected_langevin -from .truncnorm import (sample_truncnorm_white, +from .truncnorm import (sample_truncnorm_white, sample_truncnorm_white_sphere, sample_truncnorm_white_ball) From da2fcb36582dc7c4a579815789e34183f424b522 Mon Sep 17 00:00:00 2001 From: snigdhagit Date: Sat, 18 Apr 2020 17:36:52 -0400 Subject: [PATCH 029/187] corrected prior --- selectinf/randomized/posterior_inference.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index fa4f2bd1d..a4d0a89e6 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -60,7 +60,7 @@ def set_marginal_parameters(self): def prior(self, target_parameter, prior_var=100.): grad_prior = -target_parameter/prior_var - log_prior = -np.linalg.norm(target_parameter)/(2.*prior_var) + log_prior = -np.linalg.norm(target_parameter)**2 /(2.*prior_var) return grad_prior, log_prior def log_posterior(self, target_parameter, solve_args={'tol':1.e-12}): @@ -83,7 +83,7 @@ def log_posterior(self, target_parameter, solve_args={'tol':1.e-12}): log_lik = -((self.observed_target - target_parameter).T.dot(self.prec_target).dot(self.observed_target - target_parameter)) / 2.\ - log_normalizer - grad_lik = self.prec_target.dot(self.observed_target) - self.prec_target.dot(target_parameter) + \ + grad_lik = self.prec_target.dot(self.observed_target) - self.prec_target.dot(target_parameter) \ -self.linear_coef.T.dot(prec_marginal.dot(soln)- conjugate_marginal) grad_prior, log_prior = self.prior(target_parameter) From f2c51ebb94fa93d2a64433425d8a6c6b905e72b7 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Tue, 21 Apr 2020 15:53:22 -0700 Subject: [PATCH 030/187] able to use degenerate gaussian randomization for e.g. 
followup LASSO --- selectinf/randomized/lasso.py | 10 +- selectinf/randomized/posterior_inference.py | 6 +- selectinf/randomized/query.py | 119 +++++++---- selectinf/randomized/randomization.py | 38 ++++ selectinf/randomized/screening.py | 11 +- selectinf/randomized/tests/test_BH.py | 79 ++++---- .../randomized/tests/test_drop_losers.py | 186 ++++++++++++++++++ selectinf/randomized/tests/test_lasso.py | 49 ++--- .../tests/test_marginal_screening.py | 40 ++-- .../randomized/tests/test_multiple_queries.py | 19 +- selectinf/randomized/tests/test_posterior.py | 2 - .../tests/test_selective_MLE_high.py | 19 +- .../tests/test_selective_MLE_onedim.py | 22 ++- selectinf/randomized/tests/test_slope.py | 71 ++++--- .../randomized/tests/test_split_lasso.py | 27 +-- selectinf/randomized/tests/test_topK.py | 25 +-- 16 files changed, 532 insertions(+), 191 deletions(-) create mode 100644 selectinf/randomized/tests/test_drop_losers.py diff --git a/selectinf/randomized/lasso.py b/selectinf/randomized/lasso.py index 9c73512ca..38d90ed04 100644 --- a/selectinf/randomized/lasso.py +++ b/selectinf/randomized/lasso.py @@ -310,7 +310,10 @@ def gaussian(X, """ - loglike = rr.glm.gaussian(X, Y, coef=1. / sigma ** 2, quadratic=quadratic) + loglike = rr.glm.gaussian(X, + Y, + coef=1. / sigma ** 2, + quadratic=quadratic) n, p = X.shape mean_diag = np.mean((X ** 2).sum(0)) @@ -324,7 +327,8 @@ def gaussian(X, return lasso(loglike, np.asarray(feature_weights) / sigma ** 2, - ridge_term, randomizer) + ridge_term, + randomizer) @staticmethod def logistic(X, @@ -1001,8 +1005,6 @@ def gaussian(X, quadratic=quadratic) n, p = X.shape - mean_diag = np.mean((X ** 2).sum(0)) - return split_lasso(loglike, np.asarray(feature_weights) / sigma ** 2, proportion) diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index fa4f2bd1d..99b8be9e2 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -1,7 +1,8 @@ from __future__ import division, print_function -import numpy as np, sys -from selectinf.randomized.selective_MLE_utils import solve_barrier_affine as solve_barrier_affine_C +import numpy as np + +from .selective_MLE_utils import solve_barrier_affine as solve_barrier_affine_C from scipy.stats import norm as ndist class posterior_inference_lasso(): @@ -99,7 +100,6 @@ def posterior_sampler(self, nsample= 2000, nburnin=100, step=1.): for i in range(nsample): sampler.next() - sys.stderr.write("sample number: " + str(i) + "sample: " + str(sampler.state.copy())+ "\n") samples[i, :] = sampler.state.copy() return samples[nburnin:, :] diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index b697afd85..06396878e 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -2,6 +2,7 @@ from itertools import product import numpy as np +import pandas as pd from scipy.stats import norm as ndist from scipy.optimize import bisect @@ -75,7 +76,10 @@ def randomize(self, perturb=None): """ if not self._randomized: - self.randomized_loss, self._initial_omega = self.randomization.randomize(self.loss, self.epsilon, perturb=perturb) + (self.randomized_loss, + self._initial_omega) = self.randomization.randomize(self.loss, + self.epsilon, + perturb=perturb) self._randomized = True def get_sampler(self): @@ -170,10 +174,6 @@ def summary(self, normal_sample=target_sample, alternatives=alternatives) - MLE_intervals = self.selective_MLE(observed_target, - target_cov, - target_score_cov)[5] - if not 
np.all(parameter == 0): pvalues = self.sampler.coefficient_pvalues(observed_target, target_cov, @@ -185,22 +185,34 @@ def summary(self, else: pvalues = pivots - intervals = None + result = pd.DataFrame({'target':observed_target, + 'pvalue':pvalues}) + if compute_intervals: - MLE_intervals = self.selective_MLE(observed_target, - target_cov, - target_score_cov)[4] + MLE = query.selective_MLE(self, + observed_target, + target_cov, + target_score_cov)[0] + MLE_intervals = np.asarray(MLE[['lower', 'upper']]) + + intervals = self.sampler.confidence_intervals( + observed_target, + target_cov, + target_score_cov, + sample=(opt_sample, logW), + normal_sample=target_sample, + initial_guess=MLE_intervals, + level=level) + + result.insert(2, 'lower', intervals[:,0]) + result.insert(3, 'upper', intervals[:,1]) - intervals = self.sampler.confidence_intervals(observed_target, - target_cov, - target_score_cov, - sample=(opt_sample, logW), - normal_sample=target_sample, - initial_guess=MLE_intervals, - level=level) + if not np.all(parameter == 0): + result.insert(4, 'pivot', pivots) + result.insert(5, 'parameter', parameter) - return pivots, pvalues, intervals + return result def selective_MLE(self, observed_target, @@ -260,15 +272,16 @@ def fit(self, perturb=None): # Private methods def _setup_sampler(self, - A_scaling, - b_scaling, + linear_part, + offset, opt_linear, opt_offset, # optional dispersion parameter # for covariance of randomization dispersion=1): - if not np.all(A_scaling.dot(self.observed_opt_state) - b_scaling <= 0): + A, b = linear_part, offset + if not np.all(A.dot(self.observed_opt_state) - b <= 0): raise ValueError('constraints not satisfied') (cond_mean, @@ -293,8 +306,8 @@ def log_density(logdens_linear, offset, cond_prec, opt, score): self.cond_mean, self.cond_cov = cond_mean, cond_cov - affine_con = constraints(A_scaling, - b_scaling, + affine_con = constraints(A, + b, mean=cond_mean, covariance=cond_cov) @@ -441,7 +454,7 @@ def summary(self, if not np.all(parameter == 0): pvalues = self.coefficient_pvalues(observed_target, - parameter=parameter, + parameter=np.zeros_like(observed_target), alternatives=alternatives) else: pvalues = pivots @@ -451,8 +464,16 @@ def summary(self, intervals = self.confidence_intervals(observed_target, level) - return pivots, pvalues, intervals - + result = pd.DataFrame({'target':observed_target, + 'pvalue':pvalues, + 'lower':intervals[:,0], + 'upper':intervals[:,1]}) + + if not np.all(parameter == 0): + result.insert(4, 'pivot', pivots) + result.insert(5, 'parameter', parameter) + + return result def coefficient_pvalues(self, observed_target, @@ -1275,7 +1296,6 @@ def _rootL(gamma): delta *= 2 count += 1 lower = bisect(_rootL, Ll, Ul) - return lower + observed_stat, upper + observed_stat # Private methods @@ -1501,9 +1521,10 @@ def _solve_barrier_nonneg(conjugate_arg, def selective_MLE(observed_target, target_cov, target_score_cov, - init_soln, # initial (observed) value of optimization variables -- - # used as a feasible point. - # precise value used only for independent estimator + init_soln, # initial (observed) value of + # optimization variables -- used as a + # feasible point. precise value used + # only for independent estimator cond_mean, cond_cov, logdens_linear, @@ -1601,11 +1622,19 @@ def selective_MLE(observed_target, alpha = 1 - level quantile = ndist.ppf(1 - alpha / 2.) 
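#   [editorial aside, not part of the committed patch] The lines that follow
#   build Wald-type intervals centered at the selective MLE.  A minimal,
#   hedged sketch of the same construction, using hypothetical stand-in names
#   `mle` (the point estimate) and `mle_cov` (its estimated covariance,
#   playing the role of `observed_info_mean` here):
#
#       import numpy as np
#       from scipy.stats import norm as ndist
#       se = np.sqrt(np.diag(mle_cov))          # standard errors
#       q = ndist.ppf(0.975)                    # two-sided 95% quantile
#       lower, upper = mle - q * se, mle + q * se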
- intervals = np.vstack([final_estimator - quantile * np.sqrt(np.diag(observed_info_mean)), - final_estimator + quantile * np.sqrt(np.diag(observed_info_mean))]).T - - return final_estimator, observed_info_mean, Z_scores, pvalues, intervals, ind_unbiased_estimator - + intervals = np.vstack([final_estimator - + quantile * np.sqrt(np.diag(observed_info_mean)), + final_estimator + + quantile * np.sqrt(np.diag(observed_info_mean))]).T + + result = pd.DataFrame({'MLE':final_estimator, + 'SE':np.sqrt(np.diag(observed_info_mean)), + 'Zvalue':Z_scores, + 'pvalue':pvalues, + 'lower':intervals[:,0], + 'upper':intervals[:,1], + 'unbiased':ind_unbiased_estimator}) + return result, observed_info_mean def normalizing_constant(target_parameter, observed_target, @@ -1717,3 +1746,25 @@ def normalizing_constant(target_parameter, soln[:ntarget], hess[:ntarget][:,:ntarget]) + +def _bisect(f, lb, ub, min_iter=20, max_iter=100, tol=1.e-3): + + while True: + sign_l = np.sign(f(lb)) + sign_u = np.sign(f(ub)) + mid = 0.5 * (lb + ub) + f_mid = f(mid) + if sign_l == 1: + if f_mid > 0: # we should move closer to upper + lb = mid + else: + ub = mid + else: + if f_mid > 0: # we should move closer to lower + ub = mid + else: + lb = mid + + if np.fabs(f_mid) < tol: + break + return mid diff --git a/selectinf/randomized/randomization.py b/selectinf/randomized/randomization.py index f7dd4e10b..54437990a 100644 --- a/selectinf/randomized/randomization.py +++ b/selectinf/randomized/randomization.py @@ -168,6 +168,44 @@ def gaussian(covariance): log_density = lambda x: -np.sum(sqrt_precision.dot(np.atleast_2d(x).T)**2, 0) * 0.5 - np.log(_const), cov_prec=(covariance, precision)) + @staticmethod + def degenerate_gaussian(covariance, tol=1.e-6): + """ + Gaussian noise with a given covariance. + Parameters + ---------- + covariance : np.float((*,*)) + Positive definite covariance matrix. Non-negative definite + will raise an error. + """ + p = covariance.shape[0] + U, D, _ = np.linalg.svd(covariance) + keep = D > D.max() * tol + rank = keep.sum() + sqrt_cov = U[:,keep].dot(np.diag(np.sqrt(D[keep]))) + sqrt_precision = U[:,keep].dot(np.diag(1./np.sqrt(D[keep]))) + precision = sqrt_precision.dot(sqrt_precision.T) + _const = 1. 
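#   [editorial aside, not part of the committed patch] With the SVD pieces
#   computed above, sqrt_cov = U[:, keep] diag(sqrt(D[keep])) and
#   sqrt_precision = U[:, keep] diag(1 / sqrt(D[keep])), so the product
#   covariance.dot(sqrt_precision) collapses back to sqrt_cov.  The `sampler`
#   lambda defined just below therefore draws from the rank-deficient normal
#   by computing sqrt_cov.dot(z) with z ~ N(0, I_rank), the usual square-root
#   construction for sampling a degenerate Gaussian.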
+ density = lambda x: np.exp(-(x * precision.dot(x)).sum() / 2) / _const + cdf = lambda x: None + pdf = lambda x: None + derivative_log_density = lambda x: None + grad_negative_log_density = lambda x: precision.dot(x) + sampler = lambda size: covariance.dot(sqrt_precision.dot(np.random.standard_normal((rank,) + size))) + + return randomization((p,), + density, + cdf, + pdf, + derivative_log_density, + grad_negative_log_density, + sampler, + lipschitz=(1/D[keep]).max(), + log_density = lambda x: -np.sum(sqrt_precision.T.dot(np.atleast_2d(x).T)**2, 0) * 0.5 - np.log(_const), + cov_prec=(covariance, precision)) + + + @staticmethod def laplace(shape, scale): """ diff --git a/selectinf/randomized/screening.py b/selectinf/randomized/screening.py index 0aab6d341..b87ae0027 100644 --- a/selectinf/randomized/screening.py +++ b/selectinf/randomized/screening.py @@ -39,7 +39,10 @@ def multivariate_targets(self, features, dispersion=1.): crosscov_target_score = -score_linear.dot(cov_target) alternatives = ['twosided'] * features.sum() - return observed_target, cov_target * dispersion, crosscov_target_score.T * dispersion, alternatives + return (observed_target, + cov_target * dispersion, + crosscov_target_score.T * dispersion, + alternatives) def full_targets(self, features, dispersion=1.): """ @@ -104,7 +107,7 @@ def fit(self, perturb=None): self.num_opt_var = self.observed_opt_state.shape[0] opt_linear = np.zeros((p, self.num_opt_var)) - opt_linear[self._selected,:] = np.diag(active_signs) + opt_linear[self._selected] = np.diag(active_signs) opt_offset = np.zeros(p) opt_offset[self._selected] = active_signs * self.threshold[self._selected] opt_offset[self._not_selected] = _randomized_score[self._not_selected] @@ -324,7 +327,7 @@ def fit(self, perturb=None): self.num_opt_var = self.observed_opt_state.shape[0] opt_linear = np.zeros((p, self.num_opt_var)) - opt_linear[self._selected,:] = np.diag(topK_signs) + opt_linear[self._selected] = np.diag(topK_signs) opt_offset = np.zeros(p) else: @@ -342,7 +345,7 @@ def fit(self, perturb=None): self.num_opt_var = self.observed_opt_state.shape[0] opt_linear = np.zeros((p, self.num_opt_var)) - opt_linear[self._selected,:] = np.identity(self.num_opt_var) + opt_linear[self._selected] = np.identity(self.num_opt_var) opt_offset = np.zeros(p) # in both cases, this conditioning means we just need to compute diff --git a/selectinf/randomized/tests/test_BH.py b/selectinf/randomized/tests/test_BH.py index 07192cfcb..e581c6350 100644 --- a/selectinf/randomized/tests/test_BH.py +++ b/selectinf/randomized/tests/test_BH.py @@ -53,31 +53,36 @@ def test_independent_estimator(n=100, n1=50, q=0.2, signal=3, p=100): perturb = Zbar1 - Zbar frac = n1 * 1. 
/ n - BH_select = stepup.BH(Zbar, np.identity(p) / n, np.sqrt((1 - frac) / (n * frac)), q=q) + BH_select = stepup.BH(Zbar, np.identity(p) / n, + np.sqrt((1 - frac) / (n * frac)), q=q) selected = BH_select.fit(perturb=perturb) observed_target = Zbar[selected] cov_target = np.identity(selected.sum()) / n cross_cov = -np.identity(p)[selected] / n - observed_target1, cov_target1, cross_cov1, _ = BH_select.marginal_targets(selected) - - assert(np.linalg.norm(observed_target - observed_target1) / np.linalg.norm(observed_target) < 1.e-7) - assert(np.linalg.norm(cov_target - cov_target1) / np.linalg.norm(cov_target) < 1.e-7) - assert(np.linalg.norm(cross_cov - cross_cov1) / np.linalg.norm(cross_cov) < 1.e-7) - - (final_estimator, - _, - Z_scores, - pvalues, - intervals, - ind_unbiased_estimator) = BH_select.selective_MLE(observed_target, cov_target, cross_cov) - + (observed_target1, + cov_target1, + cross_cov1, + _) = BH_select.marginal_targets(selected) + + assert(np.linalg.norm(observed_target - observed_target1) / + np.linalg.norm(observed_target) < 1.e-7) + assert(np.linalg.norm(cov_target - cov_target1) / + np.linalg.norm(cov_target) < 1.e-7) + assert(np.linalg.norm(cross_cov - cross_cov1) / np.linalg.norm(cross_cov) + < 1.e-7) + + result = BH_select.selective_MLE(observed_target, cov_target, cross_cov)[0] + Z = result['Zvalue'] + ind_unbiased_estimator = result['unbiased'] Zbar2 = Z[n1:].mean(0)[selected] - assert(np.linalg.norm(ind_unbiased_estimator - Zbar2) / np.linalg.norm(Zbar2) < 1.e-6) + assert(np.linalg.norm(ind_unbiased_estimator - Zbar2) + / np.linalg.norm(Zbar2) < 1.e-6) np.testing.assert_allclose(sorted(np.nonzero(selected)[0]), - sorted(BHfilter(2 * ndist.sf(np.fabs(np.sqrt(n1) * Zbar1))))) + sorted(BHfilter(2 * ndist.sf(np.fabs( + np.sqrt(n1) * Zbar1))))) def test_BH(n=500, @@ -133,28 +138,33 @@ def test_BH(n=500, if use_MLE: print('huh') - estimate, info, _, pval, intervals, _ = BH_select.selective_MLE(observed_target, - cov_target, - crosscov_target_score, - level=level) - pivots = ndist.cdf((estimate - beta_target) / np.sqrt(np.diag(info))) + result = BH_select.selective_MLE(observed_target, + cov_target, + crosscov_target_score, + level=level)[0] + estimate = result['MLE'] + pivots = ndist.cdf((estimate - beta_target) / result['SE']) pivots = 2 * np.minimum(pivots, 1 - pivots) # run summary else: - pivots, pval, intervals = BH_select.summary(observed_target, - cov_target, - crosscov_target_score, - alternatives, - compute_intervals=True, - level=level, - ndraw=20000, - burnin=2000, - parameter=beta_target) + result = BH_select.summary(observed_target, + cov_target, + crosscov_target_score, + alternatives, + compute_intervals=True, + level=level, + ndraw=20000, + burnin=2000, + parameter=beta_target) + pivots = np.asarray(result['pivot']) + pval = np.asarray(result['pvalue']) + lower = np.asarray(result['lower']) + upper = np.asarray(result['upper']) print(pval) - print("beta_target and intervals", beta_target, intervals) - coverage = (beta_target > intervals[:, 0]) * (beta_target < intervals[:, 1]) + print("beta_target and intervals", beta_target, result[['lower', 'upper']]) + coverage = (beta_target > lower) * (beta_target < upper) print("coverage for selected target", coverage.sum()/float(nonzero.sum())) - return pivots[beta_target == 0], pivots[beta_target != 0], coverage, intervals, pivots + return pivots[beta_target == 0], pivots[beta_target != 0], coverage, result[['lower', 'upper']], pivots else: return [], [], [], [], [] @@ -170,7 +180,8 @@ def main(nsim=500, 
use_MLE=True, marginal=False): P0, PA, cover, length_int = [], [], [], [] Ps = [] for i in range(nsim): - p0, pA, cover_, intervals, pivots = test_BH(use_MLE=use_MLE, marginal=marginal) + p0, pA, cover_, intervals, pivots = test_BH(use_MLE=use_MLE, + marginal=marginal) Ps.extend(pivots) cover.extend(cover_) P0.extend(p0) diff --git a/selectinf/randomized/tests/test_drop_losers.py b/selectinf/randomized/tests/test_drop_losers.py new file mode 100644 index 000000000..4d78d8afc --- /dev/null +++ b/selectinf/randomized/tests/test_drop_losers.py @@ -0,0 +1,186 @@ +import numpy as np, pandas as pd + +from ..drop_losers import drop_losers +from ..screening import topK +from ..randomization import randomization + +def test_drop_losers(p=50, + K=5, + n=300, + use_MLE=True): + + arm = [] + data = [] + stage = [] + for a in range(p): + N = int(np.random.poisson(n, size=(1,))) + arm.extend([a]*N) + stage.extend([1]*N) + data.extend(list(np.random.standard_normal(N))) + + df = pd.DataFrame({'arm':arm, + 'stage':stage, + 'data':data}) + + grouped = df.groupby('arm') + stage1_means = df.groupby('arm').mean().sort_values('data', ascending=False) + winners = list(stage1_means.index[:K]) + + for winner in winners: + N = int(np.random.poisson(30, size=(1,))) + arm.extend([winner]*N) + stage.extend([2]*N) + data.extend(list(np.random.standard_normal(N))) + + df = pd.DataFrame({'arm':arm, + 'stage':stage, + 'data':data}) + + dtl = drop_losers(df, + K=K) + + dtl.selective_MLE() + if not use_MLE: + result = dtl.summary(ndraw=20000, burnin=5000) + else: + result = dtl.selective_MLE()[0] + pvalue = np.asarray(result['pvalue']) + lower = np.asarray(result['lower']) + upper = np.asarray(result['upper']) + cover = (lower < 0) * (upper > 0) + + return pvalue, cover + +def test_compare_topK(p=20, + K=5, + n=100): + + arm = [] + data = [] + stage = [] + for a in range(p): + N = int(np.random.poisson(n, size=(1,))) + arm.extend([a]*N) + stage.extend([1]*N) + data.extend(list(np.random.standard_normal(N))) + + df1 = pd.DataFrame({'arm':arm, + 'stage':stage, + 'data':data}) + + grouped = df1.groupby('arm') + stage1_means = df1.groupby('arm').mean().sort_values('data', ascending=False) + winners = list(stage1_means.index[:K]) + + for winner in winners: + N = int(np.random.poisson(30, size=(1,))) + arm.extend([winner]*N) + stage.extend([2]*N) + data.extend(list(np.random.standard_normal(N))) + + df2 = pd.DataFrame({'arm':arm, + 'stage':stage, + 'data':data}) + + dtl = drop_losers(df2, + K=K) + + # need additional data for randomized api with non-degenerate covariance + + for a in range(p): + if a not in winners: + N = int(np.random.poisson(30, size=(1,))) + arm.extend([a]*N) + stage.extend([2]*N) + data.extend(list(np.random.standard_normal(N))) + + df_full = pd.DataFrame({'arm':arm, + 'stage':stage, + 'data':data}) + full_means = df_full.groupby('arm').mean()['data'].iloc[range(p)] + full_std = df_full.groupby('arm').std()['data'].iloc[range(p)] + n_1 = df1.groupby('arm').count()['data'].iloc[range(p)] + n_full = df_full.groupby('arm').count()['data'].iloc[range(p)] + print(n_1, n_full) + stage1_means = df1.groupby('arm').mean()['data'].iloc[range(p)] + perturb = np.array(stage1_means) - np.array(full_means) + + covariance = np.diag(np.array(full_std)**2 / np.array(n_full)) + randomizer = randomization.gaussian(np.diag(np.array(full_std)**2 / np.array(n_1)) - + covariance) + + randomized_topK = topK(full_means, + covariance, + randomizer, + K, + perturb=perturb) + + randomized_topK.fit(perturb=perturb) + + (observed_target, 
+ target_cov, + target_score_cov, + _) = randomized_topK.marginal_targets(randomized_topK.selection_variable['variables']) + + # try with a degenerate covariance now + + means2 = df2.groupby('arm').mean()['data'].iloc[range(p)] + std2 = df2.groupby('arm').std()['data'].iloc[range(p)] + n_2 = df2.groupby('arm').count()['data'].iloc[range(p)] + stage1_means = df1.groupby('arm').mean()['data'].iloc[range(p)] + perturb2 = np.array(stage1_means) - np.array(means2) + covariance2 = np.diag(np.array(std2)**2 / np.array(n_2)) + degenerate_randomizer = randomization.degenerate_gaussian( + np.diag(np.array(std2)**2 / + np.array(n_1)) - + covariance2) + + degenerate_topK = topK(means2, + covariance2, + degenerate_randomizer, + K, + perturb=perturb2) + + np.random.seed(0) + summary1 = randomized_topK.summary(observed_target, + target_cov, + target_score_cov, + alternatives=['twosided']*K, + ndraw=10000, + burnin=2000, + compute_intervals=True) + np.random.seed(0) + summary2 = dtl.summary(ndraw=10000, + burnin=2000) + + np.testing.assert_allclose(summary1['pvalue'], summary2['pvalue'], rtol=1.e-3) + np.testing.assert_allclose(summary1['target'], summary2['target'], rtol=1.e-3) + np.testing.assert_allclose(summary1['lower'], summary2['lower'], rtol=1.e-3) + np.testing.assert_allclose(summary1['upper'], summary2['upper'], rtol=1.e-3) + + np.random.seed(0) + degenerate_topK.fit(perturb=perturb2) + summary3 = degenerate_topK.summary(observed_target, + target_cov, + target_score_cov, + alternatives=['twosided']*K, + ndraw=10000, + burnin=2000, + compute_intervals=True) + + np.testing.assert_allclose(summary1['pvalue'], summary3['pvalue'], rtol=1.e-3) + np.testing.assert_allclose(summary1['target'], summary3['target'], rtol=1.e-3) + np.testing.assert_allclose(summary1['lower'], summary3['lower'], rtol=1.e-3) + np.testing.assert_allclose(summary1['upper'], summary3['upper'], rtol=1.e-3) + + +def main(nsim=100, use_MLE=True): + + P0, cover = [], [] + + for i in range(nsim): + p0, cover_ = test_drop_losers(use_MLE=use_MLE) + + cover.extend(cover_) + P0.extend(p0) + print('coverage', np.mean(cover)) diff --git a/selectinf/randomized/tests/test_lasso.py b/selectinf/randomized/tests/test_lasso.py index 13dae3769..01b5b110a 100644 --- a/selectinf/randomized/tests/test_lasso.py +++ b/selectinf/randomized/tests/test_lasso.py @@ -78,14 +78,15 @@ def test_highdim_lasso(n=500, nonzero, penalty=conv.penalty) - _, pval, intervals = conv.summary(observed_target, - cov_target, - cov_target_score, - alternatives, - ndraw=ndraw, - burnin=burnin, - compute_intervals=True) - + result = conv.summary(observed_target, + cov_target, + cov_target_score, + alternatives, + ndraw=ndraw, + burnin=burnin, + compute_intervals=True) + pval = result['pvalue'] + return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0] def test_AR_randomization(n=300, @@ -165,14 +166,15 @@ def test_AR_randomization(n=300, nonzero, penalty=conv.penalty) - _, pval, intervals = conv.summary(observed_target, - cov_target, - cov_target_score, - alternatives, - ndraw=ndraw, - burnin=burnin, - compute_intervals=True) - + result = conv.summary(observed_target, + cov_target, + cov_target_score, + alternatives, + ndraw=ndraw, + burnin=burnin, + compute_intervals=True) + pval = result['pvalue'] + return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0] def test_all_targets(n=100, p=20, signal_fac=1.5, s=5, sigma=3, rho=0.4): @@ -262,13 +264,14 @@ def test_sqrt_highdim_lasso(n=500, conv._W, nonzero) - _, pval, intervals = conv.summary(observed_target, - cov_target, - 
cov_target_score, - alternatives, - ndraw=ndraw, - burnin=burnin, - compute_intervals=False) + result = conv.summary(observed_target, + cov_target, + cov_target_score, + alternatives, + ndraw=ndraw, + burnin=burnin, + compute_intervals=False) + pval = result['pvalue'] return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0] diff --git a/selectinf/randomized/tests/test_marginal_screening.py b/selectinf/randomized/tests/test_marginal_screening.py index 6e7a564e0..e416cdade 100644 --- a/selectinf/randomized/tests/test_marginal_screening.py +++ b/selectinf/randomized/tests/test_marginal_screening.py @@ -57,17 +57,19 @@ def test_marginal(n=500, alternatives) = marginal_select.multivariate_targets(nonzero, dispersion=sigma**2) if use_MLE: - estimate, _, _, pval, intervals, _ = marginal_select.selective_MLE(observed_target, - cov_target, - crosscov_target_score) + result = marginal_select.selective_MLE(observed_target, + cov_target, + crosscov_target_score)[0] # run summary else: - _, pval, intervals = marginal_select.summary(observed_target, - cov_target, - crosscov_target_score, - alternatives, - compute_intervals=True) - + result = marginal_select.summary(observed_target, + cov_target, + crosscov_target_score, + alternatives, + compute_intervals=True) + + intervals = np.asarray(result[['lower', 'upper']]) + pval = result['pvalue'] print(pval) if marginal: beta_target = true_mean[nonzero] @@ -138,17 +140,19 @@ def test_simple(n=100, alternatives) = marginal_select.marginal_targets(nonzero) if use_MLE: - estimate, _, _, pval, intervals, _ = marginal_select.selective_MLE(observed_target, - cov_target, - crosscov_target_score) + result = marginal_select.selective_MLE(observed_target, + cov_target, + crosscov_target_score) # run summary else: - _, pval, intervals = marginal_select.summary(observed_target, - cov_target, - crosscov_target_score, - alternatives, - compute_intervals=True) - + result = marginal_select.summary(observed_target, + cov_target, + crosscov_target_score, + alternatives, + compute_intervals=True) + + pval = result['pvalue'] + intervals = np.asarray(result[['lower', 'upper']]) print(pval) beta_target = cov_target.dot(true_mean[nonzero]) print("beta_target and intervals", beta_target, intervals) diff --git a/selectinf/randomized/tests/test_multiple_queries.py b/selectinf/randomized/tests/test_multiple_queries.py index 03a921862..38c069f9e 100644 --- a/selectinf/randomized/tests/test_multiple_queries.py +++ b/selectinf/randomized/tests/test_multiple_queries.py @@ -12,7 +12,15 @@ from ...algorithms.sqrt_lasso import choose_lambda, solve_sqrt_lasso # the test here is marginal_screening + lasso -def test_multiple_queries(n=500, p=100, signal_fac=1.5, s=5, sigma=3, rho=0.4, randomizer_scale=1, ndraw=5000, burnin=1000): +def test_multiple_queries(n=500, + p=100, + signal_fac=1.5, + s=5, + sigma=3, + rho=0.4, + randomizer_scale=1, + ndraw=5000, + burnin=1000): inst, const1, const2 = gaussian_instance, marginal_screening, lasso.gaussian signal = np.sqrt(signal_fac * np.log(p)) @@ -63,10 +71,11 @@ def test_multiple_queries(n=500, p=100, signal_fac=1.5, s=5, sigma=3, rho=0.4, r mq = multiple_queries([conv1, conv2]) - _, pval, intervals = mq.summary(observed_target1, - [(cov_target1, cov_target_score1), (cov_target2, cov_target_score2)], - compute_intervals=True) - + results = mq.summary(observed_target1, + [(cov_target1, cov_target_score1), + (cov_target2, cov_target_score2)], + compute_intervals=True) + pval = np.asarray(results['pvalue']) return pval[beta[nonzero] == 0], 
pval[beta[nonzero] != 0] diff --git a/selectinf/randomized/tests/test_posterior.py b/selectinf/randomized/tests/test_posterior.py index 8a028d3a7..ba74d5a72 100644 --- a/selectinf/randomized/tests/test_posterior.py +++ b/selectinf/randomized/tests/test_posterior.py @@ -70,5 +70,3 @@ def test_sampler(n=500, print("check ", coverage, length) - -test_sampler() diff --git a/selectinf/randomized/tests/test_selective_MLE_high.py b/selectinf/randomized/tests/test_selective_MLE_high.py index 4a4d4a8a5..a8912718b 100644 --- a/selectinf/randomized/tests/test_selective_MLE_high.py +++ b/selectinf/randomized/tests/test_selective_MLE_high.py @@ -69,10 +69,12 @@ def test_full_targets(n=200, penalty=conv.penalty, dispersion=dispersion) - estimate, _, _, pval, intervals, _ = conv.selective_MLE(observed_target, - cov_target, - cov_target_score) - + result = conv.selective_MLE(observed_target, + cov_target, + cov_target_score)[0] + pval = result['pvalue'] + estimate = result['MLE'] + intervals = np.asarray(result[['lower', 'upper']]) print("estimate, intervals", estimate, intervals) coverage = (beta[nonzero] > intervals[:, 0]) * (beta[nonzero] < intervals[:, 1]) @@ -134,9 +136,12 @@ def test_selected_targets(n=2000, nonzero, dispersion=dispersion) - estimate, _, _, pval, intervals, _ = conv.selective_MLE(observed_target, - cov_target, - cov_target_score) + result = conv.selective_MLE(observed_target, + cov_target, + cov_target_score)[0] + pval = result['pvalue'] + estimate = result['MLE'] + intervals = np.asarray(result[['lower', 'upper']]) beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) diff --git a/selectinf/randomized/tests/test_selective_MLE_onedim.py b/selectinf/randomized/tests/test_selective_MLE_onedim.py index df3aea08d..9587991da 100644 --- a/selectinf/randomized/tests/test_selective_MLE_onedim.py +++ b/selectinf/randomized/tests/test_selective_MLE_onedim.py @@ -35,17 +35,27 @@ def test_onedim_lasso(n=50000, W=1.5, signal=2., sigma=1, randomizer_scale=1): conv._W, nonzero) - estimate_cur, I_cur, Z_cur, pv_cur = conv.selective_MLE(observed_target, - cov_target, - cov_target_score)[:4] + result = conv.selective_MLE(observed_target, + cov_target, + cov_target_score) + estimate_cur = float(result[0]['MLE']) + Z_cur = float(result[0]['Zvalue']) + pv_cur = float(result[0]['pvalue']) + I_cur = result[1] # this matches exactly with old code target_Z = X.T.dot(Y) / np.sqrt((X**2).sum(0)) - estimate, I, Z, pv = conv.sampler.selective_MLE(target_Z, sigma**2 * np.ones((1,1)), - -sigma**2 * np.ones((1,1)), np.ones((1,)), - solve_args={'tol':1.e-12})[:4] + result2 = conv.sampler.selective_MLE(target_Z, + sigma**2 * np.ones((1,1)), + -sigma**2 * np.ones((1,1)), + np.ones((1,)), + solve_args={'tol':1.e-12}) + estimate, I, Z, pv = (float(result2[0]['MLE']), + result2[1], + float(result2[0]['Zvalue']), + float(result2[0]['pvalue'])) target_transform = (-np.identity(1), np.zeros(1)) s = signs diff --git a/selectinf/randomized/tests/test_slope.py b/selectinf/randomized/tests/test_slope.py index 05d2ec257..5c31a848f 100644 --- a/selectinf/randomized/tests/test_slope.py +++ b/selectinf/randomized/tests/test_slope.py @@ -1,12 +1,12 @@ from ...tests.instance import gaussian_instance -import numpy as np +import numpy as np, pandas as pd from regreg.atoms.slope import slope as slope_atom import regreg.api as rr from ..slope import slope -from ..lasso import full_targets +from ..lasso import full_targets, selected_targets from ...tests.decorators import rpy_test_safe try: @@ -34,7 +34,7 @@ def slope_R(X, Y, W = 
None, normalize = True, choice_weights = "gaussian", sigma { if(choice_weights == "gaussian"){ lambda = "gaussian"} else{ - lambda = "bhq"} + lambda = "bh"} result = SLOPE(X, Y, fdr = fdr, lambda = lambda, normalize = normalize, sigma = sigma) } else{ result = SLOPE(X, Y, fdr = fdr, lambda = W, normalize = normalize, sigma = sigma) @@ -57,8 +57,8 @@ def slope_R(X, Y, W = None, normalize = True, choice_weights = "gaussian", sigma r_W = robjects.NA_Logical if choice_weights is "gaussian": r_choice_weights = robjects.StrVector('gaussian') - elif choice_weights is "bhq": - r_choice_weights = robjects.StrVector('bhq') + elif choice_weights is "bh": + r_choice_weights = robjects.StrVector('bh') else: r_W = robjects.r.matrix(W, nrow=p, ncol=1) @@ -69,12 +69,15 @@ def slope_R(X, Y, W = None, normalize = True, choice_weights = "gaussian", sigma result = r_slope(r_X, r_Y, r_W, r_normalize, r_choice_weights, r_sigma) - result = np.asarray(result.rx2('beta')), np.asarray(result.rx2('E')), \ - np.asarray(result.rx2('lambda_seq')), np.asscalar(np.array(result.rx2('sigma'))) + result = (np.asarray(result.rx2('beta')), + np.asarray(result.rx2('E')), + np.asarray(result.rx2('lambda_seq')).reshape(-1), + np.asscalar(np.array(result.rx2('sigma')))) rpy2.robjects.numpy2ri.deactivate() return result +@np.testing.dec.skipif(True, "extracting beta from SLOPE in R is troublesome here") @rpy_test_safe(libraries=['SLOPE']) def test_outputs_SLOPE_weights(n=500, p=100, signal_fac=1., s=5, sigma=3., rho=0.35): @@ -97,6 +100,7 @@ def test_outputs_SLOPE_weights(n=500, p=100, signal_fac=1., s=5, sigma=3., rho=0 normalize = True, choice_weights = "gaussian", sigma = sigma_) + print("estimated sigma", sigma_, r_sigma) print("weights output by R", r_lambda_seq) print("output of est coefs R", r_beta) @@ -108,11 +112,19 @@ def test_outputs_SLOPE_weights(n=500, p=100, signal_fac=1., s=5, sigma=3., rho=0 soln = problem.solve() print("output of est coefs python", soln) + print(r_beta, 'huh') print("relative difference in solns", np.linalg.norm(soln-r_beta)/np.linalg.norm(r_beta)) @rpy_test_safe(libraries=['SLOPE']) -def test_randomized_slope(n=500, p=100, signal_fac=1.2, s=5, sigma=1., rho=0.35, randomizer_scale= np.sqrt(0.25), - target = "full", use_MLE=True): +def test_randomized_slope(n=2000, + p=100, + signal_fac=1.5, + s=10, + sigma=1., + rho=0.35, + randomizer_scale=0.7, + target = "full", + use_MLE=True): while True: inst = gaussian_instance @@ -127,16 +139,10 @@ def test_randomized_slope(n=500, p=100, signal_fac=1.2, s=5, sigma=1., rho=0.35, random_signs=True)[:3] sigma_ = np.sqrt(np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p)) - r_beta, r_E, r_lambda_seq, r_sigma = slope_R(X, - Y, - W=None, - normalize=True, - choice_weights="gaussian", #put gaussian - sigma=sigma_) conv = slope.gaussian(X, Y, - r_sigma * r_lambda_seq, + np.linspace(3, 1, p) * sigma_, randomizer_scale=randomizer_scale * sigma_) signs = conv.fit() @@ -166,27 +172,36 @@ def test_randomized_slope(n=500, p=100, signal_fac=1.2, s=5, sigma=1., rho=0.35, beta_target = beta[nonzero] if use_MLE: - estimate, _, _, pval, intervals, _ = conv.selective_MLE(observed_target, - cov_target, - cov_target_score) + result = conv.selective_MLE(observed_target, + cov_target, + cov_target_score)[0] else: - _, pval, intervals = conv.summary(observed_target, - cov_target, - cov_target_score, - alternatives, - compute_intervals=True) - coverage = (beta_target > intervals[:, 0]) * (beta_target < intervals[:, 1]) + result = conv.summary(observed_target, + 
cov_target, + cov_target_score, + alternatives, + compute_intervals=True, + ndraw=150000) + pval = np.asarray(result['pvalue']) + lower = np.asarray(result['lower']) + upper = np.asarray(result['upper']) + + print(pd.DataFrame({'target':beta_target, + 'lower':lower, + 'upper':upper})) + + coverage = (beta_target > lower) * (beta_target < upper) break if True: - return pval[beta_target == 0], pval[beta_target != 0], coverage, intervals + return pval[beta_target == 0], pval[beta_target != 0], coverage, lower, upper -def main(nsim=100): +def main(nsim=100, use_MLE=True): P0, PA, cover, length_int = [], [], [], [] for i in range(nsim): - p0, pA, cover_, intervals = test_randomized_slope() + p0, pA, cover_, _, _ = test_randomized_slope(use_MLE=use_MLE) cover.extend(cover_) P0.extend(p0) diff --git a/selectinf/randomized/tests/test_split_lasso.py b/selectinf/randomized/tests/test_split_lasso.py index 768903e3f..68c78cd8d 100644 --- a/selectinf/randomized/tests/test_split_lasso.py +++ b/selectinf/randomized/tests/test_split_lasso.py @@ -92,18 +92,21 @@ def test_split_lasso(n=100, penalty=conv.penalty, dispersion=sigma**2) - _, pval, intervals = conv.summary(observed_target, - cov_target, - cov_target_score, - alternatives, - ndraw=ndraw, - burnin=burnin, - compute_intervals=False) - - final_estimator, observed_info_mean = conv.selective_MLE( - observed_target, - cov_target, - cov_target_score)[:2] + result = conv.summary(observed_target, + cov_target, + cov_target_score, + alternatives, + ndraw=ndraw, + burnin=burnin, + compute_intervals=False) + + MLE_result, observed_info_mean = conv.selective_MLE( + observed_target, + cov_target, + cov_target_score) + + final_estimator = np.asarray(MLE_result['MLE']) + pval = np.asarray(result['pvalue']) if target == 'selected': true_target = np.linalg.pinv(X[:,nonzero]).dot(X.dot(beta)) diff --git a/selectinf/randomized/tests/test_topK.py b/selectinf/randomized/tests/test_topK.py index 77984d545..83c7a6ac0 100644 --- a/selectinf/randomized/tests/test_topK.py +++ b/selectinf/randomized/tests/test_topK.py @@ -57,24 +57,27 @@ def test_topK(n=500, alternatives) = topK_select.multivariate_targets(nonzero, dispersion=sigma**2) if use_MLE: - estimate, _, _, pval, intervals, _ = topK_select.selective_MLE(observed_target, - cov_target, - crosscov_target_score) + result = topK_select.selective_MLE(observed_target, + cov_target, + crosscov_target_score)[0] # run summary else: - _, pval, intervals = topK_select.summary(observed_target, - cov_target, - crosscov_target_score, - alternatives, - compute_intervals=True) - + result = topK_select.summary(observed_target, + cov_target, + crosscov_target_score, + alternatives, + compute_intervals=True) + lower = np.asarray(result['lower']) + upper = np.asarray(result['upper']) + pval = result['pvalue'] + intervals = np.asarray(result[['lower', 'upper']]) print(pval) if marginal: beta_target = true_mean[nonzero] else: beta_target = beta[nonzero] - print("beta_target and intervals", beta_target, intervals) - coverage = (beta_target > intervals[:, 0]) * (beta_target < intervals[:, 1]) + print("beta_target and intervals", beta_target, lower, upper) + coverage = (beta_target > lower) * (beta_target < upper) print("coverage for selected target", coverage.sum()/float(nonzero.sum())) return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0], coverage, intervals From 9854fe3cd6faf3bb0f0fc07fb1155854297d96d1 Mon Sep 17 00:00:00 2001 From: snigdhagit Date: Sat, 2 May 2020 01:35:41 -0400 Subject: [PATCH 031/187] added a local scaling for the 
Langevin sampler --- selectinf/randomized/posterior_inference.py | 22 ++++++----- selectinf/randomized/query.py | 3 +- selectinf/randomized/tests/test_posterior.py | 40 +++++++++++++++++--- 3 files changed, 49 insertions(+), 16 deletions(-) diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index a4d0a89e6..6d9c69a36 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -2,7 +2,8 @@ import numpy as np, sys from selectinf.randomized.selective_MLE_utils import solve_barrier_affine as solve_barrier_affine_C -from scipy.stats import norm as ndist +from scipy.stats import norm as ndist, invgamma +from scipy.linalg import fractional_matrix_power class posterior_inference_lasso(): @@ -34,7 +35,6 @@ def __init__(self, self.offset = offset self.initial_estimate = initial_estimate - self.set_marginal_parameters() def set_marginal_parameters(self): @@ -57,10 +57,11 @@ def set_marginal_parameters(self): self.cov_marginal = implied_cov[self.ntarget:, self.ntarget:] - def prior(self, target_parameter, prior_var=100.): + def prior(self, target_parameter, scale=1., prior_var=100.): + + grad_prior = -target_parameter/(scale* prior_var) + log_prior = -np.linalg.norm(target_parameter)**2 /(2.* scale * prior_var) - grad_prior = -target_parameter/prior_var - log_prior = -np.linalg.norm(target_parameter)**2 /(2.*prior_var) return grad_prior, log_prior def log_posterior(self, target_parameter, solve_args={'tol':1.e-12}): @@ -87,14 +88,15 @@ def log_posterior(self, target_parameter, solve_args={'tol':1.e-12}): -self.linear_coef.T.dot(prec_marginal.dot(soln)- conjugate_marginal) grad_prior, log_prior = self.prior(target_parameter) + return grad_lik + grad_prior, log_lik + log_prior - def posterior_sampler(self, nsample= 2000, nburnin=100, step=1.): + def posterior_sampler(self, nsample= 2000, nburnin=100, local_scale = np.identity, step=1.): state = self.initial_estimate stepsize = 1. 
/ (step * self.ntarget) - sampler = langevin(state, self.log_posterior, stepsize) + sampler = langevin(state, self.log_posterior, local_scale, stepsize) samples = np.zeros((nsample, self.ntarget)) for i in range(nsample): @@ -108,6 +110,7 @@ class langevin(object): def __init__(self, initial_condition, gradient_map, + local_scale, stepsize): (self.state, @@ -115,6 +118,7 @@ def __init__(self, self.stepsize) = (np.copy(initial_condition), gradient_map, stepsize) + self.local_scale = local_scale self._shape = self.state.shape[0] self._sqrt_step = np.sqrt(self.stepsize) self._noise = ndist(loc=0,scale=1) @@ -126,8 +130,8 @@ def __iter__(self): def next(self): while True: grad_posterior = self.gradient_map(self.state) - candidate = (self.state + self.stepsize * grad_posterior[0] - + np.sqrt(2.)* self._noise.rvs(self._shape) * self._sqrt_step) + candidate = (self.state + self.stepsize * self.local_scale.dot(grad_posterior[0]) + + np.sqrt(2.)* (fractional_matrix_power(self.local_scale, 0.5).dot(self._noise.rvs(self._shape))) * self._sqrt_step) if not np.all(np.isfinite(self.gradient_map(candidate)[0])): self.stepsize *= 0.5 diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index eae063fcc..26f370409 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -1604,7 +1604,8 @@ def selective_MLE(observed_target, intervals = np.vstack([final_estimator - quantile * np.sqrt(np.diag(observed_info_mean)), final_estimator + quantile * np.sqrt(np.diag(observed_info_mean))]).T - return final_estimator, observed_info_mean, Z_scores, pvalues, intervals, ind_unbiased_estimator + return final_estimator, observed_info_mean, Z_scores, pvalues, intervals, ind_unbiased_estimator, \ + val + conjugate_arg.T.dot(cond_cov).dot(conjugate_arg)/2. def normalizing_constant(target_parameter, diff --git a/selectinf/randomized/tests/test_posterior.py b/selectinf/randomized/tests/test_posterior.py index a0a7ffb10..b67849eee 100644 --- a/selectinf/randomized/tests/test_posterior.py +++ b/selectinf/randomized/tests/test_posterior.py @@ -3,6 +3,7 @@ from selectinf.randomized.lasso import lasso, selected_targets from selectinf.randomized.posterior_inference import posterior_inference_lasso + def test_sampler(n=500, p=100, signal_fac=1., @@ -26,18 +27,19 @@ def test_sampler(n=500, n, p = X.shape sigma_ = np.std(Y) - W = np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ + dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) + + W = 1 * np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ conv = const(X, Y, W, - randomizer_scale=randomizer_scale * sigma_) + randomizer_scale=randomizer_scale * dispersion) signs = conv.fit() nonzero = signs != 0 beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) - dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) (observed_target, cov_target, @@ -47,6 +49,12 @@ def test_sampler(n=500, nonzero, dispersion=dispersion) + _, inverse_info, _, _, _, _, log_ref = conv.selective_MLE(observed_target, + cov_target, + cov_target_score) + + adaptive_ = np.linalg.inv(np.linalg.inv(inverse_info) + 1./100) + posterior_inf = posterior_inference_lasso(observed_target, cov_target, cov_target_score, @@ -58,13 +66,33 @@ def test_sampler(n=500, conv.b_scaling, observed_target) - samples = posterior_inf.posterior_sampler(nsample=2000, nburnin=200, step=1.) + samples = posterior_inf.posterior_sampler(nsample=2000, nburnin=200, local_scale = adaptive_, step=1.) 
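#   [editorial aside, not part of the committed patch] The `langevin` class
#   updated above now takes a preconditioner (`local_scale`, later renamed
#   `proposal_scale`).  A rough sketch of the unadjusted Langevin step it
#   performs, with preconditioner A (here the `adaptive_` matrix) and
#   stepsize h:
#
#       theta_new = (theta
#                    + h * A.dot(grad_log_posterior(theta))
#                    + np.sqrt(2 * h)
#                      * fractional_matrix_power(A, 0.5).dot(
#                            np.random.standard_normal(theta.shape[0])))
#
#   The 5th and 95th percentiles taken from the retained samples below then
#   give approximate 90% credible intervals for the selected coefficients.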
lci = np.percentile(samples, 5, axis=0) uci = np.percentile(samples, 95, axis=0) coverage = (lci < beta_target) * (uci > beta_target) length = uci - lci - print("check ", coverage, length) + return np.mean(coverage), np.mean(length) + + +def main(ndraw=10): + + coverage_ = 0. + length_ = 0. + for n in range(ndraw): + cov, len = test_sampler(n=400, + p=200, + signal_fac=1., + s=5, + sigma=2., + rho=0.4, + randomizer_scale=1.) + + coverage_ += cov + length_ += len + print("coverage so far ", coverage_ / (n + 1.)) + print("lengths so far ", length_ / (n + 1.)) + print("iteration completed ", n + 1) -test_sampler() +main(ndraw=10) From 5b74daf4eb46801a66bc5be6b51a078f250a68d5 Mon Sep 17 00:00:00 2001 From: snigdhagit Date: Sat, 2 May 2020 01:50:28 -0400 Subject: [PATCH 032/187] added prior var in test --- selectinf/randomized/posterior_inference.py | 10 ++++++---- selectinf/randomized/tests/test_posterior.py | 11 +++++++---- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index 6d9c69a36..1e444c080 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -17,7 +17,8 @@ def __init__(self, logdens_linear, linear_part, offset, - initial_estimate): + initial_estimate, + prior_var): self.ntarget = cov_target.shape[0] self.nopt = cond_cov.shape[0] @@ -35,6 +36,7 @@ def __init__(self, self.offset = offset self.initial_estimate = initial_estimate + self.prior_var = prior_var self.set_marginal_parameters() def set_marginal_parameters(self): @@ -57,10 +59,10 @@ def set_marginal_parameters(self): self.cov_marginal = implied_cov[self.ntarget:, self.ntarget:] - def prior(self, target_parameter, scale=1., prior_var=100.): + def prior(self, target_parameter, scale=1.): - grad_prior = -target_parameter/(scale* prior_var) - log_prior = -np.linalg.norm(target_parameter)**2 /(2.* scale * prior_var) + grad_prior = -target_parameter/(scale* self.prior_var) + log_prior = -np.linalg.norm(target_parameter)**2 /(2.* scale * self.prior_var) return grad_prior, log_prior diff --git a/selectinf/randomized/tests/test_posterior.py b/selectinf/randomized/tests/test_posterior.py index b67849eee..f9e58fffc 100644 --- a/selectinf/randomized/tests/test_posterior.py +++ b/selectinf/randomized/tests/test_posterior.py @@ -10,7 +10,8 @@ def test_sampler(n=500, s=5, sigma=3., rho=0.4, - randomizer_scale=1.): + randomizer_scale=1., + prior_var = 100.): inst, const = gaussian_instance, lasso.gaussian signal = np.sqrt(signal_fac * 2 * np.log(p)) @@ -53,7 +54,7 @@ def test_sampler(n=500, cov_target, cov_target_score) - adaptive_ = np.linalg.inv(np.linalg.inv(inverse_info) + 1./100) + adaptive_ = np.linalg.inv(np.linalg.inv(inverse_info) + 1/prior_var) posterior_inf = posterior_inference_lasso(observed_target, cov_target, @@ -64,7 +65,8 @@ def test_sampler(n=500, conv.logdens_linear, conv.A_scaling, conv.b_scaling, - observed_target) + observed_target, + prior_var) samples = posterior_inf.posterior_sampler(nsample=2000, nburnin=200, local_scale = adaptive_, step=1.) lci = np.percentile(samples, 5, axis=0) @@ -86,7 +88,8 @@ def main(ndraw=10): s=5, sigma=2., rho=0.4, - randomizer_scale=1.) 
+ randomizer_scale=1., + prior_var =100) coverage_ += cov length_ += len From 58e9d85da72daa07eceeae7858f398cd78571ddd Mon Sep 17 00:00:00 2001 From: snigdhagit Date: Sun, 3 May 2020 17:34:29 -0400 Subject: [PATCH 033/187] added both samplers to posterior inference --- selectinf/randomized/posterior_inference.py | 51 +++++++++++++++----- selectinf/randomized/tests/test_posterior.py | 23 +++++---- 2 files changed, 52 insertions(+), 22 deletions(-) diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index 1e444c080..731b9859a 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -18,6 +18,8 @@ def __init__(self, linear_part, offset, initial_estimate, + log_ref, + dispersion, prior_var): self.ntarget = cov_target.shape[0] @@ -37,6 +39,9 @@ def __init__(self, self.initial_estimate = initial_estimate self.prior_var = prior_var + self.dispersion = dispersion + self.log_ref = log_ref + self.set_marginal_parameters() def set_marginal_parameters(self): @@ -66,7 +71,7 @@ def prior(self, target_parameter, scale=1.): return grad_prior, log_prior - def log_posterior(self, target_parameter, solve_args={'tol':1.e-12}): + def log_posterior(self, target_parameter, scale=1., solve_args={'tol':1.e-12}): mean_marginal = self.linear_coef.dot(target_parameter) + self.offset_coef prec_marginal = np.linalg.inv(self.cov_marginal) @@ -91,28 +96,50 @@ def log_posterior(self, target_parameter, solve_args={'tol':1.e-12}): grad_prior, log_prior = self.prior(target_parameter) - return grad_lik + grad_prior, log_lik + log_prior + return self.dispersion * grad_lik/scale + grad_prior, self.dispersion * log_lik/scale + log_prior - (self.dispersion* self.log_ref / scale) - def posterior_sampler(self, nsample= 2000, nburnin=100, local_scale = np.identity, step=1.): + def langevin_sampler(self, nsample= 2000, nburnin=100, proposal_scale = np.identity, step=1.): state = self.initial_estimate stepsize = 1. / (step * self.ntarget) - sampler = langevin(state, self.log_posterior, local_scale, stepsize) + sampler = langevin(state, self.log_posterior, proposal_scale, stepsize) + samples = np.zeros((nsample, self.ntarget)) + + for i in range(nsample): + sampler.next(scaling_ = self.dispersion) + sys.stderr.write("sample number: " + str(i) + "sample: " + str(sampler.state.copy()) + "\n") + samples[i, :] = sampler.state.copy() + + return samples[nburnin:, :] + + def gibbs_sampler(self, nsample= 2000, nburnin=100, proposal_scale = np.identity, step=1.): + + state = self.initial_estimate + scale_state = self.dispersion + stepsize = 1. 
/step + + sampler = langevin(state, self.log_posterior, proposal_scale, stepsize) samples = np.zeros((nsample, self.ntarget)) for i in range(nsample): - sampler.next() - sys.stderr.write("sample number: " + str(i) + "sample: " + str(sampler.state.copy())+ "\n") + sampler.next(scaling_=scale_state) + scale_update = invgamma.rvs(a=(0.001 + self.ntarget), scale=0.001 - (scale_state * sampler.grad_posterior[1]), size=1) + + scale_state = scale_update samples[i, :] = sampler.state.copy() + sys.stderr.write("sample number: " + str(i) + "sample: " + str(samples[i, :]) + "\n") + sys.stderr.write("sample number: " + str(i) + "sigma: " + str(scale_state) + "\n") + return samples[nburnin:, :] + class langevin(object): def __init__(self, initial_condition, gradient_map, - local_scale, + proposal_scale, stepsize): (self.state, @@ -120,7 +147,7 @@ def __init__(self, self.stepsize) = (np.copy(initial_condition), gradient_map, stepsize) - self.local_scale = local_scale + self.proposal_scale = proposal_scale self._shape = self.state.shape[0] self._sqrt_step = np.sqrt(self.stepsize) self._noise = ndist(loc=0,scale=1) @@ -129,11 +156,11 @@ def __init__(self, def __iter__(self): return self - def next(self): + def next(self, scaling_): while True: - grad_posterior = self.gradient_map(self.state) - candidate = (self.state + self.stepsize * self.local_scale.dot(grad_posterior[0]) - + np.sqrt(2.)* (fractional_matrix_power(self.local_scale, 0.5).dot(self._noise.rvs(self._shape))) * self._sqrt_step) + self.grad_posterior = self.gradient_map(self.state, scaling_) + candidate = (self.state + self.stepsize * self.proposal_scale.dot(self.grad_posterior[0]) + + np.sqrt(2.)* (fractional_matrix_power(self.proposal_scale, 0.5).dot(self._noise.rvs(self._shape))) * self._sqrt_step) if not np.all(np.isfinite(self.gradient_map(candidate)[0])): self.stepsize *= 0.5 diff --git a/selectinf/randomized/tests/test_posterior.py b/selectinf/randomized/tests/test_posterior.py index f9e58fffc..556441b04 100644 --- a/selectinf/randomized/tests/test_posterior.py +++ b/selectinf/randomized/tests/test_posterior.py @@ -4,7 +4,7 @@ from selectinf.randomized.posterior_inference import posterior_inference_lasso -def test_sampler(n=500, +def test_Langevin(n=500, p=100, signal_fac=1., s=5, @@ -66,9 +66,11 @@ def test_sampler(n=500, conv.A_scaling, conv.b_scaling, observed_target, + log_ref, + dispersion, prior_var) - samples = posterior_inf.posterior_sampler(nsample=2000, nburnin=200, local_scale = adaptive_, step=1.) + samples = posterior_inf.langevin_sampler(nsample=2000, nburnin=200, proposal_scale=adaptive_, step=1.) lci = np.percentile(samples, 5, axis=0) uci = np.percentile(samples, 95, axis=0) coverage = (lci < beta_target) * (uci > beta_target) @@ -77,19 +79,20 @@ def test_sampler(n=500, return np.mean(coverage), np.mean(length) + def main(ndraw=10): coverage_ = 0. length_ = 0. 
for n in range(ndraw): - cov, len = test_sampler(n=400, - p=200, - signal_fac=1., - s=5, - sigma=2., - rho=0.4, - randomizer_scale=1., - prior_var =100) + cov, len = test_Langevin(n=500, + p=200, + signal_fac=1.5, + s=5, + sigma=2., + rho=0.2, + randomizer_scale=1., + prior_var =100) coverage_ += cov length_ += len From 202ee2e28ad7279c8b394fed998fc1611280ede1 Mon Sep 17 00:00:00 2001 From: snigdhagit Date: Mon, 4 May 2020 01:23:12 -0400 Subject: [PATCH 034/187] fixed subgradient in split lasso --- selectinf/randomized/lasso.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/selectinf/randomized/lasso.py b/selectinf/randomized/lasso.py index c9a5ec466..a4be3683d 100644 --- a/selectinf/randomized/lasso.py +++ b/selectinf/randomized/lasso.py @@ -928,15 +928,15 @@ def _solve_randomized_problem(self, quad = rr.identity_quadratic(self.ridge_term, 0, 0, - 0,) + 0) randomized_loss = self.loglike.subsample(self._selection_idx) randomized_loss.coef *= inv_frac problem = rr.simple_problem(randomized_loss, self.penalty) initial_soln = problem.solve(quad, **solve_args) - initial_subgrad = -(self.loglike.smooth_objective(initial_soln, - 'grad') + + initial_subgrad = -(randomized_loss.smooth_objective(initial_soln, + 'grad') + quad.objective(initial_soln, 'grad')) return initial_soln, initial_subgrad From bfc82d06e0e86830f0be39cee0dd5702a10ea7d2 Mon Sep 17 00:00:00 2001 From: snigdhagit Date: Sun, 17 May 2020 14:14:50 -0400 Subject: [PATCH 035/187] posterior samplers-- some changes --- selectinf/randomized/posterior_inference.py | 4 ++-- selectinf/randomized/tests/test_posterior.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index 731b9859a..2af6b12ea 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -117,14 +117,14 @@ def gibbs_sampler(self, nsample= 2000, nburnin=100, proposal_scale = np.identity state = self.initial_estimate scale_state = self.dispersion - stepsize = 1. /step + stepsize = 1. /(step* self.ntarget) sampler = langevin(state, self.log_posterior, proposal_scale, stepsize) samples = np.zeros((nsample, self.ntarget)) for i in range(nsample): sampler.next(scaling_=scale_state) - scale_update = invgamma.rvs(a=(0.001 + self.ntarget), scale=0.001 - (scale_state * sampler.grad_posterior[1]), size=1) + scale_update = invgamma.rvs(a=(0.1 + self.ntarget + self.ntarget/2), scale=0.1 - (scale_state * sampler.grad_posterior[1]), size=1) scale_state = scale_update samples[i, :] = sampler.state.copy() diff --git a/selectinf/randomized/tests/test_posterior.py b/selectinf/randomized/tests/test_posterior.py index 556441b04..f8c4973e9 100644 --- a/selectinf/randomized/tests/test_posterior.py +++ b/selectinf/randomized/tests/test_posterior.py @@ -70,7 +70,7 @@ def test_Langevin(n=500, dispersion, prior_var) - samples = posterior_inf.langevin_sampler(nsample=2000, nburnin=200, proposal_scale=adaptive_, step=1.) + samples = posterior_inf.langevin_sampler(nsample=2000, nburnin=200, proposal_scale=adaptive_, step=1) lci = np.percentile(samples, 5, axis=0) uci = np.percentile(samples, 95, axis=0) coverage = (lci < beta_target) * (uci > beta_target) @@ -79,7 +79,6 @@ def test_Langevin(n=500, return np.mean(coverage), np.mean(length) - def main(ndraw=10): coverage_ = 0. 
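The samplers added in the patches above share one preconditioned, unadjusted Langevin transition; gibbs_sampler only adds an inverse-gamma draw for the dispersion between moves. Below is a minimal numpy sketch of that transition, written to mirror the candidate computed inside langevin.next. The helper name langevin_step and the simplified gradient_map callable (assumed to return a (gradient, log-likelihood) pair and to take only the state, without the scaling_ argument threaded through for the Gibbs update) are illustrative assumptions, not part of these commits.

import numpy as np
from scipy.linalg import fractional_matrix_power

def langevin_step(state, gradient_map, proposal_scale, stepsize, rng=np.random):
    # one unadjusted Langevin move with preconditioner Gamma = proposal_scale:
    #   x_new = x + h * Gamma * grad(x) + sqrt(2 h) * Gamma^(1/2) * N(0, I)
    grad = gradient_map(state)[0]
    noise = rng.standard_normal(state.shape[0])
    sqrt_scale = fractional_matrix_power(proposal_scale, 0.5)
    return (state
            + stepsize * proposal_scale.dot(grad)
            + np.sqrt(2. * stepsize) * sqrt_scale.dot(noise))

On the fitted posterior_inference_lasso object the two entry points are invoked as in the tests above, for example posterior_inf.langevin_sampler(nsample=2000, nburnin=200, proposal_scale=adaptive_, step=1.), or posterior_inf.gibbs_sampler(...) with the same keywords when the dispersion should be resampled as well.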
From fa23e8902faf172602a32b530230212c89541ea7 Mon Sep 17 00:00:00 2001 From: snigdhagit Date: Mon, 22 Jun 2020 11:16:13 -0400 Subject: [PATCH 036/187] added test instances --- selectinf/randomized/posterior_inference.py | 6 +- selectinf/randomized/tests/test_posterior.py | 91 ++++++++++++++++--- .../tests/test_selective_MLE_high.py | 60 +++++++++++- 3 files changed, 137 insertions(+), 20 deletions(-) diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index 2af6b12ea..dbaee2faf 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -86,7 +86,7 @@ def log_posterior(self, target_parameter, scale=1., solve_args={'tol':1.e-12}): self.offset, **solve_args) - log_normalizer = -val - mean_marginal.T.dot(prec_marginal).dot(mean_marginal)/2 + log_normalizer = -val - mean_marginal.T.dot(prec_marginal).dot(mean_marginal)/2. log_lik = -((self.observed_target - target_parameter).T.dot(self.prec_target).dot(self.observed_target - target_parameter)) / 2.\ - log_normalizer @@ -96,7 +96,7 @@ def log_posterior(self, target_parameter, scale=1., solve_args={'tol':1.e-12}): grad_prior, log_prior = self.prior(target_parameter) - return self.dispersion * grad_lik/scale + grad_prior, self.dispersion * log_lik/scale + log_prior - (self.dispersion* self.log_ref / scale) + return self.dispersion * grad_lik/scale + grad_prior, self.dispersion * log_lik/scale + log_prior - (self.dispersion* self.log_ref/scale) def langevin_sampler(self, nsample= 2000, nburnin=100, proposal_scale = np.identity, step=1.): @@ -123,7 +123,7 @@ def gibbs_sampler(self, nsample= 2000, nburnin=100, proposal_scale = np.identity samples = np.zeros((nsample, self.ntarget)) for i in range(nsample): - sampler.next(scaling_=scale_state) + sampler.next(scaling_= scale_state) scale_update = invgamma.rvs(a=(0.1 + self.ntarget + self.ntarget/2), scale=0.1 - (scale_state * sampler.grad_posterior[1]), size=1) scale_state = scale_update diff --git a/selectinf/randomized/tests/test_posterior.py b/selectinf/randomized/tests/test_posterior.py index f8c4973e9..f18dc0bde 100644 --- a/selectinf/randomized/tests/test_posterior.py +++ b/selectinf/randomized/tests/test_posterior.py @@ -54,7 +54,11 @@ def test_Langevin(n=500, cov_target, cov_target_score) - adaptive_ = np.linalg.inv(np.linalg.inv(inverse_info) + 1/prior_var) + adaptive_ = np.linalg.inv(np.linalg.inv(inverse_info) + np.identity(observed_target.shape[0])/ prior_var) + + A_scaling = conv.sampler.affine_con.linear_part + b_scaling = conv.sampler.affine_con.offset + logdens_linear = conv.sampler.logdens_transform[0] posterior_inf = posterior_inference_lasso(observed_target, cov_target, @@ -62,9 +66,68 @@ def test_Langevin(n=500, conv.observed_opt_state, conv.cond_mean, conv.cond_cov, - conv.logdens_linear, - conv.A_scaling, - conv.b_scaling, + logdens_linear, + A_scaling, + b_scaling, + observed_target, + log_ref, ## extra argument introduced for Gibbs update of sigma + dispersion, ## scale back the likelihood if sigma is unknown + prior_var ## prior var for the Gaussian prior + ) + + samples = posterior_inf.langevin_sampler(nsample=2000, nburnin=200, proposal_scale=adaptive_, step=1) + lci = np.percentile(samples, 5, axis=0) + uci = np.percentile(samples, 95, axis=0) + coverage = (lci < beta_target) * (uci > beta_target) + length = uci - lci + + return np.mean(coverage), np.mean(length) + +def test_instance(): + + n, p, s = 500, 100, 5 + prior_var = 100. 
+ X = np.random.standard_normal((n, p)) + beta = np.zeros(p) + #beta[:s] = np.sqrt(2 * np.log(p) / n) + Y = X.dot(beta) + np.random.standard_normal(n) + + scale_ = np.std(Y) + # uses noise of variance n * scale_ / 4 by default + L = lasso.gaussian(X, Y, 3 * scale_ * np.sqrt(2 * np.log(p) * np.sqrt(n))) + signs = L.fit() + E = (signs != 0) + + M = E.copy() + M[-3:] = 1 + dispersion = np.linalg.norm(Y - X[:, M].dot(np.linalg.pinv(X[:, M]).dot(Y))) ** 2 / (n - M.sum()) + (observed_target, + cov_target, + cov_target_score, + alternatives) = selected_targets(L.loglike, + L._W, + M, + dispersion=dispersion) + + print("check shapes", observed_target.shape, E.sum()) + _, inverse_info, _, _, _, _, log_ref = L.selective_MLE(observed_target, + cov_target, + cov_target_score) + + adaptive_ = np.linalg.inv(np.linalg.inv(inverse_info) + np.identity(observed_target.shape[0])/ prior_var) + A_scaling = L.sampler.affine_con.linear_part + b_scaling = L.sampler.affine_con.offset + logdens_linear = L.sampler.logdens_transform[0] + + posterior_inf = posterior_inference_lasso(observed_target, + cov_target, + cov_target_score, + L.observed_opt_state, + L.cond_mean, + L.cond_cov, + logdens_linear, + A_scaling, + b_scaling, observed_target, log_ref, dispersion, @@ -73,6 +136,8 @@ def test_Langevin(n=500, samples = posterior_inf.langevin_sampler(nsample=2000, nburnin=200, proposal_scale=adaptive_, step=1) lci = np.percentile(samples, 5, axis=0) uci = np.percentile(samples, 95, axis=0) + + beta_target = np.linalg.pinv(X[:, M]).dot(X.dot(beta)) coverage = (lci < beta_target) * (uci > beta_target) length = uci - lci @@ -84,14 +149,16 @@ def main(ndraw=10): coverage_ = 0. length_ = 0. for n in range(ndraw): - cov, len = test_Langevin(n=500, - p=200, - signal_fac=1.5, - s=5, - sigma=2., - rho=0.2, - randomizer_scale=1., - prior_var =100) + # cov, len = test_Langevin(n=500, + # p=200, + # signal_fac=1.5, + # s=5, + # sigma=2., + # rho=0.2, + # randomizer_scale=1., + # prior_var =100) + + cov, len = test_instance() coverage_ += cov length_ += len diff --git a/selectinf/randomized/tests/test_selective_MLE_high.py b/selectinf/randomized/tests/test_selective_MLE_high.py index 4a4d4a8a5..ea98a9c02 100644 --- a/selectinf/randomized/tests/test_selective_MLE_high.py +++ b/selectinf/randomized/tests/test_selective_MLE_high.py @@ -1,8 +1,8 @@ import numpy as np import nose.tools as nt -from ..lasso import lasso, full_targets, selected_targets, debiased_targets -from ...tests.instance import gaussian_instance +from selectinf.randomized.lasso import lasso, full_targets, selected_targets, debiased_targets +from selectinf.tests.instance import gaussian_instance def test_full_targets(n=200, p=1000, @@ -81,7 +81,7 @@ def test_full_targets(n=200, def test_selected_targets(n=2000, p=200, - signal_fac=1., + signal_fac=10., s=5, sigma=3, rho=0.4, @@ -120,6 +120,7 @@ def test_selected_targets(n=2000, signs = conv.fit() nonzero = signs != 0 + print("dimensions", n, p, nonzero.sum()) if nonzero.sum() > 0: dispersion = None @@ -134,7 +135,7 @@ def test_selected_targets(n=2000, nonzero, dispersion=dispersion) - estimate, _, _, pval, intervals, _ = conv.selective_MLE(observed_target, + estimate, _, _, pval, intervals, _, _ = conv.selective_MLE(observed_target, cov_target, cov_target_score) @@ -148,7 +149,7 @@ def main(nsim=500, full=False): P0, PA, cover, length_int = [], [], [], [] from statsmodels.distributions import ECDF - n, p, s = 500, 100, 10 + n, p, s = 500, 100, 5 for i in range(nsim): if full: @@ -171,3 +172,52 @@ def main(nsim=500, 
full=False): np.array(PA) < 0.1, np.mean(P0), np.std(P0), np.mean(np.array(P0) < 0.1), np.mean(np.array(PA) < 0.1), np.mean(cover), np.mean(avg_length), 'null pvalue + power + length') + +def test_instance(): + + n, p, s = 500, 100, 5 + X = np.random.standard_normal((n, p)) + beta = np.zeros(p) + #beta[:s] = np.sqrt(2 * np.log(p) / n) + Y = X.dot(beta) + np.random.standard_normal(n) + + scale_ = np.std(Y) + # uses noise of variance n * scale_ / 4 by default + L = lasso.gaussian(X, Y, 3 * scale_ * np.sqrt(2 * np.log(p) * np.sqrt(n))) + signs = L.fit() + E = (signs != 0) + + M = E.copy() + M[-3:] = 1 + dispersion = np.linalg.norm(Y - X[:, M].dot(np.linalg.pinv(X[:, M]).dot(Y))) ** 2 / (n - M.sum()) + (observed_target, + cov_target, + cov_target_score, + alternatives) = selected_targets(L.loglike, + L._W, + M, + dispersion=dispersion) + + print("check shapes", observed_target.shape, E.sum()) + + estimate, _, _, pval, intervals, _, _ = L.selective_MLE(observed_target, + cov_target, + cov_target_score) + + beta_target = np.linalg.pinv(X[:, M]).dot(X.dot(beta)) + + coverage = (beta_target > intervals[:, 0]) * (beta_target < intervals[:, 1]) + + return coverage + +def main(nsim=500): + + cover = [] + for i in range(nsim): + + cover_ = test_instance() + cover.extend(cover_) + print(np.mean(cover), 'coverage so far ') + + +main(nsim=500) \ No newline at end of file From 19bbd55777ca5c11f8a329f309f5ac68cb48029d Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Tue, 23 Jun 2020 09:25:15 -0700 Subject: [PATCH 037/187] some cleanup --- selectinf/learning/utils.py | 2 ++ selectinf/randomized/posterior_inference.py | 18 +++++++++++++----- selectinf/randomized/query.py | 4 ++-- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/selectinf/learning/utils.py b/selectinf/learning/utils.py index d68bc5b6a..6a0cf897a 100644 --- a/selectinf/learning/utils.py +++ b/selectinf/learning/utils.py @@ -402,8 +402,10 @@ def naive_partial_model_inference(X, return pd.DataFrame({'naive_pivot':naive_pivots, 'naive_coverage':naive_covered, 'naive_length':naive_lengths, + 'naive_pvalue':naive_pvalues, 'nfeature':X.shape[1], 'naive_lower':naive_lower, + 'naive_upper':naive_upper, 'target':final_target, 'variable':observed_list }) diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index 99b8be9e2..19076431e 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -3,9 +3,10 @@ import numpy as np from .selective_MLE_utils import solve_barrier_affine as solve_barrier_affine_C +from .query import _solve_barrier_affine_py from scipy.stats import norm as ndist -class posterior_inference_lasso(): +class posterior_inference_lasso(object): def __init__(self, observed_target, @@ -17,7 +18,8 @@ def __init__(self, logdens_linear, linear_part, offset, - initial_estimate): + initial_estimate, + prior_var=100.): self.ntarget = cov_target.shape[0] self.nopt = cond_cov.shape[0] @@ -38,6 +40,8 @@ def __init__(self, self.set_marginal_parameters() + self.prior_var = prior_var + def set_marginal_parameters(self): target_linear = -self.logdens_linear.dot(self.cov_target_score.T.dot(self.prec_target)) @@ -58,8 +62,8 @@ def set_marginal_parameters(self): self.cov_marginal = implied_cov[self.ntarget:, self.ntarget:] - def prior(self, target_parameter, prior_var=100.): - + def prior(self, target_parameter): + prior_var = self.prior_var grad_prior = -target_parameter/prior_var log_prior = 
-np.linalg.norm(target_parameter)/(2.*prior_var) return grad_prior, log_prior @@ -70,7 +74,11 @@ def log_posterior(self, target_parameter, solve_args={'tol':1.e-12}): prec_marginal = np.linalg.inv(self.cov_marginal) conjugate_marginal = prec_marginal.dot(mean_marginal) - solver = solve_barrier_affine_C + useC = True + if useC: + solver = solve_barrier_affine_C + else: + solver = _solve_barrier_affine_py val, soln, hess = solver(conjugate_marginal, prec_marginal, diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index 06396878e..73d7aeb87 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -1600,7 +1600,7 @@ def selective_MLE(observed_target, if useC: solver = solve_barrier_affine_C else: - solver = solve_barrier_affine_py + solver = _solve_barrier_affine_py val, soln, hess = solver(conjugate_arg, prec_opt, @@ -1734,7 +1734,7 @@ def normalizing_constant(target_parameter, if useC: solver = solve_barrier_affine_C else: - solver = solve_barrier_affine_py + solver = _solve_barrier_affine_py value, soln, hess = solver(-linear_term, full_Q, From 3932618e6f4e3328ecf36b7bd31273a067f7c062 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Tue, 23 Jun 2020 09:30:11 -0700 Subject: [PATCH 038/187] using data frame summary output --- selectinf/algorithms/tests/test_compareR.py | 50 +++++++++++++++------ 1 file changed, 37 insertions(+), 13 deletions(-) diff --git a/selectinf/algorithms/tests/test_compareR.py b/selectinf/algorithms/tests/test_compareR.py index 51ba177cf..3727fe548 100644 --- a/selectinf/algorithms/tests/test_compareR.py +++ b/selectinf/algorithms/tests/test_compareR.py @@ -972,21 +972,45 @@ def test_rlasso_gaussian(): nonzero, penalty=L.penalty) - _, pval, intervals = L.summary(observed_target, - cov_target, - cov_target_score, - alternatives, - opt_sample=(np.asarray(R_opt_samples),), - target_sample=np.asarray(R_target_samples), - ndraw=8000,#ndraw, - burnin=burnin, - compute_intervals=True) + result = L.summary(observed_target, + cov_target, + cov_target_score, + alternatives, + opt_sample=(np.asarray(R_opt_samples),), + target_sample=np.asarray(R_target_samples), + ndraw=8000,#ndraw, + burnin=burnin, + compute_intervals=True) + pval = np.asarray(result['pvalue']) tol = 1.e-5 - yield np.testing.assert_allclose, initial_soln, R_soln, tol, tol, False, 'checking initial rlasso solution' - yield np.testing.assert_allclose, cond_mean, R_cond_mean, tol, tol, False, 'checking conditional mean' - yield np.testing.assert_allclose, cond_cov, R_cond_cov, tol, tol, False, 'checking conditional covariance' - yield np.testing.assert_allclose, pval, R_pvalues, tol, tol, False, 'checking pvalues' + yield (np.testing.assert_allclose, + initial_soln, + R_soln, + tol, + tol, + False, 'checking initial rlasso solution') + yield (np.testing.assert_allclose, + cond_mean, + R_cond_mean, + tol, + tol, + False, + 'checking conditional mean') + yield (np.testing.assert_allclose, + cond_cov, + R_cond_cov, + tol, + tol, + False, + 'checking conditional covariance') + yield (np.testing.assert_allclose, + pval, + R_pvalues, + tol, + tol, + False, + 'checking pvalues') break From af7705a75cd4c58ea97e63c739c12826b0bbbfae Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Tue, 23 Jun 2020 09:31:15 -0700 Subject: [PATCH 039/187] drop the losers query --- selectinf/randomized/drop_losers.py | 137 ++++++++++++++++++++++++++++ 1 file changed, 137 insertions(+) create mode 100644 selectinf/randomized/drop_losers.py diff --git a/selectinf/randomized/drop_losers.py 
b/selectinf/randomized/drop_losers.py new file mode 100644 index 000000000..be1287fc5 --- /dev/null +++ b/selectinf/randomized/drop_losers.py @@ -0,0 +1,137 @@ +from __future__ import print_function + +import numpy as np +import pandas as pd + +from .query import gaussian_query + +from .randomization import randomization + +class drop_losers(gaussian_query): + + def __init__(self, + df, # should have columns 'arm', 'stage', 'data' + K=1): # how many should we move forward? + + self.df = df + self.K = K + + grouped_arm = df.groupby('arm') + self.std = grouped_arm.std()['data'] + self.means = grouped_arm.mean()['data'] + self.stages = dict([(k, v) for k, v in df.groupby('stage')]) + stage1 = df['stage'].min() + stage2 = df['stage'].max() + + df1 = self.stages[stage1] + df2 = self.stages[stage2] + + stage1_means = df1.groupby('arm').mean().sort_values('data', ascending=False) + self._winners = sorted(list(stage1_means.index[:K])) + best_loser = stage1_means['data'].iloc[K] + + n1 = df1.groupby('arm').count() + n2 = df2.groupby('arm').count() + self._n1_win = n1_win = np.array([n1.loc[lambda df: df.index == winner]['data'].iloc[0] + for winner in self._winners]) + self._n2_win = n2_win = np.array([n2.loc[lambda df: df.index == winner]['data'].iloc[0] + for winner in self._winners]) + std_win = self.std.loc[self._winners] + + A = -np.identity(K) + b = -np.ones(K) * best_loser + linear = np.identity(K) + offset = np.zeros(K) + + # Work out the implied randomization variance + # Let X1=X[stage1].mean(), X2=X[stage2].mean() and Xf = X.mean() + # with n1=len(stage1), n2=len(stage2) + + # X1 = Xf + n2/n1 * (Xf-X2) + # = Xf + n2/(n1+n2) * (X1-X2) + # so randomization term is w=n2/(n1+n2) * (X1-X2) + # with variance + # n2**2 / (n1+n2)**2 * (1/n1 + 1/n2) + # = n2**2 / (n1+n2)**2 * (n1+n2) / (n1*n2) + # = n2 / (n1 * (n1 + n2)) + + mult = n2_win / (n1_win * (n1_win + n2_win)) + + # needed for gaussian_query api + + self.randomizer = randomization.gaussian(np.diag(std_win**2) * mult) + self.observed_opt_state = stage1_means['data'].iloc[:K] + self.observed_score_state = -self.means[self._winners] # problem is a minimization + self.selection_variable = {'winners':self._winners} + + self._setup_sampler(A, b, linear, offset) + + def selective_MLE(self, + level=0.9, + solve_args={'tol':1.e-12}): + """ + + Parameters + ---------- + + level : float, optional + Confidence level. + + solve_args : dict, optional + Arguments passed to solver. + + """ + + observed_target = self.means[self._winners] + std_win = self.std.loc[self._winners] + target_cov = np.diag(std_win**2 / (self._n1_win + self._n2_win)) + target_score_cov = -target_cov + + result = gaussian_query.selective_MLE(self, + observed_target, + target_cov, + target_score_cov, + level=level, + solve_args=solve_args) + result[0].insert(0, 'arm', self._winners) + return result + + def summary(self, + level=0.9, + ndraw=10000, + burnin=2000): + + """ + Produce p-values and confidence intervals for targets + of model including selected features + + Parameters + ---------- + + level : float + Confidence level. + + ndraw : int (optional) + Defaults to 1000. + + burnin : int (optional) + Defaults to 1000. 
+ + """ + observed_target = self.means[self._winners] + std_win = self.std.loc[self._winners] + target_cov = np.diag(std_win**2 / (self._n1_win + self._n2_win)) + target_score_cov = -target_cov + + result = gaussian_query.summary(self, + observed_target, + target_cov, + target_score_cov, + alternatives=['twosided']*self.K, + ndraw=ndraw, + level=level, + burnin=burnin, + compute_intervals=True) + result.insert(0, 'arm', self._winners) + return result + From 657a212178a2e44c21d69b299b574dd5d998c14e Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Tue, 23 Jun 2020 09:42:57 -0700 Subject: [PATCH 040/187] make sure we get no 0-sized samples --- selectinf/randomized/tests/test_drop_losers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/selectinf/randomized/tests/test_drop_losers.py b/selectinf/randomized/tests/test_drop_losers.py index 4d78d8afc..6322f5e66 100644 --- a/selectinf/randomized/tests/test_drop_losers.py +++ b/selectinf/randomized/tests/test_drop_losers.py @@ -73,7 +73,7 @@ def test_compare_topK(p=20, winners = list(stage1_means.index[:K]) for winner in winners: - N = int(np.random.poisson(30, size=(1,))) + N = int(np.random.poisson(30, size=(1,))) + 5 arm.extend([winner]*N) stage.extend([2]*N) data.extend(list(np.random.standard_normal(N))) @@ -89,7 +89,7 @@ def test_compare_topK(p=20, for a in range(p): if a not in winners: - N = int(np.random.poisson(30, size=(1,))) + N = int(np.random.poisson(30, size=(1,))) + 5 arm.extend([a]*N) stage.extend([2]*N) data.extend(list(np.random.standard_normal(N))) From 0b7d8a3065d0c3d25628e8654232421784d823bf Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Tue, 23 Jun 2020 09:44:05 -0700 Subject: [PATCH 041/187] import integer division --- selectinf/randomized/drop_losers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/selectinf/randomized/drop_losers.py b/selectinf/randomized/drop_losers.py index be1287fc5..ffe2804ca 100644 --- a/selectinf/randomized/drop_losers.py +++ b/selectinf/randomized/drop_losers.py @@ -1,4 +1,4 @@ -from __future__ import print_function +from __future__ import print_function, division import numpy as np import pandas as pd From 5d0b12808e56f6b7d04ab842bd7562306adf49c4 Mon Sep 17 00:00:00 2001 From: snigdhagit Date: Wed, 24 Jun 2020 11:31:45 -0400 Subject: [PATCH 042/187] added code for selective_mle related outputs --- selectinf/randomized/tests/test_cv_mle.py | 644 ++++++++++++++++++++++ 1 file changed, 644 insertions(+) create mode 100644 selectinf/randomized/tests/test_cv_mle.py diff --git a/selectinf/randomized/tests/test_cv_mle.py b/selectinf/randomized/tests/test_cv_mle.py new file mode 100644 index 000000000..052824840 --- /dev/null +++ b/selectinf/randomized/tests/test_cv_mle.py @@ -0,0 +1,644 @@ +import numpy as np, os, itertools +import pandas as pd + +import rpy2.robjects as rpy +from rpy2.robjects import numpy2ri +rpy.numpy2ri.activate() + +from scipy.stats import norm as ndist +from selectinf.randomized.lasso import lasso, full_targets, selected_targets, debiased_targets +#from selection.algorithms.lasso import lasso as lasso_full + +def sim_xy(n, p, nval, rho=0, s=5, beta_type=2, snr=1): + + rpy.r(''' + source('~/best-subset/bestsubset/R/sim.R') + sim_xy = sim.xy + ''') + + r_simulate = rpy.globalenv['sim_xy'] + sim = r_simulate(n, p, nval, rho, s, beta_type, snr) + X = np.array(sim.rx2('x')) + y = np.array(sim.rx2('y')) + X_val = np.array(sim.rx2('xval')) + y_val = np.array(sim.rx2('yval')) + Sigma = np.array(sim.rx2('Sigma')) + beta = 
np.array(sim.rx2('beta')) + sigma = np.array(sim.rx2('sigma')) + + return X, y, X_val, y_val, Sigma, beta, sigma + + +def selInf_R(X, y, beta, lam, sigma, Type, alpha=0.1): + rpy.r(''' + library("selectiveInference") + selInf = function(X, y, beta, lam, sigma, Type, alpha= 0.1){ + y = as.matrix(y) + X = as.matrix(X) + beta = as.matrix(beta) + lam = as.matrix(lam)[1,1] + sigma = as.matrix(sigma)[1,1] + Type = as.matrix(Type)[1,1] + if(Type == 1){ + type = "full"} else{ + type = "partial"} + inf = fixedLassoInf(x = X, y = y, beta = beta, lambda=lam, family = "gaussian", + intercept=FALSE, sigma=sigma, alpha=alpha, type=type) + return(list(ci = inf$ci, pvalue = inf$pv))} + ''') + + inf_R = rpy.globalenv['selInf'] + n, p = X.shape + r_X = rpy.r.matrix(X, nrow=n, ncol=p) + r_y = rpy.r.matrix(y, nrow=n, ncol=1) + r_beta = rpy.r.matrix(beta, nrow=p, ncol=1) + r_lam = rpy.r.matrix(lam, nrow=1, ncol=1) + r_sigma = rpy.r.matrix(sigma, nrow=1, ncol=1) + r_Type = rpy.r.matrix(Type, nrow=1, ncol=1) + output = inf_R(r_X, r_y, r_beta, r_lam, r_sigma, r_Type) + ci = np.array(output.rx2('ci')) + pvalue = np.array(output.rx2('pvalue')) + return ci, pvalue + + +def glmnet_lasso(X, y, lambda_val): + rpy.r(''' + library(glmnet) + glmnet_LASSO = function(X,y, lambda){ + y = as.matrix(y) + X = as.matrix(X) + lam = as.matrix(lambda)[1,1] + n = nrow(X) + + fit = glmnet(X, y, standardize=TRUE, intercept=FALSE, thresh=1.e-10) + estimate = coef(fit, s=lam, exact=TRUE, x=X, y=y)[-1] + fit.cv = cv.glmnet(X, y, standardize=TRUE, intercept=FALSE, thresh=1.e-10) + estimate.1se = coef(fit.cv, s='lambda.1se', exact=TRUE, x=X, y=y)[-1] + estimate.min = coef(fit.cv, s='lambda.min', exact=TRUE, x=X, y=y)[-1] + return(list(estimate = estimate, estimate.1se = estimate.1se, estimate.min = estimate.min, lam.min = fit.cv$lambda.min, lam.1se = fit.cv$lambda.1se)) + }''') + + lambda_R = rpy.globalenv['glmnet_LASSO'] + n, p = X.shape + r_X = rpy.r.matrix(X, nrow=n, ncol=p) + r_y = rpy.r.matrix(y, nrow=n, ncol=1) + r_lam = rpy.r.matrix(lambda_val, nrow=1, ncol=1) + + estimate = np.array(lambda_R(r_X, r_y, r_lam).rx2('estimate')) + estimate_1se = np.array(lambda_R(r_X, r_y, r_lam).rx2('estimate.1se')) + estimate_min = np.array(lambda_R(r_X, r_y, r_lam).rx2('estimate.min')) + lam_min = np.asscalar(np.array(lambda_R(r_X, r_y, r_lam).rx2('lam.min'))) + lam_1se = np.asscalar(np.array(lambda_R(r_X, r_y, r_lam).rx2('lam.1se'))) + return estimate, estimate_1se, estimate_min, lam_min, lam_1se + + +def coverage(intervals, pval, target, truth): + pval_alt = (pval[truth != 0]) < 0.1 + if pval_alt.sum() > 0: + avg_power = np.mean(pval_alt) + else: + avg_power = 0. 
+ return np.mean((target > intervals[:, 0]) * (target < intervals[:, 1])), avg_power + + +def BHfilter(pval, q=0.2): + rpy.r.assign('pval', pval) + rpy.r.assign('q', q) + rpy.r('Pval = p.adjust(pval, method="BH")') + rpy.r('S = which((Pval < q)) - 1') + S = rpy.r('S') + ind = np.zeros(pval.shape[0], np.bool) + ind[np.asarray(S, np.int)] = 1 + return ind + + +def relative_risk(est, truth, Sigma): + if (truth != 0).sum > 0: + return (est - truth).T.dot(Sigma).dot(est - truth) / truth.T.dot(Sigma).dot(truth) + else: + return (est - truth).T.dot(Sigma).dot(est - truth) + + +def comparison_cvmetrics_selected(n=500, p=100, nval=500, rho=0.35, s=5, beta_type=1, snr=0.20, + randomizer_scale=np.sqrt(0.50), full_dispersion=True, + tuning_nonrand="lambda.min", tuning_rand="lambda.1se"): + + X, y, _, _, Sigma, beta, sigma = sim_xy(n=n, p=p, nval=nval, rho=rho, s=s, beta_type=beta_type, snr=snr) + true_mean = X.dot(beta) + print("snr", snr) + X -= X.mean(0)[None, :] + X /= (X.std(0)[None, :] * np.sqrt(n / (n - 1.))) + y = y - y.mean() + true_set = np.asarray([u for u in range(p) if beta[u] != 0]) + + if full_dispersion: + dispersion = np.linalg.norm(y - X.dot(np.linalg.pinv(X).dot(y))) ** 2 / (n - p) + sigma_ = np.sqrt(dispersion) + else: + dispersion = None + sigma_ = np.std(y) + print("estimated and true sigma", sigma, sigma_) + + lam_theory = sigma_ * 1. * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) + glm_LASSO_theory, glm_LASSO_1se, glm_LASSO_min, lam_min, lam_1se = glmnet_lasso(X, y, lam_theory/float(n)) + if tuning_nonrand == "lambda.min": + lam_LASSO = lam_min + glm_LASSO = glm_LASSO_min + elif tuning_nonrand == "lambda.1se": + lam_LASSO = lam_1se + glm_LASSO = glm_LASSO_1se + else: + lam_LASSO = lam_theory/float(n) + glm_LASSO = glm_LASSO_theory + active_LASSO = (glm_LASSO != 0) + nactive_LASSO = active_LASSO.sum() + active_set_LASSO = np.asarray([r for r in range(p) if active_LASSO[r]]) + active_LASSO_bool = np.asarray([(np.in1d(active_set_LASSO[z], true_set).sum() > 0) for z in range(nactive_LASSO)], np.bool) + + rel_LASSO = np.zeros(p) + Lee_nreport = 0 + bias_Lee = 0. + bias_naive = 0. + + if nactive_LASSO > 0: + post_LASSO_OLS = np.linalg.pinv(X[:, active_LASSO]).dot(y) + rel_LASSO[active_LASSO] = post_LASSO_OLS + Lee_target = np.linalg.pinv(X[:, active_LASSO]).dot(X.dot(beta)) + Lee_intervals, Lee_pval = selInf_R(X, y, glm_LASSO, n * lam_LASSO, sigma_, Type=0, alpha=0.1) + + if (Lee_pval.shape[0] == Lee_target.shape[0]): + + cov_Lee, selective_Lee_power = coverage(Lee_intervals, Lee_pval, Lee_target, beta[active_LASSO]) + inf_entries_bool = np.isinf(Lee_intervals[:, 1] - Lee_intervals[:, 0]) + inf_entries = np.mean(inf_entries_bool) + if inf_entries == 1.: + length_Lee = 0. + else: + length_Lee = np.mean((Lee_intervals[:, 1] - Lee_intervals[:, 0])[~inf_entries_bool]) + power_Lee = ((active_LASSO_bool) * (np.logical_or((0. < Lee_intervals[:, 0]), (0. 
> Lee_intervals[:, 1])))) \ + .sum() / float((beta != 0).sum()) + Lee_discoveries = BHfilter(Lee_pval, q=0.1) + power_Lee_BH = (Lee_discoveries * active_LASSO_bool).sum() / float((beta != 0).sum()) + fdr_Lee_BH = (Lee_discoveries * ~active_LASSO_bool).sum() / float(max(Lee_discoveries.sum(), 1.)) + bias_Lee = np.mean(glm_LASSO[active_LASSO] - Lee_target) + + naive_sd = sigma_ * np.sqrt(np.diag((np.linalg.inv(X[:, active_LASSO].T.dot(X[:, active_LASSO]))))) + naive_intervals = np.vstack([post_LASSO_OLS - 1.65 * naive_sd, + post_LASSO_OLS + 1.65 * naive_sd]).T + naive_pval = 2 * ndist.cdf(np.abs(post_LASSO_OLS) / naive_sd) + cov_naive, selective_naive_power = coverage(naive_intervals, naive_pval, Lee_target, beta[active_LASSO]) + length_naive = np.mean(naive_intervals[:, 1] - naive_intervals[:, 0]) + power_naive = ((active_LASSO_bool) * ( + np.logical_or((0. < naive_intervals[:, 0]), (0. > naive_intervals[:, 1])))).sum() / float( + (beta != 0).sum()) + naive_discoveries = BHfilter(naive_pval, q=0.1) + power_naive_BH = (naive_discoveries * active_LASSO_bool).sum() / float((beta != 0).sum()) + fdr_naive_BH = (naive_discoveries * ~active_LASSO_bool).sum() / float(max(naive_discoveries.sum(), 1.)) + bias_naive = np.mean(rel_LASSO[active_LASSO] - Lee_target) + + partial_Lasso_risk = (glm_LASSO[active_LASSO]-Lee_target).T.dot(glm_LASSO[active_LASSO]-Lee_target) + partial_relLasso_risk = (post_LASSO_OLS - Lee_target).T.dot(post_LASSO_OLS - Lee_target) + + else: + Lee_nreport = 1 + cov_Lee, length_Lee, inf_entries, power_Lee, power_Lee_BH, fdr_Lee_BH, selective_Lee_power = [0., 0., 0., 0., 0., 0., 0.] + cov_naive, length_naive, power_naive, power_naive_BH, fdr_naive_BH, selective_naive_power = [0., 0., 0., 0., 0., 0.] + naive_discoveries = np.zeros(1) + Lee_discoveries = np.zeros(1) + partial_Lasso_risk, partial_relLasso_risk = [0., 0.] + elif nactive_LASSO == 0: + Lee_nreport = 1 + cov_Lee, length_Lee, inf_entries, power_Lee, power_Lee_BH, fdr_Lee_BH, selective_Lee_power = [0., 0., 0., 0., 0., 0., 0.] + cov_naive, length_naive, power_naive, power_naive_BH, fdr_naive_BH, selective_naive_power = [0., 0., 0., 0., 0., 0.] + naive_discoveries = np.zeros(1) + Lee_discoveries = np.zeros(1) + partial_Lasso_risk, partial_relLasso_risk = [0., 0.] 
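# Aside, illustrative only (not part of this commit): glmnet fits
#     minimize (1/(2*n)) * ||y - X b||^2_2 + lam * ||b||_1,
# whereas lasso.gaussian penalizes the unscaled (1/2) * ||y - X b||^2_2 loss,
# so a glmnet lambda is put on the feature_weights scale by multiplying by n.
# That is why lam_theory is divided by n before being handed to glmnet_lasso
# above, and why the randomized fits below use n * lam_min and n * lam_1se.
def glmnet_lam_to_feature_weights(lam_glmnet, n, p):
    # hypothetical helper spelling out the conversion used in this file
    return n * lam_glmnet * np.ones(p)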
+ + if tuning_rand == "lambda.min": + randomized_lasso = lasso.gaussian(X, + y, + feature_weights=n * lam_min * np.ones(p), + randomizer_scale= np.sqrt(n) * randomizer_scale * sigma_) + elif tuning_rand == "lambda.1se": + randomized_lasso = lasso.gaussian(X, + y, + feature_weights=n * lam_1se * np.ones(p), + randomizer_scale= np.sqrt(n) * randomizer_scale * sigma_) + else: + randomized_lasso = lasso.gaussian(X, + y, + feature_weights= lam_theory * np.ones(p), + randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_) + signs = randomized_lasso.fit() + nonzero = signs != 0 + active_set_rand = np.asarray([t for t in range(p) if nonzero[t]]) + active_rand_bool = np.asarray([(np.in1d(active_set_rand[x], true_set).sum() > 0) for x in range(nonzero.sum())], np.bool) + sel_MLE = np.zeros(p) + ind_est = np.zeros(p) + randomized_lasso_est = np.zeros(p) + randomized_rel_lasso_est = np.zeros(p) + MLE_nreport = 0 + + if nonzero.sum() > 0: + target_randomized = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) + + (observed_target, + cov_target, + cov_target_score, + alternatives) = selected_targets(randomized_lasso.loglike, + randomized_lasso._W, + nonzero, + dispersion=dispersion) + + result = randomized_lasso.selective_MLE(observed_target, + cov_target, + cov_target_score)[0] + + MLE_estimate = result['MLE'] + ind_unbiased_estimator = result['unbiased'] + + sel_MLE[nonzero] = MLE_estimate + ind_est[nonzero] = ind_unbiased_estimator + MLE_intervals = np.asarray(result[['lower', 'upper']]) + MLE_pval = np.asarray(result['pvalue']) + + randomized_lasso_est = randomized_lasso.initial_soln + randomized_rel_lasso_est = randomized_lasso._beta_full + + cov_MLE, selective_MLE_power = coverage(MLE_intervals, MLE_pval, target_randomized, beta[nonzero]) + length_MLE = np.mean(MLE_intervals[:, 1] - MLE_intervals[:, 0]) + power_MLE = ((active_rand_bool) * ( + np.logical_or((0. < MLE_intervals[:, 0]), (0. > MLE_intervals[:, 1])))).sum() / float((beta != 0).sum()) + MLE_discoveries = BHfilter(MLE_pval, q=0.1) + power_MLE_BH = (MLE_discoveries * active_rand_bool).sum() / float((beta != 0).sum()) + fdr_MLE_BH = (MLE_discoveries * ~active_rand_bool).sum() / float(max(MLE_discoveries.sum(), 1.)) + bias_MLE = np.mean(MLE_estimate - target_randomized) + + partial_MLE_risk = (MLE_estimate - target_randomized).T.dot(MLE_estimate - target_randomized) + partial_ind_risk = (ind_unbiased_estimator - target_randomized).T.dot(ind_unbiased_estimator - target_randomized) + partial_randLasso_risk = (randomized_lasso_est[nonzero] - target_randomized).T.dot(randomized_lasso_est[nonzero] - target_randomized) + partial_relrandLasso_risk = (randomized_rel_lasso_est[nonzero] - target_randomized).T.dot(randomized_rel_lasso_est[nonzero] - target_randomized) + + else: + MLE_nreport = 1 + cov_MLE, length_MLE, power_MLE, power_MLE_BH, fdr_MLE_BH, bias_MLE, selective_MLE_power = [0., 0., 0., 0., 0., 0., 0.] + MLE_discoveries = np.zeros(1) + partial_MLE_risk, partial_ind_risk, partial_randLasso_risk, partial_relrandLasso_risk = [0., 0., 0., 0.] 
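# Aside, illustrative only (not part of this commit): BHfilter above delegates
# to R's p.adjust through rpy2; a numpy-only sketch of the same
# Benjamini-Hochberg selection (the name bh_filter_np is hypothetical) is:
def bh_filter_np(pval, q=0.2):
    m = pval.shape[0]
    order = np.argsort(pval)
    scaled = pval[order] * m / np.arange(1, m + 1)
    # BH-adjusted p-values: running minimum from the largest rank downward
    adjusted = np.minimum(np.minimum.accumulate(scaled[::-1])[::-1], 1.)
    keep = np.zeros(m, np.bool_)
    keep[order] = adjusted < q
    return keep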
+ + risks = np.vstack((relative_risk(sel_MLE, beta, Sigma), + relative_risk(ind_est, beta, Sigma), + relative_risk(randomized_lasso_est, beta, Sigma), + relative_risk(randomized_rel_lasso_est, beta, Sigma), + relative_risk(rel_LASSO, beta, Sigma), + relative_risk(glm_LASSO, beta, Sigma))) + + partial_risks = np.vstack((partial_MLE_risk, + partial_ind_risk, + partial_randLasso_risk, + partial_relrandLasso_risk, + partial_relLasso_risk, + partial_Lasso_risk)) + + naive_inf = np.vstack((cov_naive, length_naive, 0., nactive_LASSO, bias_naive, selective_naive_power, power_naive, power_naive_BH, fdr_naive_BH, + naive_discoveries.sum())) + Lee_inf = np.vstack((cov_Lee, length_Lee, inf_entries, nactive_LASSO, bias_Lee, selective_Lee_power, power_Lee, power_Lee_BH, fdr_Lee_BH, + Lee_discoveries.sum())) + Liu_inf = np.zeros((10, 1)) + MLE_inf = np.vstack((cov_MLE, length_MLE, 0., nonzero.sum(), bias_MLE, selective_MLE_power, power_MLE, power_MLE_BH, fdr_MLE_BH, + MLE_discoveries.sum())) + nreport = np.vstack((Lee_nreport, 0., MLE_nreport)) + + return np.vstack((risks, naive_inf, Lee_inf, Liu_inf, MLE_inf, partial_risks, nreport)) + + +def comparison_cvmetrics_full(n=500, p=100, nval=500, rho=0.35, s=5, beta_type=1, snr=0.20, + randomizer_scale=np.sqrt(0.25), full_dispersion=True, + tuning_nonrand="lambda.min", tuning_rand="lambda.1se"): + + X, y, _, _, Sigma, beta, sigma = sim_xy(n=n, p=p, nval=nval, rho=rho, s=s, beta_type=beta_type, snr=snr) + print("snr", snr) + X -= X.mean(0)[None, :] + X /= (X.std(0)[None, :] * np.sqrt(n / (n - 1.))) + y = y - y.mean() + true_set = np.asarray([u for u in range(p) if beta[u] != 0]) + + if full_dispersion: + dispersion = np.linalg.norm(y - X.dot(np.linalg.pinv(X).dot(y))) ** 2 / (n - p) + sigma_ = np.sqrt(dispersion) + else: + dispersion = None + sigma_ = np.std(y) + print("estimated and true sigma", sigma, sigma_) + + lam_theory = sigma_ * 1. * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) + glm_LASSO_theory, glm_LASSO_1se, glm_LASSO_min, lam_min, lam_1se = glmnet_lasso(X, y, lam_theory/float(n)) + if tuning_nonrand == "lambda.min": + lam_LASSO = lam_min + glm_LASSO = glm_LASSO_min + elif tuning_nonrand == "lambda.1se": + lam_LASSO = lam_1se + glm_LASSO = glm_LASSO_1se + else: + lam_LASSO = lam_theory/float(n) + glm_LASSO = glm_LASSO_theory + + active_LASSO = (glm_LASSO != 0) + nactive_LASSO = active_LASSO.sum() + active_set_LASSO = np.asarray([r for r in range(p) if active_LASSO[r]]) + active_LASSO_bool = np.asarray([(np.in1d(active_set_LASSO[z], true_set).sum() > 0) for z in range(nactive_LASSO)], + np.bool) + + rel_LASSO = np.zeros(p) + Lee_nreport = 0 + bias_Lee = 0. + bias_naive = 0. + + if nactive_LASSO > 0: + rel_LASSO[active_LASSO] = np.linalg.pinv(X[:, active_LASSO]).dot(y) + Lee_target = beta[active_LASSO] + Lee_intervals, Lee_pval = selInf_R(X, y, glm_LASSO, n * lam_LASSO, sigma_, Type=1, alpha=0.1) + + if (Lee_pval.shape[0] == Lee_target.shape[0]): + + cov_Lee, selective_Lee_power = coverage(Lee_intervals, Lee_pval, Lee_target, beta[active_LASSO]) + inf_entries_bool = np.isinf(Lee_intervals[:, 1] - Lee_intervals[:, 0]) + inf_entries = np.mean(inf_entries_bool) + if inf_entries == 1.: + length_Lee = 0. + else: + length_Lee = np.mean((Lee_intervals[:, 1] - Lee_intervals[:, 0])[~inf_entries_bool]) + power_Lee = ((active_LASSO_bool) * ( + np.logical_or((0. < Lee_intervals[:, 0]), (0. 
> Lee_intervals[:, 1])))).sum() / float((beta != 0).sum()) + Lee_discoveries = BHfilter(Lee_pval, q=0.1) + power_Lee_BH = (Lee_discoveries * active_LASSO_bool).sum() / float((beta != 0).sum()) + fdr_Lee_BH = (Lee_discoveries * ~active_LASSO_bool).sum() / float(max(Lee_discoveries.sum(), 1.)) + bias_Lee = np.mean(glm_LASSO[active_LASSO] - Lee_target) + + post_LASSO_OLS = np.linalg.pinv(X[:, active_LASSO]).dot(y) + naive_sd = sigma_ * np.sqrt(np.diag((np.linalg.inv(X[:, active_LASSO].T.dot(X[:, active_LASSO]))))) + naive_intervals = np.vstack([post_LASSO_OLS - 1.65 * naive_sd, + post_LASSO_OLS + 1.65 * naive_sd]).T + naive_pval = 2 * ndist.cdf(np.abs(post_LASSO_OLS) / naive_sd) + cov_naive, selective_naive_power = coverage(naive_intervals, naive_pval, Lee_target, beta[active_LASSO]) + length_naive = np.mean(naive_intervals[:, 1] - naive_intervals[:, 0]) + power_naive = ((active_LASSO_bool) * ( + np.logical_or((0. < naive_intervals[:, 0]), (0. > naive_intervals[:, 1])))).sum() / float( + (beta != 0).sum()) + naive_discoveries = BHfilter(naive_pval, q=0.1) + power_naive_BH = (naive_discoveries * active_LASSO_bool).sum() / float((beta != 0).sum()) + fdr_naive_BH = (naive_discoveries * ~active_LASSO_bool).sum() / float(max(naive_discoveries.sum(), 1.)) + bias_naive = np.mean(rel_LASSO[active_LASSO] - Lee_target) + + partial_Lasso_risk = (glm_LASSO[active_LASSO] - Lee_target).T.dot(glm_LASSO[active_LASSO] - Lee_target) + partial_relLasso_risk = (post_LASSO_OLS - Lee_target).T.dot(post_LASSO_OLS - Lee_target) + else: + Lee_nreport = 1 + cov_Lee, length_Lee, inf_entries, power_Lee, power_Lee_BH, fdr_Lee_BH, selective_Lee_power = [0., 0., 0., 0., 0., 0., 0.] + cov_naive, length_naive, power_naive, power_naive_BH, fdr_naive_BH, selective_naive_power = [0., 0., 0., 0., 0., 0.] + naive_discoveries = np.zeros(1) + Lee_discoveries = np.zeros(1) + partial_Lasso_risk, partial_relLasso_risk = [0., 0.] + + elif nactive_LASSO == 0: + Lee_nreport = 1 + cov_Lee, length_Lee, inf_entries, power_Lee, power_Lee_BH, fdr_Lee_BH, selective_Lee_power = [0., 0., 0., 0., 0., 0., 0.] + cov_naive, length_naive, power_naive, power_naive_BH, fdr_naive_BH, selective_naive_power = [0., 0., 0., 0., 0., 0.] + naive_discoveries = np.zeros(1) + Lee_discoveries = np.zeros(1) + partial_Lasso_risk, partial_relLasso_risk = [0., 0.] + + lasso_Liu = lasso_full.gaussian(X, y, n * lam_LASSO) + Lasso_soln_Liu = lasso_Liu.fit() + active_set_Liu = np.nonzero(Lasso_soln_Liu != 0)[0] + nactive_Liu = active_set_Liu.shape[0] + active_Liu_bool = np.asarray([(np.in1d(active_set_Liu[a], true_set).sum() > 0) for a in range(nactive_Liu)], np.bool) + Liu_nreport = 0 + + if nactive_Liu > 0: + Liu_target = beta[Lasso_soln_Liu != 0] + df = lasso_Liu.summary(level=0.90, compute_intervals=True, dispersion=dispersion) + Liu_lower, Liu_upper, Liu_pval = np.asarray(df['lower_confidence']), \ + np.asarray(df['upper_confidence']), \ + np.asarray(df['pval']) + Liu_intervals = np.vstack((Liu_lower, Liu_upper)).T + cov_Liu, selective_Liu_power = coverage(Liu_intervals, Liu_pval, Liu_target, beta[Lasso_soln_Liu != 0]) + length_Liu = np.mean(Liu_intervals[:, 1] - Liu_intervals[:, 0]) + power_Liu = ((active_Liu_bool) * (np.logical_or((0. < Liu_intervals[:, 0]), + (0. 
> Liu_intervals[:, 1])))).sum() / float((beta != 0).sum()) + Liu_discoveries = BHfilter(Liu_pval, q=0.1) + power_Liu_BH = (Liu_discoveries * active_Liu_bool).sum() / float((beta != 0).sum()) + fdr_Liu_BH = (Liu_discoveries * ~active_Liu_bool).sum() / float(max(Liu_discoveries.sum(), 1.)) + + else: + Liu_nreport = 1 + cov_Liu, length_Liu, power_Liu, power_Liu_BH, fdr_Liu_BH, selective_Liu_power = [0., 0., 0., 0., 0., 0.] + Liu_discoveries = np.zeros(1) + + if tuning_rand == "lambda.min": + randomized_lasso = lasso.gaussian(X, + y, + feature_weights= n * lam_min * np.ones(p), + randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_) + elif tuning_rand == "lambda.1se": + randomized_lasso = lasso.gaussian(X, + y, + feature_weights= n * lam_1se * np.ones(p), + randomizer_scale= np.sqrt(n) * randomizer_scale * sigma_) + else: + randomized_lasso = lasso.gaussian(X, + y, + feature_weights= lam_theory * np.ones(p), + randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_) + signs = randomized_lasso.fit() + nonzero = signs != 0 + active_set_rand = np.asarray([t for t in range(p) if nonzero[t]]) + active_rand_bool = np.asarray([(np.in1d(active_set_rand[x], true_set).sum() > 0) for x in range(nonzero.sum())], np.bool) + sel_MLE = np.zeros(p) + ind_est = np.zeros(p) + randomized_lasso_est = np.zeros(p) + randomized_rel_lasso_est = np.zeros(p) + MLE_nreport = 0 + + if nonzero.sum() > 0: + target_randomized = beta[nonzero] + (observed_target, + cov_target, + cov_target_score, + alternatives) = full_targets(randomized_lasso.loglike, + randomized_lasso._W, + nonzero, + dispersion=dispersion) + + result = randomized_lasso.selective_MLE(observed_target, + cov_target, + cov_target_score)[0] + + MLE_estimate = result['MLE'] + ind_unbiased_estimator = result['unbiased'] + + sel_MLE[nonzero] = MLE_estimate + ind_est[nonzero] = ind_unbiased_estimator + MLE_intervals = np.asarray(result[['lower', 'upper']]) + MLE_pval = np.asarray(result['pvalue']) + + randomized_lasso_est = randomized_lasso.initial_soln + randomized_rel_lasso_est = randomized_lasso._beta_full + + cov_MLE, selective_MLE_power = coverage(MLE_intervals, MLE_pval, target_randomized, beta[nonzero]) + length_MLE = np.mean(MLE_intervals[:, 1] - MLE_intervals[:, 0]) + power_MLE = ((active_rand_bool) * (np.logical_or((0. < MLE_intervals[:, 0]), (0. > MLE_intervals[:, 1])))).sum() / float((beta != 0).sum()) + MLE_discoveries = BHfilter(MLE_pval, q=0.1) + power_MLE_BH = (MLE_discoveries * active_rand_bool).sum() / float((beta != 0).sum()) + fdr_MLE_BH = (MLE_discoveries * ~active_rand_bool).sum() / float(max(MLE_discoveries.sum(), 1.)) + bias_MLE = np.mean(MLE_estimate - target_randomized) + + partial_MLE_risk = (MLE_estimate - target_randomized).T.dot(MLE_estimate - target_randomized) + partial_ind_risk = (ind_unbiased_estimator - target_randomized).T.dot(ind_unbiased_estimator - target_randomized) + partial_randLasso_risk = (randomized_lasso_est[nonzero] - target_randomized).T.dot(randomized_lasso_est[nonzero] - target_randomized) + partial_relrandLasso_risk = (randomized_rel_lasso_est[nonzero] - target_randomized).T.dot(randomized_rel_lasso_est[nonzero] - target_randomized) + else: + MLE_nreport = 1 + cov_MLE, length_MLE, power_MLE, power_MLE_BH, fdr_MLE_BH, bias_MLE, selective_MLE_power = [0., 0., 0., 0., 0., 0., 0.] + MLE_discoveries = np.zeros(1) + partial_MLE_risk, partial_ind_risk, partial_randLasso_risk, partial_relrandLasso_risk = [0., 0., 0., 0.] 
+ + risks = np.vstack((relative_risk(sel_MLE, beta, Sigma), + relative_risk(ind_est, beta, Sigma), + relative_risk(randomized_lasso_est, beta, Sigma), + relative_risk(randomized_rel_lasso_est, beta, Sigma), + relative_risk(rel_LASSO, beta, Sigma), + relative_risk(glm_LASSO, beta, Sigma))) + + partial_risks = np.vstack((partial_MLE_risk, + partial_ind_risk, + partial_randLasso_risk, + partial_relrandLasso_risk, + partial_relLasso_risk, + partial_Lasso_risk)) + + naive_inf = np.vstack((cov_naive, length_naive, 0., nactive_LASSO, bias_naive, selective_naive_power, + power_naive, power_naive_BH, fdr_naive_BH, naive_discoveries.sum())) + Lee_inf = np.vstack((cov_Lee, length_Lee, inf_entries, nactive_LASSO, bias_Lee, selective_Lee_power, + power_Lee, power_Lee_BH, fdr_Lee_BH, Lee_discoveries.sum())) + Liu_inf = np.vstack((cov_Liu, length_Liu, 0., nactive_Liu, bias_Lee, selective_Liu_power, + power_Liu, power_Liu_BH, fdr_Liu_BH, Liu_discoveries.sum())) + MLE_inf = np.vstack((cov_MLE, length_MLE, 0., nonzero.sum(), bias_MLE, selective_MLE_power, + power_MLE, power_MLE_BH, fdr_MLE_BH, MLE_discoveries.sum())) + nreport = np.vstack((Lee_nreport, Liu_nreport, MLE_nreport)) + + return np.vstack((risks, naive_inf, Lee_inf, Liu_inf, MLE_inf, partial_risks, nreport)) + + + +def main(n=500, p=100, rho=0.35, s=5, beta_type=1, snr_values=np.array([0.15, 0.20, 0.31]), + target="selected", tuning_nonrand="lambda.1se", tuning_rand="lambda.1se", + randomizing_scale = np.sqrt(0.50), ndraw = 50, outpath = None): + + df_selective_inference = pd.DataFrame() + df_risk = pd.DataFrame() + + if n > p: + full_dispersion = True + else: + full_dispersion = False + + snr_list = [] + snr_list_0 = [] + for snr in snr_values: + snr_list.append(snr*np.ones(4)) + snr_list_0.append(snr*np.ones(2)) + output_overall = np.zeros(55) + if target == "selected": + for i in range(ndraw): + output_overall += np.squeeze(comparison_cvmetrics_selected(n=n, p=p, nval=n, rho=rho, s=s, beta_type=beta_type, snr=snr, + randomizer_scale=randomizing_scale, full_dispersion=full_dispersion, + tuning_nonrand =tuning_nonrand, tuning_rand=tuning_rand)) + elif target == "full": + for i in range(ndraw): + output_overall += np.squeeze(comparison_cvmetrics_full(n=n, p=p, nval=n, rho=rho, s=s, beta_type=beta_type, snr=snr, + randomizer_scale=randomizing_scale, full_dispersion=full_dispersion, + tuning_nonrand =tuning_nonrand, tuning_rand=tuning_rand)) + + nLee = output_overall[52] + nLiu = output_overall[53] + nMLE = output_overall[54] + + relative_risk = (output_overall[0:6] / float(ndraw)).reshape((1, 6)) + partial_risk = np.hstack(((output_overall[46:50] / float(ndraw-nMLE)).reshape((1, 4)), + (output_overall[50:52] / float(ndraw - nLee)).reshape((1, 2)))) + + nonrandomized_naive_inf = np.hstack(((output_overall[6:12] / float(ndraw - nLee)).reshape((1, 6)), + (output_overall[12:16] / float(ndraw)).reshape((1, 4)))) + nonrandomized_Lee_inf = np.hstack(((output_overall[16:22] / float(ndraw - nLee)).reshape((1, 6)), + (output_overall[22:26] / float(ndraw)).reshape((1, 4)))) + nonrandomized_Liu_inf = np.hstack(((output_overall[26:32] / float(ndraw - nLiu)).reshape((1, 6)), + (output_overall[32:36] / float(ndraw)).reshape((1, 4)))) + randomized_MLE_inf = np.hstack(((output_overall[36:42] / float(ndraw - nMLE)).reshape((1, 6)), + (output_overall[42:46] / float(ndraw)).reshape((1, 4)))) + + if target=="selected": + nonrandomized_Liu_inf[nonrandomized_Liu_inf==0] = 'NaN' + if target == "debiased": + nonrandomized_Liu_inf[nonrandomized_Liu_inf == 0] = 'NaN' + 
nonrandomized_Lee_inf[nonrandomized_Lee_inf == 0] = 'NaN' + + df_naive = pd.DataFrame(data=nonrandomized_naive_inf,columns=['coverage', 'length', 'prop-infty', 'tot-active', 'bias', 'sel-power', + 'power', 'power-BH', 'fdr-BH','tot-discoveries']) + df_naive['method'] = "Naive" + df_Lee = pd.DataFrame(data=nonrandomized_Lee_inf, columns=['coverage', 'length', 'prop-infty','tot-active','bias', 'sel-power', + 'power', 'power-BH', 'fdr-BH','tot-discoveries']) + df_Lee['method'] = "Lee" + + df_Liu = pd.DataFrame(data=nonrandomized_Liu_inf,columns=['coverage', 'length', 'prop-infty', 'tot-active','bias', 'sel-power', + 'power', 'power-BH', 'fdr-BH', 'tot-discoveries']) + df_Liu['method'] = "Liu" + + df_MLE = pd.DataFrame(data=randomized_MLE_inf, columns=['coverage', 'length', 'prop-infty', 'tot-active','bias', 'sel-power', + 'power', 'power-BH', 'fdr-BH', 'tot-discoveries']) + df_MLE['method'] = "MLE" + + df_risk_metrics = pd.DataFrame(data=relative_risk, columns=['sel-MLE', 'ind-est', 'rand-LASSO','rel-rand-LASSO', 'rel-LASSO', 'LASSO']) + df_risk_metrics['metric'] = "Full" + df_prisk_metrics = pd.DataFrame(data=partial_risk,columns=['sel-MLE', 'ind-est', 'rand-LASSO', 'rel-rand-LASSO', 'rel-LASSO','LASSO']) + df_prisk_metrics['metric'] = "Partial" + + df_selective_inference = df_selective_inference.append(df_naive, ignore_index=True) + df_selective_inference = df_selective_inference.append(df_Lee, ignore_index=True) + df_selective_inference = df_selective_inference.append(df_Liu, ignore_index=True) + df_selective_inference = df_selective_inference.append(df_MLE, ignore_index=True) + + df_risk = df_risk.append(df_risk_metrics, ignore_index=True) + df_risk = df_risk.append(df_prisk_metrics, ignore_index=True) + + snr_list = list(itertools.chain.from_iterable(snr_list)) + df_selective_inference['n'] = n + df_selective_inference['p'] = p + df_selective_inference['s'] = s + df_selective_inference['rho'] = rho + df_selective_inference['beta-type'] = beta_type + df_selective_inference['snr'] = pd.Series(np.asarray(snr_list)) + df_selective_inference['target'] = target + + snr_list_0 = list(itertools.chain.from_iterable(snr_list_0)) + df_risk['n'] = n + df_risk['p'] = p + df_risk['s'] = s + df_risk['rho'] = rho + df_risk['beta-type'] = beta_type + df_risk['snr'] = pd.Series(np.asarray(snr_list_0)) + df_risk['target'] = target + + if outpath is None: + outpath = os.path.dirname(__file__) + + outfile_inf_csv = os.path.join(outpath, "dims_" + str(n) + "_" + str(p) + "_inference_betatype" + str(beta_type) + target + "_rho_" + str(rho) + ".csv") + outfile_risk_csv = os.path.join(outpath, "dims_" + str(n) + "_" + str(p) + "_risk_betatype" + str(beta_type) + target + "_rho_" + str(rho) + ".csv") + outfile_inf_html = os.path.join(outpath, "dims_" + str(n) + "_" + str(p) + "_inference_betatype" + str(beta_type) + target + "_rho_" + str(rho) + ".html") + outfile_risk_html = os.path.join(outpath, "dims_" + str(n) + "_" + str(p) + "_risk_betatype" + str(beta_type) + target + "_rho_" + str(rho) + ".html") + df_selective_inference.to_csv(outfile_inf_csv, index=False) + df_risk.to_csv(outfile_risk_csv, index=False) + df_selective_inference.to_html(outfile_inf_html) + df_risk.to_html(outfile_risk_html) + +if __name__ == "__main__": + main() From 2cf0e52a4c13befe944b0d2ecfd6c0ecdc7fd793 Mon Sep 17 00:00:00 2001 From: snigdhagit Date: Wed, 24 Jun 2020 13:49:44 -0400 Subject: [PATCH 043/187] added plots to the examples-MLE file --- selectinf/randomized/tests/test_cv_mle.py | 180 +++++++++++++++++++++- 1 file changed, 
179 insertions(+), 1 deletion(-) diff --git a/selectinf/randomized/tests/test_cv_mle.py b/selectinf/randomized/tests/test_cv_mle.py index 052824840..bd356ef6b 100644 --- a/selectinf/randomized/tests/test_cv_mle.py +++ b/selectinf/randomized/tests/test_cv_mle.py @@ -118,10 +118,183 @@ def relative_risk(est, truth, Sigma): else: return (est - truth).T.dot(Sigma).dot(est - truth) +from rpy2 import robjects + +def plotRisk(df_risk): + robjects.r(""" + library("ggplot2") + library("magrittr") + library("tidyr") + library("dplyr") + + plot_risk <- function(df_risk, outpath="/Users/psnigdha/adjusted_MLE/plots/", resolution=300, height= 7.5, width=15) + { + date = 1:length(unique(df_risk$snr)) + df_risk = filter(df_risk, metric == "Full") + df = cbind(df_risk, date) + risk = df %>% + gather(key, value, sel.MLE, rand.LASSO, LASSO) %>% + ggplot(aes(x=date, y=value, colour=key, shape=key, linetype=key)) + + geom_point(size=3) + + geom_line(aes(linetype=key), size=1) + + ylim(0.01,1.2)+ + labs(y="relative risk", x = "Signal regimes: snr") + + scale_x_continuous(breaks=1:length(unique(df_risk$snr)), label = sapply(df_risk$snr, toString)) + + theme(legend.position="top", legend.title = element_blank()) + indices = sort(c("sel.MLE", "rand.LASSO", "LASSO"), index.return= TRUE)$ix + names = c("sel-MLE", "rand-LASSO", "LASSO") + risk = risk + scale_color_manual(labels = names[indices], values=c("#008B8B", "#104E8B","#B22222")[indices]) + + scale_shape_manual(labels = names[indices], values=c(15, 17, 16)[indices]) + + scale_linetype_manual(labels = names[indices], values = c(1,1,2)[indices]) + outfile = paste(outpath, 'risk.png', sep="") + outfile = paste(outpath, 'risk.png', sep="") + ggsave(outfile, plot = risk, dpi=resolution, dev='png', height=height, width=width, units="cm")} + """) + + robjects.pandas2ri.activate() + r_df_risk = robjects.conversion.py2ri(df_risk) + R_plot = robjects.globalenv['plot_risk'] + R_plot(r_df_risk) + + +def plotCoveragePower(df_inference): + robjects.r(""" + library("ggplot2") + library("magrittr") + library("tidyr") + library("reshape") + library("cowplot") + library("dplyr") + + plot_coverage_lengths <- function(df_inference, outpath="/Users/psnigdha/adjusted_MLE/plots/", + resolution=200, height_plot1= 6.5, width_plot1=12, + height_plot2=13, width_plot2=13) + { + snr.len = length(unique(df_inference$snr)) + df_inference = arrange(df_inference, method) + target = toString(df_inference$target[1]) + df = data.frame(snr = sapply(unique(df_inference$snr), toString), + MLE = 100*df_inference$coverage[((2*snr.len)+1):(3*snr.len)], + Lee = 100*df_inference$coverage[1:snr.len], + Naive = 100*df_inference$coverage[((3*snr.len)+1):(4*snr.len)]) + if(target== "selected"){ + data.m <- melt(df, id.vars='snr') + coverage = ggplot(data.m, aes(snr, value)) + + geom_bar(aes(fill = variable), width = 0.4, position = position_dodge(width=0.5), stat="identity") + + geom_hline(yintercept = 90, linetype="dotted") + + labs(y="coverage: partial", x = "Signal regimes: snr") + + theme(legend.position="top", + legend.title = element_blank()) + coverage = coverage + + scale_fill_manual(labels = c("MLE-based","Lee", "Naive"), values=c("#008B8B", "#B22222", "#FF6347"))} else{ + df = cbind(df, Liu = 100*df_inference$coverage[((snr.len)+1):(2*snr.len)]) + df <- df[c("snr", "MLE", "Liu", "Lee", "Naive")] + data.m <- melt(df, id.vars='snr') + coverage = ggplot(data.m, aes(snr, value)) + + geom_bar(aes(fill = variable), width = 0.4, position = position_dodge(width=0.5), stat="identity") + + 
geom_hline(yintercept = 90, linetype="dotted") + + labs(y="coverage: full", x = "Signal regimes: snr") + + theme(legend.position="top", legend.title = element_blank()) + coverage = coverage + + scale_fill_manual(labels = c("MLE-based", "Liu", "Lee", "Naive"), values=c("#008B8B", "#104E8B", "#B22222", "#FF6347"))} + + outfile = paste(outpath, 'coverage.png', sep="") + ggsave(outfile, plot = coverage, dpi=resolution, dev='png', height=height_plot1, width=width_plot1, units="cm") + + df = data.frame(snr = sapply(unique(df_inference$snr), toString), + MLE = 100*df_inference$sel.power[((2*snr.len)+1):(3*snr.len)], + Lee = 100*df_inference$sel.power[1:snr.len]) + if(target== "selected"){ + data.m <- melt(df, id.vars='snr') + sel_power = ggplot(data.m, aes(snr, value)) + + geom_bar(aes(fill = variable), width = 0.4, position = position_dodge(width=0.5), stat="identity") + + labs(y="power: partial", x = "Signal regimes: snr") + + theme(legend.position="top", legend.title = element_blank()) + sel_power = sel_power + scale_fill_manual(labels = c("MLE-based","Lee"), values=c("#008B8B", "#B22222"))} else{ + df = cbind(df, Liu = 100*df_inference$sel.power[((snr.len)+1):(2*snr.len)]) + df <- df[,c("snr", "MLE", "Liu", "Lee")] + data.m <- melt(df, id.vars='snr') + sel_power = ggplot(data.m, aes(snr, value)) + + geom_bar(aes(fill = variable), width = 0.4, position = position_dodge(width=0.5), stat="identity") + + labs(y="power: full", x = "Signal regimes: snr") + + theme(legend.position="top", legend.title = element_blank()) + sel_power = sel_power + scale_fill_manual(labels = c("MLE-based","Liu","Lee"), values=c("#008B8B", "#104E8B", "#B22222"))} + + outfile = paste(outpath, 'selective_power.png', sep="") + ggsave(outfile, plot = sel_power, dpi=resolution, dev='png', height=height_plot1, width=width_plot1, units="cm") + + if(target== "selected"){ + test_data <-data.frame(MLE = filter(df_inference, method == "MLE")$length, + Lee = filter(df_inference, method == "Lee")$length, + Naive = filter(df_inference, method == "Naive")$length, + date = 1:length(unique(df_inference$snr))) + lengths = test_data %>% + gather(key, value, MLE, Lee, Naive) %>% + ggplot(aes(x=date, y=value, colour=key, shape=key, linetype=key)) + + geom_point(size=3) + + geom_line(aes(linetype=key), size=1) + + ylim(0.,max(test_data$MLE, test_data$Lee, test_data$Naive) + 0.2)+ + labs(y="lengths:partial", x = "Signal regimes: snr") + + scale_x_continuous(breaks=1:length(unique(df_inference$snr)), label = sapply(unique(df_inference$snr), toString))+ + theme(legend.position="top", legend.title = element_blank()) + + indices = sort(c("MLE", "Lee", "Naive"), index.return= TRUE)$ix + names = c("MLE-based", "Lee", "Naive") + lengths = lengths + scale_color_manual(labels = names[indices], values=c("#008B8B","#B22222", "#FF6347")[indices]) + + scale_shape_manual(labels = names[indices], values=c(15, 17, 16)[indices]) + + scale_linetype_manual(labels = names[indices], values = c(1,1,2)[indices])} else{ + test_data <-data.frame(MLE = filter(df_inference, method == "MLE")$length, + Lee = filter(df_inference, method == "Lee")$length, + Naive = filter(df_inference, method == "Naive")$length, + Liu = filter(df_inference, method == "Liu")$length, + date = 1:length(unique(df_inference$snr))) + lengths= test_data %>% + gather(key, value, MLE, Lee, Naive, Liu) %>% + ggplot(aes(x=date, y=value, colour=key, shape=key, linetype=key)) + + geom_point(size=3) + + geom_line(aes(linetype=key), size=1) + + ylim(0.,max(test_data$MLE, test_data$Lee, test_data$Naive, 
test_data$Liu) + 0.2)+ + labs(y="lengths: full", x = "Signal regimes: snr") + + scale_x_continuous(breaks=1:length(unique(df_inference$snr)), label = sapply(unique(df_inference$snr), toString))+ + theme(legend.position="top", legend.title = element_blank()) + + indices = sort(c("MLE", "Liu", "Lee", "Naive"), index.return= TRUE)$ix + names = c("MLE-based", "Lee", "Naive", "Liu") + lengths = lengths + scale_color_manual(labels = names[indices], values=c("#008B8B","#B22222", "#FF6347", "#104E8B")[indices]) + + scale_shape_manual(labels = names[indices], values=c(15, 17, 16, 15)[indices]) + + scale_linetype_manual(labels = names[indices], values = c(1,1,2,1)[indices])} + + prop = filter(df_inference, method == "Lee")$prop.infty + df = data.frame(snr = sapply(unique(df_inference$snr), toString), + infinite = 100*prop) + data.prop <- melt(df, id.vars='snr') + pL = ggplot(data.prop, aes(snr, value)) + + geom_bar(aes(fill = variable), width = 0.4, position = position_dodge(width=0.5), stat="identity") + + labs(y="infinite intervals (%)", x = "Signal regimes: snr") + + theme(legend.position="top", + legend.title = element_blank()) + pL = pL + scale_fill_manual(labels = c("Lee"), values=c("#B22222")) + prow <- plot_grid( pL + theme(legend.position="none"), + lengths + theme(legend.position="none"), + align = 'vh', + hjust = -1, + ncol = 1) + + legend <- get_legend(lengths+ theme(legend.direction = "horizontal",legend.justification="center" ,legend.box.just = "bottom")) + p <- plot_grid(prow, ncol=1, legend, rel_heights = c(2., .2)) + outfile = paste(outpath, 'length.png', sep="") + ggsave(outfile, plot = p, dpi=resolution, dev='png', height=height_plot2, width=width_plot2, units="cm")} + """) + + robjects.pandas2ri.activate() + r_df_inference = robjects.conversion.py2ri(df_inference) + R_plot = robjects.globalenv['plot_coverage_lengths'] + R_plot(r_df_inference) def comparison_cvmetrics_selected(n=500, p=100, nval=500, rho=0.35, s=5, beta_type=1, snr=0.20, randomizer_scale=np.sqrt(0.50), full_dispersion=True, - tuning_nonrand="lambda.min", tuning_rand="lambda.1se"): + tuning_nonrand="lambda.min", tuning_rand="lambda.1se", + plot=False): X, y, _, _, Sigma, beta, sigma = sim_xy(n=n, p=p, nval=nval, rho=rho, s=s, beta_type=beta_type, snr=snr) true_mean = X.dot(beta) @@ -640,5 +813,10 @@ def main(n=500, p=100, rho=0.35, s=5, beta_type=1, snr_values=np.array([0.15, 0. 
df_selective_inference.to_html(outfile_inf_html) df_risk.to_html(outfile_risk_html) + if plot is True: + plotRisk(df_risk) + plotCoveragePower(df_selective_inference) + + if __name__ == "__main__": main() From c91670c8a2bd2c703742e4109d5f82b1f5b0c96b Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 24 Jun 2020 10:46:38 -0700 Subject: [PATCH 044/187] including sim_xy directly in script --- selectinf/randomized/tests/test_cv_mle.py | 116 ++++++++++++++++++++-- 1 file changed, 107 insertions(+), 9 deletions(-) diff --git a/selectinf/randomized/tests/test_cv_mle.py b/selectinf/randomized/tests/test_cv_mle.py index bd356ef6b..7c8e16c64 100644 --- a/selectinf/randomized/tests/test_cv_mle.py +++ b/selectinf/randomized/tests/test_cv_mle.py @@ -7,14 +7,107 @@ from scipy.stats import norm as ndist from selectinf.randomized.lasso import lasso, full_targets, selected_targets, debiased_targets -#from selection.algorithms.lasso import lasso as lasso_full +from selectinf.algorithms.lasso import ROSI def sim_xy(n, p, nval, rho=0, s=5, beta_type=2, snr=1): rpy.r(''' - source('~/best-subset/bestsubset/R/sim.R') - sim_xy = sim.xy - ''') + + #' Predictors and responses generation. + #' + #' Generate a predictor matrix x, and response vector y, following a specified + #' setup. Actually, two pairs of predictors and responses are generated: + #' one for training, and one for validation. + #' + #' @param n,p The number of training observations, and the number of predictors. + #' @param nval The number of validation observations. + #' @param rho Parameter that drives pairwise correlations of the predictor + #' variables; specifically, predictors i and j have population correlation + #' rho^abs(i-j). Default is 0. + #' @param s number of nonzero coefficients in the underlying regression model. + #' Default is 5. (Ignored if beta.type is 4, in which case the number of + #' nonzero coefficients is 6; and if beta.type is 5, it is interpreted as a + #' the number of strongly nonzero coefficients in a weak sparsity model.) + #' @param beta.type Integer taking values in between 1 and 5, used to specify + #' the pattern of nonzero coefficients in the underlying regression model; see + #' details below. Default is 1. + #' @param snr Desired signal-to-noise ratio (SNR), i.e., var(mu)/sigma^2 where + #' mu is mean and sigma^2 is the error variance. The error variance is set so + #' that the given SNR is achieved. Default is 1. + #' @return A list with the following components: x, y, xval, yval, Sigma, beta, + #' and sigma. + #' + #' @details The data model is: \eqn{Y \sim N(X\beta, \sigma^2 I)}. + #' The predictor variables have covariance matrix Sigma, with (i,j)th entry + #' rho^abs(i-j). The error variance sigma^2 is set according to the desired + #' signal-to-noise ratio. 
The first 4 options for the nonzero pattern + #' of the underlying regression coefficients beta follow the simulation setup + #' in Bertsimas, King, and Mazumder (2016), and the 5th is a weak sparsity + #' option: + #' \itemize{ + #' \item 1: beta has s components of 1, occurring at (roughly) equally-spaced + #' indices in between 1 and p + #' \item 2: beta has its first s components equal to 1 + #' \item 3: beta has its first s components taking nonzero values, where the + #' decay in a linear fashion from 10 to 0.5 + #' \item 4: beta has its first 6 components taking the nonzero values -10,-6, + #' -2,2,6,10 + #' \item 5: beta has its first s components equal to 1, and the rest decaying + #' to zero at an exponential rate + #' } + #' + #' @author Trevor Hastie, Rob Tibshirani, Ryan Tibshirani + #' @references Simulation setup based on "Best subset selection via a modern + #' optimization lens" by Dimitris Bertsimas, Angela King, and Rahul Mazumder, + #' Annals of Statistics, 44(2), 813-852, 2016. + #' @example examples/ex.fs.R + #' @export sim.xy + + sim.xy = function(n, p, nval, rho=0, s=5, beta.type=1, snr=1) { + # Generate predictors + x = matrix(rnorm(n*p),n,p) + xval = matrix(rnorm(nval*p),nval,p) + + # Introduce autocorrelation, if needed + if (rho != 0) { + inds = 1:p + Sigma = rho^abs(outer(inds, inds, "-")) + obj = svd(Sigma) + Sigma.half = obj$u %*% (sqrt(diag(obj$d))) %*% t(obj$v) + x = x %*% Sigma.half + xval = xval %*% Sigma.half + } + else Sigma = diag(1,p) + + # Generate underlying coefficients + s = min(s,p) + beta = rep(0,p) + if (beta.type==1) { + beta[round(seq(1,p,length=s))] = 1 + } else if (beta.type==2) { + beta[1:s] = 1 + } else if (beta.type==3) { + beta[1:s] = seq(10,0.5,length=s) + } else if (beta.type==4) { + beta[1:6] = c(-10,-6,-2,2,6,10) + } else { + beta[1:s] = 1 + beta[(s+1):p] = 0.5^(1:(p-s)) + } + + # Set snr based on sample variance on infinitely large test set + vmu = as.numeric(t(beta) %*% Sigma %*% beta) + sigma = sqrt(vmu/snr) + + # Generate responses + y = as.numeric(x %*% beta + rnorm(n)*sigma) + yval = as.numeric(xval %*% beta + rnorm(nval)*sigma) + + list(x=x,y=y,xval=xval,yval=yval,Sigma=Sigma,beta=beta,sigma=sigma) + } + + sim_xy = sim.xy + ''') r_simulate = rpy.globalenv['sim_xy'] sim = r_simulate(n, p, nval, rho, s, beta_type, snr) @@ -113,7 +206,7 @@ def BHfilter(pval, q=0.2): def relative_risk(est, truth, Sigma): - if (truth != 0).sum > 0: + if (truth != 0).sum() > 0: return (est - truth).T.dot(Sigma).dot(est - truth) / truth.T.dot(Sigma).dot(truth) else: return (est - truth).T.dot(Sigma).dot(est - truth) @@ -337,8 +430,11 @@ def comparison_cvmetrics_selected(n=500, p=100, nval=500, rho=0.35, s=5, beta_ty post_LASSO_OLS = np.linalg.pinv(X[:, active_LASSO]).dot(y) rel_LASSO[active_LASSO] = post_LASSO_OLS Lee_target = np.linalg.pinv(X[:, active_LASSO]).dot(X.dot(beta)) - Lee_intervals, Lee_pval = selInf_R(X, y, glm_LASSO, n * lam_LASSO, sigma_, Type=0, alpha=0.1) - + try: + Lee_intervals, Lee_pval = selInf_R(X, y, glm_LASSO, n * lam_LASSO, sigma_, Type=0, alpha=0.1) + except: + Lee_intervals, Lee_pval = np.array([]), np.array([]) + if (Lee_pval.shape[0] == Lee_target.shape[0]): cov_Lee, selective_Lee_power = coverage(Lee_intervals, Lee_pval, Lee_target, beta[active_LASSO]) @@ -580,7 +676,8 @@ def comparison_cvmetrics_full(n=500, p=100, nval=500, rho=0.35, s=5, beta_type=1 Lee_discoveries = np.zeros(1) partial_Lasso_risk, partial_relLasso_risk = [0., 0.] 
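    # Illustrative note: `ROSI` in the replacement below stands in for the
    # previously commented-out `lasso_full` import (see the import change at the
    # top of this diff); the fitted object is then used further down via
    # `lasso_Liu.fit()` and
    # `lasso_Liu.summary(level=0.90, compute_intervals=True, dispersion=dispersion)`.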
- lasso_Liu = lasso_full.gaussian(X, y, n * lam_LASSO) + lasso_Liu = ROSI.gaussian(X, y, n * lam_LASSO) + print(type(lasso_Liu)) Lasso_soln_Liu = lasso_Liu.fit() active_set_Liu = np.nonzero(Lasso_soln_Liu != 0)[0] nactive_Liu = active_set_Liu.shape[0] @@ -705,7 +802,7 @@ def comparison_cvmetrics_full(n=500, p=100, nval=500, rho=0.35, s=5, beta_type=1 def main(n=500, p=100, rho=0.35, s=5, beta_type=1, snr_values=np.array([0.15, 0.20, 0.31]), target="selected", tuning_nonrand="lambda.1se", tuning_rand="lambda.1se", - randomizing_scale = np.sqrt(0.50), ndraw = 50, outpath = None): + randomizing_scale = np.sqrt(0.50), ndraw=2, outpath = None): df_selective_inference = pd.DataFrame() df_risk = pd.DataFrame() @@ -820,3 +917,4 @@ def main(n=500, p=100, rho=0.35, s=5, beta_type=1, snr_values=np.array([0.15, 0. if __name__ == "__main__": main() + main(target="full") From 7aab2a944f78f4cf0fca31233526d1117aa7350c Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 24 Jun 2020 14:54:52 -0700 Subject: [PATCH 045/187] renaming columns in output --- selectinf/algorithms/lasso.py | 14 +++++++------- selectinf/randomized/query.py | 14 +++++++------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/selectinf/algorithms/lasso.py b/selectinf/algorithms/lasso.py index f885eb964..674174510 100644 --- a/selectinf/algorithms/lasso.py +++ b/selectinf/algorithms/lasso.py @@ -323,7 +323,7 @@ def summary(self, df = pd.DataFrame(index=self.active, data=dict([(n, d) for n, d in zip(['variable', - 'pval', + 'pvalue', 'lasso', 'onestep', 'lower_confidence', @@ -1487,7 +1487,7 @@ def _data_carving_deprec(X, y, splitting_pvalues, splitting_intervals), L else: - pvalues = [p for _, p in L.summary("twosided")['pval']] + pvalues = [p for _, p in L.summary("twosided")['pvalue']] intervals = np.array([L.intervals['lower'], L.intervals['upper']]).T if splitting: splitting_pvalues = np.random.sample(len(pvalues)) @@ -1957,14 +1957,14 @@ def summary(self, Estimate of dispersion. Defaults to a Pearson's X^2 estimate in the relaxed model. truth : np.array - True values of each beta for selected variables. If not None, a column 'pval' are p-values + True values of each beta for selected variables. If not None, a column 'pvalue' are p-values computed under these corresponding null hypotheses. Returns ------- pval_summary : np.recarray Array with one entry per active variable. - Columns are 'variable', 'pval', 'lasso', 'onestep', 'lower_trunc', 'upper_trunc', 'sd'. + Columns are 'variable', 'pvalue', 'lasso', 'onestep', 'lower_trunc', 'upper_trunc', 'sd'. """ if len(self.active) > 0: @@ -2009,7 +2009,7 @@ def summary(self, df = pd.DataFrame(index=self.active, data=dict([(n, d) for n, d in zip(['variable', - 'pval', + 'pvalue', 'lasso', 'onestep', 'sd', @@ -2328,7 +2328,7 @@ def summary(self, level=0.05, ------- pval_summary : np.recarray Array with one entry per active variable. - Columns are 'variable', 'pval', 'lasso', 'onestep', 'lower_trunc', 'upper_trunc', 'sd'. + Columns are 'variable', 'pvalue', 'lasso', 'onestep', 'lower_trunc', 'upper_trunc', 'sd'. 
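        Examples
        --------
        A minimal, illustrative sketch of the call pattern this summary
        supports, based on the usage in test_cv_mle.py from this patch series;
        ``X``, ``y``, ``lam`` and ``dispersion`` below are placeholders, and the
        confidence columns assume ``compute_intervals=True``::

            L = ROSI.gaussian(X, y, lam)
            L.fit()
            df = L.summary(level=0.90, compute_intervals=True,
                           dispersion=dispersion)
            pvalues = np.asarray(df['pvalue'])
            intervals = np.asarray(df[['lower_confidence', 'upper_confidence']])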
""" if len(self.active) > 0: @@ -2366,7 +2366,7 @@ def summary(self, level=0.05, df = pd.DataFrame(index=self.active, data=dict([(n, d) for n, d in zip(['variable', - 'pval', + 'pvalue', 'lasso', 'onestep', 'sd', diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index c703afa8c..19fb677bb 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -194,7 +194,7 @@ def summary(self, observed_target, target_cov, target_score_cov)[0] - MLE_intervals = np.asarray(MLE[['lower', 'upper']]) + MLE_intervals = np.asarray(MLE[['lower_confidence', 'upper_confidence']]) intervals = self.sampler.confidence_intervals( observed_target, @@ -205,8 +205,8 @@ def summary(self, initial_guess=MLE_intervals, level=level) - result.insert(2, 'lower', intervals[:,0]) - result.insert(3, 'upper', intervals[:,1]) + result.insert(2, 'lower_confidence', intervals[:,0]) + result.insert(3, 'upper_confidence', intervals[:,1]) if not np.all(parameter == 0): result.insert(4, 'pivot', pivots) @@ -518,8 +518,8 @@ def summary(self, result = pd.DataFrame({'target':observed_target, 'pvalue':pvalues, - 'lower':intervals[:,0], - 'upper':intervals[:,1]}) + 'lower_confidence':intervals[:,0], + 'upper_confidence':intervals[:,1]}) if not np.all(parameter == 0): result.insert(4, 'pivot', pivots) @@ -1684,8 +1684,8 @@ def selective_MLE(observed_target, 'SE':np.sqrt(np.diag(observed_info_mean)), 'Zvalue':Z_scores, 'pvalue':pvalues, - 'lower':intervals[:,0], - 'upper':intervals[:,1], + 'lower_confidence':intervals[:,0], + 'upper_confidence':intervals[:,1], 'unbiased':ind_unbiased_estimator}) return result, observed_info_mean, log_ref From 0d3b6a55f61f3e93a94ee793be79d5ab394f631f Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 24 Jun 2020 14:55:44 -0700 Subject: [PATCH 046/187] updates on mle script --- selectinf/randomized/tests/test_cv_mle.py | 160 +++++++++++++++++----- 1 file changed, 127 insertions(+), 33 deletions(-) diff --git a/selectinf/randomized/tests/test_cv_mle.py b/selectinf/randomized/tests/test_cv_mle.py index 7c8e16c64..f7a1bbc19 100644 --- a/selectinf/randomized/tests/test_cv_mle.py +++ b/selectinf/randomized/tests/test_cv_mle.py @@ -1,9 +1,13 @@ +from __future__ import division + import numpy as np, os, itertools import pandas as pd import rpy2.robjects as rpy from rpy2.robjects import numpy2ri rpy.numpy2ri.activate() +from rpy2.robjects import pandas2ri +from rpy2.robjects.conversion import localconverter from scipy.stats import norm as ndist from selectinf.randomized.lasso import lasso, full_targets, selected_targets, debiased_targets @@ -220,7 +224,7 @@ def plotRisk(df_risk): library("tidyr") library("dplyr") - plot_risk <- function(df_risk, outpath="/Users/psnigdha/adjusted_MLE/plots/", resolution=300, height= 7.5, width=15) + plot_risk <- function(df_risk, outpath="plots/", resolution=300, height= 7.5, width=15) { date = 1:length(unique(df_risk$snr)) df_risk = filter(df_risk, metric == "Full") @@ -244,8 +248,9 @@ def plotRisk(df_risk): ggsave(outfile, plot = risk, dpi=resolution, dev='png', height=height, width=width, units="cm")} """) - robjects.pandas2ri.activate() - r_df_risk = robjects.conversion.py2ri(df_risk) + #pandas2ri.activate() + with localconverter(robjects.default_converter + pandas2ri.converter): + r_df_risk = robjects.conversion.py2rpy(df_risk) R_plot = robjects.globalenv['plot_risk'] R_plot(r_df_risk) @@ -259,7 +264,7 @@ def plotCoveragePower(df_inference): library("cowplot") library("dplyr") - plot_coverage_lengths <- function(df_inference, 
outpath="/Users/psnigdha/adjusted_MLE/plots/", + plot_coverage_lengths <- function(df_inference, outpath="plots/", resolution=200, height_plot1= 6.5, width_plot1=12, height_plot2=13, width_plot2=13) { @@ -379,21 +384,41 @@ def plotCoveragePower(df_inference): ggsave(outfile, plot = p, dpi=resolution, dev='png', height=height_plot2, width=width_plot2, units="cm")} """) - robjects.pandas2ri.activate() - r_df_inference = robjects.conversion.py2ri(df_inference) + #pandas2ri.activate() + with localconverter(robjects.default_converter + pandas2ri.converter): + r_df_inference = robjects.conversion.py2rpy(df_inference) R_plot = robjects.globalenv['plot_coverage_lengths'] R_plot(r_df_inference) -def comparison_cvmetrics_selected(n=500, p=100, nval=500, rho=0.35, s=5, beta_type=1, snr=0.20, - randomizer_scale=np.sqrt(0.50), full_dispersion=True, - tuning_nonrand="lambda.min", tuning_rand="lambda.1se", - plot=False): - - X, y, _, _, Sigma, beta, sigma = sim_xy(n=n, p=p, nval=nval, rho=rho, s=s, beta_type=beta_type, snr=snr) +def comparison_cvmetrics_selected(n=500, + p=100, + nval=500, + rho=0.35, + s=5, + beta_type=1, + snr=0.20, + randomizer_scale=np.sqrt(0.50), + full_dispersion=True, + tuning_nonrand="lambda.min", + tuning_rand="lambda.1se"): + + (X, + y, + _, + _, + Sigma, + beta, + sigma) = sim_xy(n=n, + p=p, + nval=nval, + rho=rho, + s=s, + beta_type=beta_type, + snr=snr) true_mean = X.dot(beta) - print("snr", snr) + X -= X.mean(0)[None, :] - X /= (X.std(0)[None, :] * np.sqrt(n / (n - 1.))) + X /= (X.std(0)[None, :] * np.sqrt(n / (n - 1))) y = y - y.mean() true_set = np.asarray([u for u in range(p) if beta[u] != 0]) @@ -405,8 +430,17 @@ def comparison_cvmetrics_selected(n=500, p=100, nval=500, rho=0.35, s=5, beta_ty sigma_ = np.std(y) print("estimated and true sigma", sigma, sigma_) - lam_theory = sigma_ * 1. * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) - glm_LASSO_theory, glm_LASSO_1se, glm_LASSO_min, lam_min, lam_1se = glmnet_lasso(X, y, lam_theory/float(n)) + lam_theory = sigma_ * 1. * np.mean(np.fabs(np.dot(X.T, + np.random.standard_normal((n, + 2000)))).max(0)) + (glm_LASSO_theory, + glm_LASSO_1se, + glm_LASSO_min, + lam_min, + lam_1se) = glmnet_lasso(X, + y, + lam_theory/float(n)) + if tuning_nonrand == "lambda.min": lam_LASSO = lam_min glm_LASSO = glm_LASSO_min @@ -416,6 +450,7 @@ def comparison_cvmetrics_selected(n=500, p=100, nval=500, rho=0.35, s=5, beta_ty else: lam_LASSO = lam_theory/float(n) glm_LASSO = glm_LASSO_theory + active_LASSO = (glm_LASSO != 0) nactive_LASSO = active_LASSO.sum() active_set_LASSO = np.asarray([r for r in range(p) if active_LASSO[r]]) @@ -802,7 +837,7 @@ def comparison_cvmetrics_full(n=500, p=100, nval=500, rho=0.35, s=5, beta_type=1 def main(n=500, p=100, rho=0.35, s=5, beta_type=1, snr_values=np.array([0.15, 0.20, 0.31]), target="selected", tuning_nonrand="lambda.1se", tuning_rand="lambda.1se", - randomizing_scale = np.sqrt(0.50), ndraw=2, outpath = None): + randomizing_scale = np.sqrt(0.50), ndraw=4, outpath = None, plot=True): df_selective_inference = pd.DataFrame() df_risk = pd.DataFrame() @@ -820,14 +855,30 @@ def main(n=500, p=100, rho=0.35, s=5, beta_type=1, snr_values=np.array([0.15, 0. 
output_overall = np.zeros(55) if target == "selected": for i in range(ndraw): - output_overall += np.squeeze(comparison_cvmetrics_selected(n=n, p=p, nval=n, rho=rho, s=s, beta_type=beta_type, snr=snr, - randomizer_scale=randomizing_scale, full_dispersion=full_dispersion, - tuning_nonrand =tuning_nonrand, tuning_rand=tuning_rand)) + output_overall += np.squeeze(comparison_cvmetrics_selected(n=n, + p=p, + nval=n, + rho=rho, + s=s, + beta_type=beta_type, + snr=snr, + randomizer_scale=randomizing_scale, + full_dispersion=full_dispersion, + tuning_nonrand =tuning_nonrand, + tuning_rand=tuning_rand)) elif target == "full": for i in range(ndraw): - output_overall += np.squeeze(comparison_cvmetrics_full(n=n, p=p, nval=n, rho=rho, s=s, beta_type=beta_type, snr=snr, - randomizer_scale=randomizing_scale, full_dispersion=full_dispersion, - tuning_nonrand =tuning_nonrand, tuning_rand=tuning_rand)) + output_overall += np.squeeze(comparison_cvmetrics_full(n=n, + p=p, + nval=n, + rho=rho, + s=s, + beta_type=beta_type, + snr=snr, + randomizer_scale=randomizing_scale, + full_dispersion=full_dispersion, + tuning_nonrand =tuning_nonrand, + tuning_rand=tuning_rand)) nLee = output_overall[52] nLiu = output_overall[53] @@ -852,24 +903,66 @@ def main(n=500, p=100, rho=0.35, s=5, beta_type=1, snr_values=np.array([0.15, 0. nonrandomized_Liu_inf[nonrandomized_Liu_inf == 0] = 'NaN' nonrandomized_Lee_inf[nonrandomized_Lee_inf == 0] = 'NaN' - df_naive = pd.DataFrame(data=nonrandomized_naive_inf,columns=['coverage', 'length', 'prop-infty', 'tot-active', 'bias', 'sel-power', - 'power', 'power-BH', 'fdr-BH','tot-discoveries']) + df_naive = pd.DataFrame(data=nonrandomized_naive_inf,columns=['coverage', + 'length', + 'prop-infty', + 'tot-active', + 'bias', + 'sel-power', + 'power', + 'power-BH', + 'fdr-BH', + 'tot-discoveries']) df_naive['method'] = "Naive" - df_Lee = pd.DataFrame(data=nonrandomized_Lee_inf, columns=['coverage', 'length', 'prop-infty','tot-active','bias', 'sel-power', - 'power', 'power-BH', 'fdr-BH','tot-discoveries']) + df_Lee = pd.DataFrame(data=nonrandomized_Lee_inf, columns=['coverage', + 'length', + 'prop-infty', + 'tot-active', + 'bias', + 'sel-power', + 'power', + 'power-BH', + 'fdr-BH', + 'tot-discoveries']) df_Lee['method'] = "Lee" - df_Liu = pd.DataFrame(data=nonrandomized_Liu_inf,columns=['coverage', 'length', 'prop-infty', 'tot-active','bias', 'sel-power', - 'power', 'power-BH', 'fdr-BH', 'tot-discoveries']) + df_Liu = pd.DataFrame(data=nonrandomized_Liu_inf,columns=['coverage', + 'length', + 'prop-infty', + 'tot-active', + 'bias', + 'sel-power', + 'power', + 'power-BH', + 'fdr-BH', + 'tot-discoveries']) df_Liu['method'] = "Liu" - df_MLE = pd.DataFrame(data=randomized_MLE_inf, columns=['coverage', 'length', 'prop-infty', 'tot-active','bias', 'sel-power', - 'power', 'power-BH', 'fdr-BH', 'tot-discoveries']) + df_MLE = pd.DataFrame(data=randomized_MLE_inf, columns=['coverage', + 'length', + 'prop-infty', + 'tot-active', + 'bias', + 'sel-power', + 'power', + 'power-BH', + 'fdr-BH', + 'tot-discoveries']) df_MLE['method'] = "MLE" - df_risk_metrics = pd.DataFrame(data=relative_risk, columns=['sel-MLE', 'ind-est', 'rand-LASSO','rel-rand-LASSO', 'rel-LASSO', 'LASSO']) + df_risk_metrics = pd.DataFrame(data=relative_risk, columns=['sel-MLE', + 'ind-est', + 'rand-LASSO', + 'rel-rand-LASSO', + 'rel-LASSO', + 'LASSO']) df_risk_metrics['metric'] = "Full" - df_prisk_metrics = pd.DataFrame(data=partial_risk,columns=['sel-MLE', 'ind-est', 'rand-LASSO', 'rel-rand-LASSO', 'rel-LASSO','LASSO']) + df_prisk_metrics = 
pd.DataFrame(data=partial_risk,columns=['sel-MLE', + 'ind-est', + 'rand-LASSO', + 'rel-rand-LASSO', + 'rel-LASSO', + 'LASSO']) df_prisk_metrics['metric'] = "Partial" df_selective_inference = df_selective_inference.append(df_naive, ignore_index=True) @@ -910,6 +1003,7 @@ def main(n=500, p=100, rho=0.35, s=5, beta_type=1, snr_values=np.array([0.15, 0. df_selective_inference.to_html(outfile_inf_html) df_risk.to_html(outfile_risk_html) + stop if plot is True: plotRisk(df_risk) plotCoveragePower(df_selective_inference) From e77d9477cc188fd1a4fd47df1a3875a640e6a489 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 24 Jun 2020 23:06:12 -0700 Subject: [PATCH 047/187] adding a dispersion option to Lee et al lasso --- selectinf/algorithms/lasso.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/selectinf/algorithms/lasso.py b/selectinf/algorithms/lasso.py index 674174510..26c8cb41b 100644 --- a/selectinf/algorithms/lasso.py +++ b/selectinf/algorithms/lasso.py @@ -242,6 +242,7 @@ def summary(self, alternative='twosided', level=0.95, compute_intervals=False, + dispersion=None, truth=None): """ Summary table for inference adjusted for selection. @@ -258,6 +259,9 @@ def summary(self, compute_intervals : bool Should we compute confidence intervals? + dispersion : float + Scalar to multiply `self.constraints.covaraince` + truth : np.array True values of each beta for selected variables. If not None, a column 'pval' are p-values computed under these corresponding null hypotheses. @@ -275,9 +279,14 @@ def summary(self, if truth is None: truth = np.zeros_like(self.active_signs) + if dispersion is None: + dispersion = 1. + result = [] - C = self._constraints + C = self.constraints if C is not None: + _cov = C.covariance.copy() + C.covariance = _cov * dispersion one_step = self.onestep_estimator for i in range(one_step.shape[0]): eta = np.zeros_like(one_step) @@ -296,7 +305,8 @@ def summary(self, if compute_intervals: if C.linear_part.shape[0] > 0: # there were some constraints try: - _interval = C.interval(eta, one_step, + _interval = C.interval(eta, + one_step, alpha=alpha) except OverflowError: _interval = (-np.inf, np.inf) @@ -320,7 +330,8 @@ def summary(self, lower_trunc, upper_trunc, sd)) - + C.covariance = _cov + df = pd.DataFrame(index=self.active, data=dict([(n, d) for n, d in zip(['variable', 'pvalue', @@ -2311,7 +2322,8 @@ def fit(self, self.inactive = np.arange(lasso_solution.shape[0]) return self.lasso_solution - def summary(self, level=0.05, + def summary(self, + level=0.95, compute_intervals=False, dispersion=None): """ From c3f38ab13c1a0eeb0f737c38bad574c4b84d195a Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 24 Jun 2020 23:06:37 -0700 Subject: [PATCH 048/187] a few minor changes to test_cv_mle script -- to be replaced by an example in compare-selection --- selectinf/randomized/tests/test_cv_mle.py | 30 +++++++++++------------ 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/selectinf/randomized/tests/test_cv_mle.py b/selectinf/randomized/tests/test_cv_mle.py index f7a1bbc19..30b8da4c5 100644 --- a/selectinf/randomized/tests/test_cv_mle.py +++ b/selectinf/randomized/tests/test_cv_mle.py @@ -167,11 +167,11 @@ def glmnet_lasso(X, y, lambda_val): lam = as.matrix(lambda)[1,1] n = nrow(X) - fit = glmnet(X, y, standardize=TRUE, intercept=FALSE, thresh=1.e-10) + fit = glmnet(X, y, standardize=FALSE, intercept=FALSE, thresh=1.e-10) estimate = coef(fit, s=lam, exact=TRUE, x=X, y=y)[-1] - fit.cv = cv.glmnet(X, y, standardize=TRUE, 
intercept=FALSE, thresh=1.e-10) - estimate.1se = coef(fit.cv, s='lambda.1se', exact=TRUE, x=X, y=y)[-1] - estimate.min = coef(fit.cv, s='lambda.min', exact=TRUE, x=X, y=y)[-1] + fit.cv = cv.glmnet(X, y, standardize=FALSE, intercept=FALSE, thresh=1.e-10) + estimate.1se = coef(fit, s=fit.cv$lambda.1se, exact=TRUE, x=X, y=y)[-1] + estimate.min = coef(fit, s=fit.cv$lambda.min, exact=TRUE, x=X, y=y)[-1] return(list(estimate = estimate, estimate.1se = estimate.1se, estimate.min = estimate.min, lam.min = fit.cv$lambda.min, lam.1se = fit.cv$lambda.1se)) }''') @@ -181,11 +181,12 @@ def glmnet_lasso(X, y, lambda_val): r_y = rpy.r.matrix(y, nrow=n, ncol=1) r_lam = rpy.r.matrix(lambda_val, nrow=1, ncol=1) - estimate = np.array(lambda_R(r_X, r_y, r_lam).rx2('estimate')) - estimate_1se = np.array(lambda_R(r_X, r_y, r_lam).rx2('estimate.1se')) - estimate_min = np.array(lambda_R(r_X, r_y, r_lam).rx2('estimate.min')) - lam_min = np.asscalar(np.array(lambda_R(r_X, r_y, r_lam).rx2('lam.min'))) - lam_1se = np.asscalar(np.array(lambda_R(r_X, r_y, r_lam).rx2('lam.1se'))) + val = lambda_R(r_X, r_y, r_lam) + estimate = np.array(val.rx2('estimate')) + estimate_1se = np.array(val.rx2('estimate.1se')) + estimate_min = np.array(val.rx2('estimate.min')) + lam_min = np.asscalar(np.array(val.rx2('lam.min'))) + lam_1se = np.asscalar(np.array(val.rx2('lam.1se'))) return estimate, estimate_1se, estimate_min, lam_min, lam_1se @@ -563,7 +564,7 @@ def comparison_cvmetrics_selected(n=500, sel_MLE[nonzero] = MLE_estimate ind_est[nonzero] = ind_unbiased_estimator - MLE_intervals = np.asarray(result[['lower', 'upper']]) + MLE_intervals = np.asarray(result[['lower_confidence', 'upper_confidence']]) MLE_pval = np.asarray(result['pvalue']) randomized_lasso_est = randomized_lasso.initial_soln @@ -724,7 +725,7 @@ def comparison_cvmetrics_full(n=500, p=100, nval=500, rho=0.35, s=5, beta_type=1 df = lasso_Liu.summary(level=0.90, compute_intervals=True, dispersion=dispersion) Liu_lower, Liu_upper, Liu_pval = np.asarray(df['lower_confidence']), \ np.asarray(df['upper_confidence']), \ - np.asarray(df['pval']) + np.asarray(df['pvalue']) Liu_intervals = np.vstack((Liu_lower, Liu_upper)).T cov_Liu, selective_Liu_power = coverage(Liu_intervals, Liu_pval, Liu_target, beta[Lasso_soln_Liu != 0]) length_Liu = np.mean(Liu_intervals[:, 1] - Liu_intervals[:, 0]) @@ -783,7 +784,7 @@ def comparison_cvmetrics_full(n=500, p=100, nval=500, rho=0.35, s=5, beta_type=1 sel_MLE[nonzero] = MLE_estimate ind_est[nonzero] = ind_unbiased_estimator - MLE_intervals = np.asarray(result[['lower', 'upper']]) + MLE_intervals = np.asarray(result[['lower_confidence', 'upper_confidence']]) MLE_pval = np.asarray(result['pvalue']) randomized_lasso_est = randomized_lasso.initial_soln @@ -837,7 +838,7 @@ def comparison_cvmetrics_full(n=500, p=100, nval=500, rho=0.35, s=5, beta_type=1 def main(n=500, p=100, rho=0.35, s=5, beta_type=1, snr_values=np.array([0.15, 0.20, 0.31]), target="selected", tuning_nonrand="lambda.1se", tuning_rand="lambda.1se", - randomizing_scale = np.sqrt(0.50), ndraw=4, outpath = None, plot=True): + randomizing_scale = np.sqrt(0.50), ndraw=20, outpath = None, plot=True): df_selective_inference = pd.DataFrame() df_risk = pd.DataFrame() @@ -1003,7 +1004,6 @@ def main(n=500, p=100, rho=0.35, s=5, beta_type=1, snr_values=np.array([0.15, 0. 
df_selective_inference.to_html(outfile_inf_html) df_risk.to_html(outfile_risk_html) - stop if plot is True: plotRisk(df_risk) plotCoveragePower(df_selective_inference) @@ -1011,4 +1011,4 @@ def main(n=500, p=100, rho=0.35, s=5, beta_type=1, snr_values=np.array([0.15, 0. if __name__ == "__main__": main() - main(target="full") + From 7fd336a94086b23e86a18dc8ab4865af95e86b43 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 24 Jun 2020 23:08:02 -0700 Subject: [PATCH 049/187] script replaced by a notebook in compare-selection --- selectinf/randomized/tests/test_cv_mle.py | 1014 --------------------- 1 file changed, 1014 deletions(-) delete mode 100644 selectinf/randomized/tests/test_cv_mle.py diff --git a/selectinf/randomized/tests/test_cv_mle.py b/selectinf/randomized/tests/test_cv_mle.py deleted file mode 100644 index 30b8da4c5..000000000 --- a/selectinf/randomized/tests/test_cv_mle.py +++ /dev/null @@ -1,1014 +0,0 @@ -from __future__ import division - -import numpy as np, os, itertools -import pandas as pd - -import rpy2.robjects as rpy -from rpy2.robjects import numpy2ri -rpy.numpy2ri.activate() -from rpy2.robjects import pandas2ri -from rpy2.robjects.conversion import localconverter - -from scipy.stats import norm as ndist -from selectinf.randomized.lasso import lasso, full_targets, selected_targets, debiased_targets -from selectinf.algorithms.lasso import ROSI - -def sim_xy(n, p, nval, rho=0, s=5, beta_type=2, snr=1): - - rpy.r(''' - - #' Predictors and responses generation. - #' - #' Generate a predictor matrix x, and response vector y, following a specified - #' setup. Actually, two pairs of predictors and responses are generated: - #' one for training, and one for validation. - #' - #' @param n,p The number of training observations, and the number of predictors. - #' @param nval The number of validation observations. - #' @param rho Parameter that drives pairwise correlations of the predictor - #' variables; specifically, predictors i and j have population correlation - #' rho^abs(i-j). Default is 0. - #' @param s number of nonzero coefficients in the underlying regression model. - #' Default is 5. (Ignored if beta.type is 4, in which case the number of - #' nonzero coefficients is 6; and if beta.type is 5, it is interpreted as a - #' the number of strongly nonzero coefficients in a weak sparsity model.) - #' @param beta.type Integer taking values in between 1 and 5, used to specify - #' the pattern of nonzero coefficients in the underlying regression model; see - #' details below. Default is 1. - #' @param snr Desired signal-to-noise ratio (SNR), i.e., var(mu)/sigma^2 where - #' mu is mean and sigma^2 is the error variance. The error variance is set so - #' that the given SNR is achieved. Default is 1. - #' @return A list with the following components: x, y, xval, yval, Sigma, beta, - #' and sigma. - #' - #' @details The data model is: \eqn{Y \sim N(X\beta, \sigma^2 I)}. - #' The predictor variables have covariance matrix Sigma, with (i,j)th entry - #' rho^abs(i-j). The error variance sigma^2 is set according to the desired - #' signal-to-noise ratio. 
The first 4 options for the nonzero pattern - #' of the underlying regression coefficients beta follow the simulation setup - #' in Bertsimas, King, and Mazumder (2016), and the 5th is a weak sparsity - #' option: - #' \itemize{ - #' \item 1: beta has s components of 1, occurring at (roughly) equally-spaced - #' indices in between 1 and p - #' \item 2: beta has its first s components equal to 1 - #' \item 3: beta has its first s components taking nonzero values, where the - #' decay in a linear fashion from 10 to 0.5 - #' \item 4: beta has its first 6 components taking the nonzero values -10,-6, - #' -2,2,6,10 - #' \item 5: beta has its first s components equal to 1, and the rest decaying - #' to zero at an exponential rate - #' } - #' - #' @author Trevor Hastie, Rob Tibshirani, Ryan Tibshirani - #' @references Simulation setup based on "Best subset selection via a modern - #' optimization lens" by Dimitris Bertsimas, Angela King, and Rahul Mazumder, - #' Annals of Statistics, 44(2), 813-852, 2016. - #' @example examples/ex.fs.R - #' @export sim.xy - - sim.xy = function(n, p, nval, rho=0, s=5, beta.type=1, snr=1) { - # Generate predictors - x = matrix(rnorm(n*p),n,p) - xval = matrix(rnorm(nval*p),nval,p) - - # Introduce autocorrelation, if needed - if (rho != 0) { - inds = 1:p - Sigma = rho^abs(outer(inds, inds, "-")) - obj = svd(Sigma) - Sigma.half = obj$u %*% (sqrt(diag(obj$d))) %*% t(obj$v) - x = x %*% Sigma.half - xval = xval %*% Sigma.half - } - else Sigma = diag(1,p) - - # Generate underlying coefficients - s = min(s,p) - beta = rep(0,p) - if (beta.type==1) { - beta[round(seq(1,p,length=s))] = 1 - } else if (beta.type==2) { - beta[1:s] = 1 - } else if (beta.type==3) { - beta[1:s] = seq(10,0.5,length=s) - } else if (beta.type==4) { - beta[1:6] = c(-10,-6,-2,2,6,10) - } else { - beta[1:s] = 1 - beta[(s+1):p] = 0.5^(1:(p-s)) - } - - # Set snr based on sample variance on infinitely large test set - vmu = as.numeric(t(beta) %*% Sigma %*% beta) - sigma = sqrt(vmu/snr) - - # Generate responses - y = as.numeric(x %*% beta + rnorm(n)*sigma) - yval = as.numeric(xval %*% beta + rnorm(nval)*sigma) - - list(x=x,y=y,xval=xval,yval=yval,Sigma=Sigma,beta=beta,sigma=sigma) - } - - sim_xy = sim.xy - ''') - - r_simulate = rpy.globalenv['sim_xy'] - sim = r_simulate(n, p, nval, rho, s, beta_type, snr) - X = np.array(sim.rx2('x')) - y = np.array(sim.rx2('y')) - X_val = np.array(sim.rx2('xval')) - y_val = np.array(sim.rx2('yval')) - Sigma = np.array(sim.rx2('Sigma')) - beta = np.array(sim.rx2('beta')) - sigma = np.array(sim.rx2('sigma')) - - return X, y, X_val, y_val, Sigma, beta, sigma - - -def selInf_R(X, y, beta, lam, sigma, Type, alpha=0.1): - rpy.r(''' - library("selectiveInference") - selInf = function(X, y, beta, lam, sigma, Type, alpha= 0.1){ - y = as.matrix(y) - X = as.matrix(X) - beta = as.matrix(beta) - lam = as.matrix(lam)[1,1] - sigma = as.matrix(sigma)[1,1] - Type = as.matrix(Type)[1,1] - if(Type == 1){ - type = "full"} else{ - type = "partial"} - inf = fixedLassoInf(x = X, y = y, beta = beta, lambda=lam, family = "gaussian", - intercept=FALSE, sigma=sigma, alpha=alpha, type=type) - return(list(ci = inf$ci, pvalue = inf$pv))} - ''') - - inf_R = rpy.globalenv['selInf'] - n, p = X.shape - r_X = rpy.r.matrix(X, nrow=n, ncol=p) - r_y = rpy.r.matrix(y, nrow=n, ncol=1) - r_beta = rpy.r.matrix(beta, nrow=p, ncol=1) - r_lam = rpy.r.matrix(lam, nrow=1, ncol=1) - r_sigma = rpy.r.matrix(sigma, nrow=1, ncol=1) - r_Type = rpy.r.matrix(Type, nrow=1, ncol=1) - output = inf_R(r_X, r_y, r_beta, r_lam, 
r_sigma, r_Type) - ci = np.array(output.rx2('ci')) - pvalue = np.array(output.rx2('pvalue')) - return ci, pvalue - - -def glmnet_lasso(X, y, lambda_val): - rpy.r(''' - library(glmnet) - glmnet_LASSO = function(X,y, lambda){ - y = as.matrix(y) - X = as.matrix(X) - lam = as.matrix(lambda)[1,1] - n = nrow(X) - - fit = glmnet(X, y, standardize=FALSE, intercept=FALSE, thresh=1.e-10) - estimate = coef(fit, s=lam, exact=TRUE, x=X, y=y)[-1] - fit.cv = cv.glmnet(X, y, standardize=FALSE, intercept=FALSE, thresh=1.e-10) - estimate.1se = coef(fit, s=fit.cv$lambda.1se, exact=TRUE, x=X, y=y)[-1] - estimate.min = coef(fit, s=fit.cv$lambda.min, exact=TRUE, x=X, y=y)[-1] - return(list(estimate = estimate, estimate.1se = estimate.1se, estimate.min = estimate.min, lam.min = fit.cv$lambda.min, lam.1se = fit.cv$lambda.1se)) - }''') - - lambda_R = rpy.globalenv['glmnet_LASSO'] - n, p = X.shape - r_X = rpy.r.matrix(X, nrow=n, ncol=p) - r_y = rpy.r.matrix(y, nrow=n, ncol=1) - r_lam = rpy.r.matrix(lambda_val, nrow=1, ncol=1) - - val = lambda_R(r_X, r_y, r_lam) - estimate = np.array(val.rx2('estimate')) - estimate_1se = np.array(val.rx2('estimate.1se')) - estimate_min = np.array(val.rx2('estimate.min')) - lam_min = np.asscalar(np.array(val.rx2('lam.min'))) - lam_1se = np.asscalar(np.array(val.rx2('lam.1se'))) - return estimate, estimate_1se, estimate_min, lam_min, lam_1se - - -def coverage(intervals, pval, target, truth): - pval_alt = (pval[truth != 0]) < 0.1 - if pval_alt.sum() > 0: - avg_power = np.mean(pval_alt) - else: - avg_power = 0. - return np.mean((target > intervals[:, 0]) * (target < intervals[:, 1])), avg_power - - -def BHfilter(pval, q=0.2): - rpy.r.assign('pval', pval) - rpy.r.assign('q', q) - rpy.r('Pval = p.adjust(pval, method="BH")') - rpy.r('S = which((Pval < q)) - 1') - S = rpy.r('S') - ind = np.zeros(pval.shape[0], np.bool) - ind[np.asarray(S, np.int)] = 1 - return ind - - -def relative_risk(est, truth, Sigma): - if (truth != 0).sum() > 0: - return (est - truth).T.dot(Sigma).dot(est - truth) / truth.T.dot(Sigma).dot(truth) - else: - return (est - truth).T.dot(Sigma).dot(est - truth) - -from rpy2 import robjects - -def plotRisk(df_risk): - robjects.r(""" - library("ggplot2") - library("magrittr") - library("tidyr") - library("dplyr") - - plot_risk <- function(df_risk, outpath="plots/", resolution=300, height= 7.5, width=15) - { - date = 1:length(unique(df_risk$snr)) - df_risk = filter(df_risk, metric == "Full") - df = cbind(df_risk, date) - risk = df %>% - gather(key, value, sel.MLE, rand.LASSO, LASSO) %>% - ggplot(aes(x=date, y=value, colour=key, shape=key, linetype=key)) + - geom_point(size=3) + - geom_line(aes(linetype=key), size=1) + - ylim(0.01,1.2)+ - labs(y="relative risk", x = "Signal regimes: snr") + - scale_x_continuous(breaks=1:length(unique(df_risk$snr)), label = sapply(df_risk$snr, toString)) + - theme(legend.position="top", legend.title = element_blank()) - indices = sort(c("sel.MLE", "rand.LASSO", "LASSO"), index.return= TRUE)$ix - names = c("sel-MLE", "rand-LASSO", "LASSO") - risk = risk + scale_color_manual(labels = names[indices], values=c("#008B8B", "#104E8B","#B22222")[indices]) + - scale_shape_manual(labels = names[indices], values=c(15, 17, 16)[indices]) + - scale_linetype_manual(labels = names[indices], values = c(1,1,2)[indices]) - outfile = paste(outpath, 'risk.png', sep="") - outfile = paste(outpath, 'risk.png', sep="") - ggsave(outfile, plot = risk, dpi=resolution, dev='png', height=height, width=width, units="cm")} - """) - - #pandas2ri.activate() - with 
localconverter(robjects.default_converter + pandas2ri.converter): - r_df_risk = robjects.conversion.py2rpy(df_risk) - R_plot = robjects.globalenv['plot_risk'] - R_plot(r_df_risk) - - -def plotCoveragePower(df_inference): - robjects.r(""" - library("ggplot2") - library("magrittr") - library("tidyr") - library("reshape") - library("cowplot") - library("dplyr") - - plot_coverage_lengths <- function(df_inference, outpath="plots/", - resolution=200, height_plot1= 6.5, width_plot1=12, - height_plot2=13, width_plot2=13) - { - snr.len = length(unique(df_inference$snr)) - df_inference = arrange(df_inference, method) - target = toString(df_inference$target[1]) - df = data.frame(snr = sapply(unique(df_inference$snr), toString), - MLE = 100*df_inference$coverage[((2*snr.len)+1):(3*snr.len)], - Lee = 100*df_inference$coverage[1:snr.len], - Naive = 100*df_inference$coverage[((3*snr.len)+1):(4*snr.len)]) - if(target== "selected"){ - data.m <- melt(df, id.vars='snr') - coverage = ggplot(data.m, aes(snr, value)) + - geom_bar(aes(fill = variable), width = 0.4, position = position_dodge(width=0.5), stat="identity") + - geom_hline(yintercept = 90, linetype="dotted") + - labs(y="coverage: partial", x = "Signal regimes: snr") + - theme(legend.position="top", - legend.title = element_blank()) - coverage = coverage + - scale_fill_manual(labels = c("MLE-based","Lee", "Naive"), values=c("#008B8B", "#B22222", "#FF6347"))} else{ - df = cbind(df, Liu = 100*df_inference$coverage[((snr.len)+1):(2*snr.len)]) - df <- df[c("snr", "MLE", "Liu", "Lee", "Naive")] - data.m <- melt(df, id.vars='snr') - coverage = ggplot(data.m, aes(snr, value)) + - geom_bar(aes(fill = variable), width = 0.4, position = position_dodge(width=0.5), stat="identity") + - geom_hline(yintercept = 90, linetype="dotted") + - labs(y="coverage: full", x = "Signal regimes: snr") + - theme(legend.position="top", legend.title = element_blank()) - coverage = coverage + - scale_fill_manual(labels = c("MLE-based", "Liu", "Lee", "Naive"), values=c("#008B8B", "#104E8B", "#B22222", "#FF6347"))} - - outfile = paste(outpath, 'coverage.png', sep="") - ggsave(outfile, plot = coverage, dpi=resolution, dev='png', height=height_plot1, width=width_plot1, units="cm") - - df = data.frame(snr = sapply(unique(df_inference$snr), toString), - MLE = 100*df_inference$sel.power[((2*snr.len)+1):(3*snr.len)], - Lee = 100*df_inference$sel.power[1:snr.len]) - if(target== "selected"){ - data.m <- melt(df, id.vars='snr') - sel_power = ggplot(data.m, aes(snr, value)) + - geom_bar(aes(fill = variable), width = 0.4, position = position_dodge(width=0.5), stat="identity") + - labs(y="power: partial", x = "Signal regimes: snr") + - theme(legend.position="top", legend.title = element_blank()) - sel_power = sel_power + scale_fill_manual(labels = c("MLE-based","Lee"), values=c("#008B8B", "#B22222"))} else{ - df = cbind(df, Liu = 100*df_inference$sel.power[((snr.len)+1):(2*snr.len)]) - df <- df[,c("snr", "MLE", "Liu", "Lee")] - data.m <- melt(df, id.vars='snr') - sel_power = ggplot(data.m, aes(snr, value)) + - geom_bar(aes(fill = variable), width = 0.4, position = position_dodge(width=0.5), stat="identity") + - labs(y="power: full", x = "Signal regimes: snr") + - theme(legend.position="top", legend.title = element_blank()) - sel_power = sel_power + scale_fill_manual(labels = c("MLE-based","Liu","Lee"), values=c("#008B8B", "#104E8B", "#B22222"))} - - outfile = paste(outpath, 'selective_power.png', sep="") - ggsave(outfile, plot = sel_power, dpi=resolution, dev='png', height=height_plot1, 
width=width_plot1, units="cm") - - if(target== "selected"){ - test_data <-data.frame(MLE = filter(df_inference, method == "MLE")$length, - Lee = filter(df_inference, method == "Lee")$length, - Naive = filter(df_inference, method == "Naive")$length, - date = 1:length(unique(df_inference$snr))) - lengths = test_data %>% - gather(key, value, MLE, Lee, Naive) %>% - ggplot(aes(x=date, y=value, colour=key, shape=key, linetype=key)) + - geom_point(size=3) + - geom_line(aes(linetype=key), size=1) + - ylim(0.,max(test_data$MLE, test_data$Lee, test_data$Naive) + 0.2)+ - labs(y="lengths:partial", x = "Signal regimes: snr") + - scale_x_continuous(breaks=1:length(unique(df_inference$snr)), label = sapply(unique(df_inference$snr), toString))+ - theme(legend.position="top", legend.title = element_blank()) - - indices = sort(c("MLE", "Lee", "Naive"), index.return= TRUE)$ix - names = c("MLE-based", "Lee", "Naive") - lengths = lengths + scale_color_manual(labels = names[indices], values=c("#008B8B","#B22222", "#FF6347")[indices]) + - scale_shape_manual(labels = names[indices], values=c(15, 17, 16)[indices]) + - scale_linetype_manual(labels = names[indices], values = c(1,1,2)[indices])} else{ - test_data <-data.frame(MLE = filter(df_inference, method == "MLE")$length, - Lee = filter(df_inference, method == "Lee")$length, - Naive = filter(df_inference, method == "Naive")$length, - Liu = filter(df_inference, method == "Liu")$length, - date = 1:length(unique(df_inference$snr))) - lengths= test_data %>% - gather(key, value, MLE, Lee, Naive, Liu) %>% - ggplot(aes(x=date, y=value, colour=key, shape=key, linetype=key)) + - geom_point(size=3) + - geom_line(aes(linetype=key), size=1) + - ylim(0.,max(test_data$MLE, test_data$Lee, test_data$Naive, test_data$Liu) + 0.2)+ - labs(y="lengths: full", x = "Signal regimes: snr") + - scale_x_continuous(breaks=1:length(unique(df_inference$snr)), label = sapply(unique(df_inference$snr), toString))+ - theme(legend.position="top", legend.title = element_blank()) - - indices = sort(c("MLE", "Liu", "Lee", "Naive"), index.return= TRUE)$ix - names = c("MLE-based", "Lee", "Naive", "Liu") - lengths = lengths + scale_color_manual(labels = names[indices], values=c("#008B8B","#B22222", "#FF6347", "#104E8B")[indices]) + - scale_shape_manual(labels = names[indices], values=c(15, 17, 16, 15)[indices]) + - scale_linetype_manual(labels = names[indices], values = c(1,1,2,1)[indices])} - - prop = filter(df_inference, method == "Lee")$prop.infty - df = data.frame(snr = sapply(unique(df_inference$snr), toString), - infinite = 100*prop) - data.prop <- melt(df, id.vars='snr') - pL = ggplot(data.prop, aes(snr, value)) + - geom_bar(aes(fill = variable), width = 0.4, position = position_dodge(width=0.5), stat="identity") + - labs(y="infinite intervals (%)", x = "Signal regimes: snr") + - theme(legend.position="top", - legend.title = element_blank()) - pL = pL + scale_fill_manual(labels = c("Lee"), values=c("#B22222")) - prow <- plot_grid( pL + theme(legend.position="none"), - lengths + theme(legend.position="none"), - align = 'vh', - hjust = -1, - ncol = 1) - - legend <- get_legend(lengths+ theme(legend.direction = "horizontal",legend.justification="center" ,legend.box.just = "bottom")) - p <- plot_grid(prow, ncol=1, legend, rel_heights = c(2., .2)) - outfile = paste(outpath, 'length.png', sep="") - ggsave(outfile, plot = p, dpi=resolution, dev='png', height=height_plot2, width=width_plot2, units="cm")} - """) - - #pandas2ri.activate() - with localconverter(robjects.default_converter + 
pandas2ri.converter): - r_df_inference = robjects.conversion.py2rpy(df_inference) - R_plot = robjects.globalenv['plot_coverage_lengths'] - R_plot(r_df_inference) - -def comparison_cvmetrics_selected(n=500, - p=100, - nval=500, - rho=0.35, - s=5, - beta_type=1, - snr=0.20, - randomizer_scale=np.sqrt(0.50), - full_dispersion=True, - tuning_nonrand="lambda.min", - tuning_rand="lambda.1se"): - - (X, - y, - _, - _, - Sigma, - beta, - sigma) = sim_xy(n=n, - p=p, - nval=nval, - rho=rho, - s=s, - beta_type=beta_type, - snr=snr) - true_mean = X.dot(beta) - - X -= X.mean(0)[None, :] - X /= (X.std(0)[None, :] * np.sqrt(n / (n - 1))) - y = y - y.mean() - true_set = np.asarray([u for u in range(p) if beta[u] != 0]) - - if full_dispersion: - dispersion = np.linalg.norm(y - X.dot(np.linalg.pinv(X).dot(y))) ** 2 / (n - p) - sigma_ = np.sqrt(dispersion) - else: - dispersion = None - sigma_ = np.std(y) - print("estimated and true sigma", sigma, sigma_) - - lam_theory = sigma_ * 1. * np.mean(np.fabs(np.dot(X.T, - np.random.standard_normal((n, - 2000)))).max(0)) - (glm_LASSO_theory, - glm_LASSO_1se, - glm_LASSO_min, - lam_min, - lam_1se) = glmnet_lasso(X, - y, - lam_theory/float(n)) - - if tuning_nonrand == "lambda.min": - lam_LASSO = lam_min - glm_LASSO = glm_LASSO_min - elif tuning_nonrand == "lambda.1se": - lam_LASSO = lam_1se - glm_LASSO = glm_LASSO_1se - else: - lam_LASSO = lam_theory/float(n) - glm_LASSO = glm_LASSO_theory - - active_LASSO = (glm_LASSO != 0) - nactive_LASSO = active_LASSO.sum() - active_set_LASSO = np.asarray([r for r in range(p) if active_LASSO[r]]) - active_LASSO_bool = np.asarray([(np.in1d(active_set_LASSO[z], true_set).sum() > 0) for z in range(nactive_LASSO)], np.bool) - - rel_LASSO = np.zeros(p) - Lee_nreport = 0 - bias_Lee = 0. - bias_naive = 0. - - if nactive_LASSO > 0: - post_LASSO_OLS = np.linalg.pinv(X[:, active_LASSO]).dot(y) - rel_LASSO[active_LASSO] = post_LASSO_OLS - Lee_target = np.linalg.pinv(X[:, active_LASSO]).dot(X.dot(beta)) - try: - Lee_intervals, Lee_pval = selInf_R(X, y, glm_LASSO, n * lam_LASSO, sigma_, Type=0, alpha=0.1) - except: - Lee_intervals, Lee_pval = np.array([]), np.array([]) - - if (Lee_pval.shape[0] == Lee_target.shape[0]): - - cov_Lee, selective_Lee_power = coverage(Lee_intervals, Lee_pval, Lee_target, beta[active_LASSO]) - inf_entries_bool = np.isinf(Lee_intervals[:, 1] - Lee_intervals[:, 0]) - inf_entries = np.mean(inf_entries_bool) - if inf_entries == 1.: - length_Lee = 0. - else: - length_Lee = np.mean((Lee_intervals[:, 1] - Lee_intervals[:, 0])[~inf_entries_bool]) - power_Lee = ((active_LASSO_bool) * (np.logical_or((0. < Lee_intervals[:, 0]), (0. > Lee_intervals[:, 1])))) \ - .sum() / float((beta != 0).sum()) - Lee_discoveries = BHfilter(Lee_pval, q=0.1) - power_Lee_BH = (Lee_discoveries * active_LASSO_bool).sum() / float((beta != 0).sum()) - fdr_Lee_BH = (Lee_discoveries * ~active_LASSO_bool).sum() / float(max(Lee_discoveries.sum(), 1.)) - bias_Lee = np.mean(glm_LASSO[active_LASSO] - Lee_target) - - naive_sd = sigma_ * np.sqrt(np.diag((np.linalg.inv(X[:, active_LASSO].T.dot(X[:, active_LASSO]))))) - naive_intervals = np.vstack([post_LASSO_OLS - 1.65 * naive_sd, - post_LASSO_OLS + 1.65 * naive_sd]).T - naive_pval = 2 * ndist.cdf(np.abs(post_LASSO_OLS) / naive_sd) - cov_naive, selective_naive_power = coverage(naive_intervals, naive_pval, Lee_target, beta[active_LASSO]) - length_naive = np.mean(naive_intervals[:, 1] - naive_intervals[:, 0]) - power_naive = ((active_LASSO_bool) * ( - np.logical_or((0. < naive_intervals[:, 0]), (0. 
> naive_intervals[:, 1])))).sum() / float( - (beta != 0).sum()) - naive_discoveries = BHfilter(naive_pval, q=0.1) - power_naive_BH = (naive_discoveries * active_LASSO_bool).sum() / float((beta != 0).sum()) - fdr_naive_BH = (naive_discoveries * ~active_LASSO_bool).sum() / float(max(naive_discoveries.sum(), 1.)) - bias_naive = np.mean(rel_LASSO[active_LASSO] - Lee_target) - - partial_Lasso_risk = (glm_LASSO[active_LASSO]-Lee_target).T.dot(glm_LASSO[active_LASSO]-Lee_target) - partial_relLasso_risk = (post_LASSO_OLS - Lee_target).T.dot(post_LASSO_OLS - Lee_target) - - else: - Lee_nreport = 1 - cov_Lee, length_Lee, inf_entries, power_Lee, power_Lee_BH, fdr_Lee_BH, selective_Lee_power = [0., 0., 0., 0., 0., 0., 0.] - cov_naive, length_naive, power_naive, power_naive_BH, fdr_naive_BH, selective_naive_power = [0., 0., 0., 0., 0., 0.] - naive_discoveries = np.zeros(1) - Lee_discoveries = np.zeros(1) - partial_Lasso_risk, partial_relLasso_risk = [0., 0.] - elif nactive_LASSO == 0: - Lee_nreport = 1 - cov_Lee, length_Lee, inf_entries, power_Lee, power_Lee_BH, fdr_Lee_BH, selective_Lee_power = [0., 0., 0., 0., 0., 0., 0.] - cov_naive, length_naive, power_naive, power_naive_BH, fdr_naive_BH, selective_naive_power = [0., 0., 0., 0., 0., 0.] - naive_discoveries = np.zeros(1) - Lee_discoveries = np.zeros(1) - partial_Lasso_risk, partial_relLasso_risk = [0., 0.] - - if tuning_rand == "lambda.min": - randomized_lasso = lasso.gaussian(X, - y, - feature_weights=n * lam_min * np.ones(p), - randomizer_scale= np.sqrt(n) * randomizer_scale * sigma_) - elif tuning_rand == "lambda.1se": - randomized_lasso = lasso.gaussian(X, - y, - feature_weights=n * lam_1se * np.ones(p), - randomizer_scale= np.sqrt(n) * randomizer_scale * sigma_) - else: - randomized_lasso = lasso.gaussian(X, - y, - feature_weights= lam_theory * np.ones(p), - randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_) - signs = randomized_lasso.fit() - nonzero = signs != 0 - active_set_rand = np.asarray([t for t in range(p) if nonzero[t]]) - active_rand_bool = np.asarray([(np.in1d(active_set_rand[x], true_set).sum() > 0) for x in range(nonzero.sum())], np.bool) - sel_MLE = np.zeros(p) - ind_est = np.zeros(p) - randomized_lasso_est = np.zeros(p) - randomized_rel_lasso_est = np.zeros(p) - MLE_nreport = 0 - - if nonzero.sum() > 0: - target_randomized = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) - - (observed_target, - cov_target, - cov_target_score, - alternatives) = selected_targets(randomized_lasso.loglike, - randomized_lasso._W, - nonzero, - dispersion=dispersion) - - result = randomized_lasso.selective_MLE(observed_target, - cov_target, - cov_target_score)[0] - - MLE_estimate = result['MLE'] - ind_unbiased_estimator = result['unbiased'] - - sel_MLE[nonzero] = MLE_estimate - ind_est[nonzero] = ind_unbiased_estimator - MLE_intervals = np.asarray(result[['lower_confidence', 'upper_confidence']]) - MLE_pval = np.asarray(result['pvalue']) - - randomized_lasso_est = randomized_lasso.initial_soln - randomized_rel_lasso_est = randomized_lasso._beta_full - - cov_MLE, selective_MLE_power = coverage(MLE_intervals, MLE_pval, target_randomized, beta[nonzero]) - length_MLE = np.mean(MLE_intervals[:, 1] - MLE_intervals[:, 0]) - power_MLE = ((active_rand_bool) * ( - np.logical_or((0. < MLE_intervals[:, 0]), (0. 
> MLE_intervals[:, 1])))).sum() / float((beta != 0).sum()) - MLE_discoveries = BHfilter(MLE_pval, q=0.1) - power_MLE_BH = (MLE_discoveries * active_rand_bool).sum() / float((beta != 0).sum()) - fdr_MLE_BH = (MLE_discoveries * ~active_rand_bool).sum() / float(max(MLE_discoveries.sum(), 1.)) - bias_MLE = np.mean(MLE_estimate - target_randomized) - - partial_MLE_risk = (MLE_estimate - target_randomized).T.dot(MLE_estimate - target_randomized) - partial_ind_risk = (ind_unbiased_estimator - target_randomized).T.dot(ind_unbiased_estimator - target_randomized) - partial_randLasso_risk = (randomized_lasso_est[nonzero] - target_randomized).T.dot(randomized_lasso_est[nonzero] - target_randomized) - partial_relrandLasso_risk = (randomized_rel_lasso_est[nonzero] - target_randomized).T.dot(randomized_rel_lasso_est[nonzero] - target_randomized) - - else: - MLE_nreport = 1 - cov_MLE, length_MLE, power_MLE, power_MLE_BH, fdr_MLE_BH, bias_MLE, selective_MLE_power = [0., 0., 0., 0., 0., 0., 0.] - MLE_discoveries = np.zeros(1) - partial_MLE_risk, partial_ind_risk, partial_randLasso_risk, partial_relrandLasso_risk = [0., 0., 0., 0.] - - risks = np.vstack((relative_risk(sel_MLE, beta, Sigma), - relative_risk(ind_est, beta, Sigma), - relative_risk(randomized_lasso_est, beta, Sigma), - relative_risk(randomized_rel_lasso_est, beta, Sigma), - relative_risk(rel_LASSO, beta, Sigma), - relative_risk(glm_LASSO, beta, Sigma))) - - partial_risks = np.vstack((partial_MLE_risk, - partial_ind_risk, - partial_randLasso_risk, - partial_relrandLasso_risk, - partial_relLasso_risk, - partial_Lasso_risk)) - - naive_inf = np.vstack((cov_naive, length_naive, 0., nactive_LASSO, bias_naive, selective_naive_power, power_naive, power_naive_BH, fdr_naive_BH, - naive_discoveries.sum())) - Lee_inf = np.vstack((cov_Lee, length_Lee, inf_entries, nactive_LASSO, bias_Lee, selective_Lee_power, power_Lee, power_Lee_BH, fdr_Lee_BH, - Lee_discoveries.sum())) - Liu_inf = np.zeros((10, 1)) - MLE_inf = np.vstack((cov_MLE, length_MLE, 0., nonzero.sum(), bias_MLE, selective_MLE_power, power_MLE, power_MLE_BH, fdr_MLE_BH, - MLE_discoveries.sum())) - nreport = np.vstack((Lee_nreport, 0., MLE_nreport)) - - return np.vstack((risks, naive_inf, Lee_inf, Liu_inf, MLE_inf, partial_risks, nreport)) - - -def comparison_cvmetrics_full(n=500, p=100, nval=500, rho=0.35, s=5, beta_type=1, snr=0.20, - randomizer_scale=np.sqrt(0.25), full_dispersion=True, - tuning_nonrand="lambda.min", tuning_rand="lambda.1se"): - - X, y, _, _, Sigma, beta, sigma = sim_xy(n=n, p=p, nval=nval, rho=rho, s=s, beta_type=beta_type, snr=snr) - print("snr", snr) - X -= X.mean(0)[None, :] - X /= (X.std(0)[None, :] * np.sqrt(n / (n - 1.))) - y = y - y.mean() - true_set = np.asarray([u for u in range(p) if beta[u] != 0]) - - if full_dispersion: - dispersion = np.linalg.norm(y - X.dot(np.linalg.pinv(X).dot(y))) ** 2 / (n - p) - sigma_ = np.sqrt(dispersion) - else: - dispersion = None - sigma_ = np.std(y) - print("estimated and true sigma", sigma, sigma_) - - lam_theory = sigma_ * 1. 
* np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) - glm_LASSO_theory, glm_LASSO_1se, glm_LASSO_min, lam_min, lam_1se = glmnet_lasso(X, y, lam_theory/float(n)) - if tuning_nonrand == "lambda.min": - lam_LASSO = lam_min - glm_LASSO = glm_LASSO_min - elif tuning_nonrand == "lambda.1se": - lam_LASSO = lam_1se - glm_LASSO = glm_LASSO_1se - else: - lam_LASSO = lam_theory/float(n) - glm_LASSO = glm_LASSO_theory - - active_LASSO = (glm_LASSO != 0) - nactive_LASSO = active_LASSO.sum() - active_set_LASSO = np.asarray([r for r in range(p) if active_LASSO[r]]) - active_LASSO_bool = np.asarray([(np.in1d(active_set_LASSO[z], true_set).sum() > 0) for z in range(nactive_LASSO)], - np.bool) - - rel_LASSO = np.zeros(p) - Lee_nreport = 0 - bias_Lee = 0. - bias_naive = 0. - - if nactive_LASSO > 0: - rel_LASSO[active_LASSO] = np.linalg.pinv(X[:, active_LASSO]).dot(y) - Lee_target = beta[active_LASSO] - Lee_intervals, Lee_pval = selInf_R(X, y, glm_LASSO, n * lam_LASSO, sigma_, Type=1, alpha=0.1) - - if (Lee_pval.shape[0] == Lee_target.shape[0]): - - cov_Lee, selective_Lee_power = coverage(Lee_intervals, Lee_pval, Lee_target, beta[active_LASSO]) - inf_entries_bool = np.isinf(Lee_intervals[:, 1] - Lee_intervals[:, 0]) - inf_entries = np.mean(inf_entries_bool) - if inf_entries == 1.: - length_Lee = 0. - else: - length_Lee = np.mean((Lee_intervals[:, 1] - Lee_intervals[:, 0])[~inf_entries_bool]) - power_Lee = ((active_LASSO_bool) * ( - np.logical_or((0. < Lee_intervals[:, 0]), (0. > Lee_intervals[:, 1])))).sum() / float((beta != 0).sum()) - Lee_discoveries = BHfilter(Lee_pval, q=0.1) - power_Lee_BH = (Lee_discoveries * active_LASSO_bool).sum() / float((beta != 0).sum()) - fdr_Lee_BH = (Lee_discoveries * ~active_LASSO_bool).sum() / float(max(Lee_discoveries.sum(), 1.)) - bias_Lee = np.mean(glm_LASSO[active_LASSO] - Lee_target) - - post_LASSO_OLS = np.linalg.pinv(X[:, active_LASSO]).dot(y) - naive_sd = sigma_ * np.sqrt(np.diag((np.linalg.inv(X[:, active_LASSO].T.dot(X[:, active_LASSO]))))) - naive_intervals = np.vstack([post_LASSO_OLS - 1.65 * naive_sd, - post_LASSO_OLS + 1.65 * naive_sd]).T - naive_pval = 2 * ndist.cdf(np.abs(post_LASSO_OLS) / naive_sd) - cov_naive, selective_naive_power = coverage(naive_intervals, naive_pval, Lee_target, beta[active_LASSO]) - length_naive = np.mean(naive_intervals[:, 1] - naive_intervals[:, 0]) - power_naive = ((active_LASSO_bool) * ( - np.logical_or((0. < naive_intervals[:, 0]), (0. > naive_intervals[:, 1])))).sum() / float( - (beta != 0).sum()) - naive_discoveries = BHfilter(naive_pval, q=0.1) - power_naive_BH = (naive_discoveries * active_LASSO_bool).sum() / float((beta != 0).sum()) - fdr_naive_BH = (naive_discoveries * ~active_LASSO_bool).sum() / float(max(naive_discoveries.sum(), 1.)) - bias_naive = np.mean(rel_LASSO[active_LASSO] - Lee_target) - - partial_Lasso_risk = (glm_LASSO[active_LASSO] - Lee_target).T.dot(glm_LASSO[active_LASSO] - Lee_target) - partial_relLasso_risk = (post_LASSO_OLS - Lee_target).T.dot(post_LASSO_OLS - Lee_target) - else: - Lee_nreport = 1 - cov_Lee, length_Lee, inf_entries, power_Lee, power_Lee_BH, fdr_Lee_BH, selective_Lee_power = [0., 0., 0., 0., 0., 0., 0.] - cov_naive, length_naive, power_naive, power_naive_BH, fdr_naive_BH, selective_naive_power = [0., 0., 0., 0., 0., 0.] - naive_discoveries = np.zeros(1) - Lee_discoveries = np.zeros(1) - partial_Lasso_risk, partial_relLasso_risk = [0., 0.] 
- - elif nactive_LASSO == 0: - Lee_nreport = 1 - cov_Lee, length_Lee, inf_entries, power_Lee, power_Lee_BH, fdr_Lee_BH, selective_Lee_power = [0., 0., 0., 0., 0., 0., 0.] - cov_naive, length_naive, power_naive, power_naive_BH, fdr_naive_BH, selective_naive_power = [0., 0., 0., 0., 0., 0.] - naive_discoveries = np.zeros(1) - Lee_discoveries = np.zeros(1) - partial_Lasso_risk, partial_relLasso_risk = [0., 0.] - - lasso_Liu = ROSI.gaussian(X, y, n * lam_LASSO) - print(type(lasso_Liu)) - Lasso_soln_Liu = lasso_Liu.fit() - active_set_Liu = np.nonzero(Lasso_soln_Liu != 0)[0] - nactive_Liu = active_set_Liu.shape[0] - active_Liu_bool = np.asarray([(np.in1d(active_set_Liu[a], true_set).sum() > 0) for a in range(nactive_Liu)], np.bool) - Liu_nreport = 0 - - if nactive_Liu > 0: - Liu_target = beta[Lasso_soln_Liu != 0] - df = lasso_Liu.summary(level=0.90, compute_intervals=True, dispersion=dispersion) - Liu_lower, Liu_upper, Liu_pval = np.asarray(df['lower_confidence']), \ - np.asarray(df['upper_confidence']), \ - np.asarray(df['pvalue']) - Liu_intervals = np.vstack((Liu_lower, Liu_upper)).T - cov_Liu, selective_Liu_power = coverage(Liu_intervals, Liu_pval, Liu_target, beta[Lasso_soln_Liu != 0]) - length_Liu = np.mean(Liu_intervals[:, 1] - Liu_intervals[:, 0]) - power_Liu = ((active_Liu_bool) * (np.logical_or((0. < Liu_intervals[:, 0]), - (0. > Liu_intervals[:, 1])))).sum() / float((beta != 0).sum()) - Liu_discoveries = BHfilter(Liu_pval, q=0.1) - power_Liu_BH = (Liu_discoveries * active_Liu_bool).sum() / float((beta != 0).sum()) - fdr_Liu_BH = (Liu_discoveries * ~active_Liu_bool).sum() / float(max(Liu_discoveries.sum(), 1.)) - - else: - Liu_nreport = 1 - cov_Liu, length_Liu, power_Liu, power_Liu_BH, fdr_Liu_BH, selective_Liu_power = [0., 0., 0., 0., 0., 0.] - Liu_discoveries = np.zeros(1) - - if tuning_rand == "lambda.min": - randomized_lasso = lasso.gaussian(X, - y, - feature_weights= n * lam_min * np.ones(p), - randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_) - elif tuning_rand == "lambda.1se": - randomized_lasso = lasso.gaussian(X, - y, - feature_weights= n * lam_1se * np.ones(p), - randomizer_scale= np.sqrt(n) * randomizer_scale * sigma_) - else: - randomized_lasso = lasso.gaussian(X, - y, - feature_weights= lam_theory * np.ones(p), - randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_) - signs = randomized_lasso.fit() - nonzero = signs != 0 - active_set_rand = np.asarray([t for t in range(p) if nonzero[t]]) - active_rand_bool = np.asarray([(np.in1d(active_set_rand[x], true_set).sum() > 0) for x in range(nonzero.sum())], np.bool) - sel_MLE = np.zeros(p) - ind_est = np.zeros(p) - randomized_lasso_est = np.zeros(p) - randomized_rel_lasso_est = np.zeros(p) - MLE_nreport = 0 - - if nonzero.sum() > 0: - target_randomized = beta[nonzero] - (observed_target, - cov_target, - cov_target_score, - alternatives) = full_targets(randomized_lasso.loglike, - randomized_lasso._W, - nonzero, - dispersion=dispersion) - - result = randomized_lasso.selective_MLE(observed_target, - cov_target, - cov_target_score)[0] - - MLE_estimate = result['MLE'] - ind_unbiased_estimator = result['unbiased'] - - sel_MLE[nonzero] = MLE_estimate - ind_est[nonzero] = ind_unbiased_estimator - MLE_intervals = np.asarray(result[['lower_confidence', 'upper_confidence']]) - MLE_pval = np.asarray(result['pvalue']) - - randomized_lasso_est = randomized_lasso.initial_soln - randomized_rel_lasso_est = randomized_lasso._beta_full - - cov_MLE, selective_MLE_power = coverage(MLE_intervals, MLE_pval, target_randomized, beta[nonzero]) - 
length_MLE = np.mean(MLE_intervals[:, 1] - MLE_intervals[:, 0]) - power_MLE = ((active_rand_bool) * (np.logical_or((0. < MLE_intervals[:, 0]), (0. > MLE_intervals[:, 1])))).sum() / float((beta != 0).sum()) - MLE_discoveries = BHfilter(MLE_pval, q=0.1) - power_MLE_BH = (MLE_discoveries * active_rand_bool).sum() / float((beta != 0).sum()) - fdr_MLE_BH = (MLE_discoveries * ~active_rand_bool).sum() / float(max(MLE_discoveries.sum(), 1.)) - bias_MLE = np.mean(MLE_estimate - target_randomized) - - partial_MLE_risk = (MLE_estimate - target_randomized).T.dot(MLE_estimate - target_randomized) - partial_ind_risk = (ind_unbiased_estimator - target_randomized).T.dot(ind_unbiased_estimator - target_randomized) - partial_randLasso_risk = (randomized_lasso_est[nonzero] - target_randomized).T.dot(randomized_lasso_est[nonzero] - target_randomized) - partial_relrandLasso_risk = (randomized_rel_lasso_est[nonzero] - target_randomized).T.dot(randomized_rel_lasso_est[nonzero] - target_randomized) - else: - MLE_nreport = 1 - cov_MLE, length_MLE, power_MLE, power_MLE_BH, fdr_MLE_BH, bias_MLE, selective_MLE_power = [0., 0., 0., 0., 0., 0., 0.] - MLE_discoveries = np.zeros(1) - partial_MLE_risk, partial_ind_risk, partial_randLasso_risk, partial_relrandLasso_risk = [0., 0., 0., 0.] - - risks = np.vstack((relative_risk(sel_MLE, beta, Sigma), - relative_risk(ind_est, beta, Sigma), - relative_risk(randomized_lasso_est, beta, Sigma), - relative_risk(randomized_rel_lasso_est, beta, Sigma), - relative_risk(rel_LASSO, beta, Sigma), - relative_risk(glm_LASSO, beta, Sigma))) - - partial_risks = np.vstack((partial_MLE_risk, - partial_ind_risk, - partial_randLasso_risk, - partial_relrandLasso_risk, - partial_relLasso_risk, - partial_Lasso_risk)) - - naive_inf = np.vstack((cov_naive, length_naive, 0., nactive_LASSO, bias_naive, selective_naive_power, - power_naive, power_naive_BH, fdr_naive_BH, naive_discoveries.sum())) - Lee_inf = np.vstack((cov_Lee, length_Lee, inf_entries, nactive_LASSO, bias_Lee, selective_Lee_power, - power_Lee, power_Lee_BH, fdr_Lee_BH, Lee_discoveries.sum())) - Liu_inf = np.vstack((cov_Liu, length_Liu, 0., nactive_Liu, bias_Lee, selective_Liu_power, - power_Liu, power_Liu_BH, fdr_Liu_BH, Liu_discoveries.sum())) - MLE_inf = np.vstack((cov_MLE, length_MLE, 0., nonzero.sum(), bias_MLE, selective_MLE_power, - power_MLE, power_MLE_BH, fdr_MLE_BH, MLE_discoveries.sum())) - nreport = np.vstack((Lee_nreport, Liu_nreport, MLE_nreport)) - - return np.vstack((risks, naive_inf, Lee_inf, Liu_inf, MLE_inf, partial_risks, nreport)) - - - -def main(n=500, p=100, rho=0.35, s=5, beta_type=1, snr_values=np.array([0.15, 0.20, 0.31]), - target="selected", tuning_nonrand="lambda.1se", tuning_rand="lambda.1se", - randomizing_scale = np.sqrt(0.50), ndraw=20, outpath = None, plot=True): - - df_selective_inference = pd.DataFrame() - df_risk = pd.DataFrame() - - if n > p: - full_dispersion = True - else: - full_dispersion = False - - snr_list = [] - snr_list_0 = [] - for snr in snr_values: - snr_list.append(snr*np.ones(4)) - snr_list_0.append(snr*np.ones(2)) - output_overall = np.zeros(55) - if target == "selected": - for i in range(ndraw): - output_overall += np.squeeze(comparison_cvmetrics_selected(n=n, - p=p, - nval=n, - rho=rho, - s=s, - beta_type=beta_type, - snr=snr, - randomizer_scale=randomizing_scale, - full_dispersion=full_dispersion, - tuning_nonrand =tuning_nonrand, - tuning_rand=tuning_rand)) - elif target == "full": - for i in range(ndraw): - output_overall += np.squeeze(comparison_cvmetrics_full(n=n, - p=p, - 
nval=n, - rho=rho, - s=s, - beta_type=beta_type, - snr=snr, - randomizer_scale=randomizing_scale, - full_dispersion=full_dispersion, - tuning_nonrand =tuning_nonrand, - tuning_rand=tuning_rand)) - - nLee = output_overall[52] - nLiu = output_overall[53] - nMLE = output_overall[54] - - relative_risk = (output_overall[0:6] / float(ndraw)).reshape((1, 6)) - partial_risk = np.hstack(((output_overall[46:50] / float(ndraw-nMLE)).reshape((1, 4)), - (output_overall[50:52] / float(ndraw - nLee)).reshape((1, 2)))) - - nonrandomized_naive_inf = np.hstack(((output_overall[6:12] / float(ndraw - nLee)).reshape((1, 6)), - (output_overall[12:16] / float(ndraw)).reshape((1, 4)))) - nonrandomized_Lee_inf = np.hstack(((output_overall[16:22] / float(ndraw - nLee)).reshape((1, 6)), - (output_overall[22:26] / float(ndraw)).reshape((1, 4)))) - nonrandomized_Liu_inf = np.hstack(((output_overall[26:32] / float(ndraw - nLiu)).reshape((1, 6)), - (output_overall[32:36] / float(ndraw)).reshape((1, 4)))) - randomized_MLE_inf = np.hstack(((output_overall[36:42] / float(ndraw - nMLE)).reshape((1, 6)), - (output_overall[42:46] / float(ndraw)).reshape((1, 4)))) - - if target=="selected": - nonrandomized_Liu_inf[nonrandomized_Liu_inf==0] = 'NaN' - if target == "debiased": - nonrandomized_Liu_inf[nonrandomized_Liu_inf == 0] = 'NaN' - nonrandomized_Lee_inf[nonrandomized_Lee_inf == 0] = 'NaN' - - df_naive = pd.DataFrame(data=nonrandomized_naive_inf,columns=['coverage', - 'length', - 'prop-infty', - 'tot-active', - 'bias', - 'sel-power', - 'power', - 'power-BH', - 'fdr-BH', - 'tot-discoveries']) - df_naive['method'] = "Naive" - df_Lee = pd.DataFrame(data=nonrandomized_Lee_inf, columns=['coverage', - 'length', - 'prop-infty', - 'tot-active', - 'bias', - 'sel-power', - 'power', - 'power-BH', - 'fdr-BH', - 'tot-discoveries']) - df_Lee['method'] = "Lee" - - df_Liu = pd.DataFrame(data=nonrandomized_Liu_inf,columns=['coverage', - 'length', - 'prop-infty', - 'tot-active', - 'bias', - 'sel-power', - 'power', - 'power-BH', - 'fdr-BH', - 'tot-discoveries']) - df_Liu['method'] = "Liu" - - df_MLE = pd.DataFrame(data=randomized_MLE_inf, columns=['coverage', - 'length', - 'prop-infty', - 'tot-active', - 'bias', - 'sel-power', - 'power', - 'power-BH', - 'fdr-BH', - 'tot-discoveries']) - df_MLE['method'] = "MLE" - - df_risk_metrics = pd.DataFrame(data=relative_risk, columns=['sel-MLE', - 'ind-est', - 'rand-LASSO', - 'rel-rand-LASSO', - 'rel-LASSO', - 'LASSO']) - df_risk_metrics['metric'] = "Full" - df_prisk_metrics = pd.DataFrame(data=partial_risk,columns=['sel-MLE', - 'ind-est', - 'rand-LASSO', - 'rel-rand-LASSO', - 'rel-LASSO', - 'LASSO']) - df_prisk_metrics['metric'] = "Partial" - - df_selective_inference = df_selective_inference.append(df_naive, ignore_index=True) - df_selective_inference = df_selective_inference.append(df_Lee, ignore_index=True) - df_selective_inference = df_selective_inference.append(df_Liu, ignore_index=True) - df_selective_inference = df_selective_inference.append(df_MLE, ignore_index=True) - - df_risk = df_risk.append(df_risk_metrics, ignore_index=True) - df_risk = df_risk.append(df_prisk_metrics, ignore_index=True) - - snr_list = list(itertools.chain.from_iterable(snr_list)) - df_selective_inference['n'] = n - df_selective_inference['p'] = p - df_selective_inference['s'] = s - df_selective_inference['rho'] = rho - df_selective_inference['beta-type'] = beta_type - df_selective_inference['snr'] = pd.Series(np.asarray(snr_list)) - df_selective_inference['target'] = target - - snr_list_0 = 
list(itertools.chain.from_iterable(snr_list_0)) - df_risk['n'] = n - df_risk['p'] = p - df_risk['s'] = s - df_risk['rho'] = rho - df_risk['beta-type'] = beta_type - df_risk['snr'] = pd.Series(np.asarray(snr_list_0)) - df_risk['target'] = target - - if outpath is None: - outpath = os.path.dirname(__file__) - - outfile_inf_csv = os.path.join(outpath, "dims_" + str(n) + "_" + str(p) + "_inference_betatype" + str(beta_type) + target + "_rho_" + str(rho) + ".csv") - outfile_risk_csv = os.path.join(outpath, "dims_" + str(n) + "_" + str(p) + "_risk_betatype" + str(beta_type) + target + "_rho_" + str(rho) + ".csv") - outfile_inf_html = os.path.join(outpath, "dims_" + str(n) + "_" + str(p) + "_inference_betatype" + str(beta_type) + target + "_rho_" + str(rho) + ".html") - outfile_risk_html = os.path.join(outpath, "dims_" + str(n) + "_" + str(p) + "_risk_betatype" + str(beta_type) + target + "_rho_" + str(rho) + ".html") - df_selective_inference.to_csv(outfile_inf_csv, index=False) - df_risk.to_csv(outfile_risk_csv, index=False) - df_selective_inference.to_html(outfile_inf_html) - df_risk.to_html(outfile_risk_html) - - if plot is True: - plotRisk(df_risk) - plotCoveragePower(df_selective_inference) - - -if __name__ == "__main__": - main() - From 7835ac8ae1393b2dedd18dfaeef57d7545f74684 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 24 Jun 2020 23:15:50 -0700 Subject: [PATCH 050/187] fixinf change of pval to pvalue --- selectinf/algorithms/tests/test_ROSI.py | 2 +- selectinf/algorithms/tests/test_compareR.py | 30 ++++++++++----------- selectinf/algorithms/tests/test_lasso.py | 18 ++++++------- 3 files changed, 25 insertions(+), 25 deletions(-) diff --git a/selectinf/algorithms/tests/test_ROSI.py b/selectinf/algorithms/tests/test_ROSI.py index 886648221..9629de691 100644 --- a/selectinf/algorithms/tests/test_ROSI.py +++ b/selectinf/algorithms/tests/test_ROSI.py @@ -103,7 +103,7 @@ def test_modelQ(): LX.fit() SX = LX.summary(dispersion=1) - np.testing.assert_allclose(S['pval'], SX['pval'], rtol=1.e-5, atol=1.e-4) + np.testing.assert_allclose(S['pvalue'], SX['pvalue'], rtol=1.e-5, atol=1.e-4) diff --git a/selectinf/algorithms/tests/test_compareR.py b/selectinf/algorithms/tests/test_compareR.py index 3727fe548..58ac797cb 100644 --- a/selectinf/algorithms/tests/test_compareR.py +++ b/selectinf/algorithms/tests/test_compareR.py @@ -87,7 +87,7 @@ def test_fixed_lambda(): yield np.testing.assert_allclose, L.fit()[1:], beta_hat, 1.e-2, 1.e-2, False, 'fixed lambda, sigma=%f coef' % s yield np.testing.assert_equal, L.active, selected_vars - yield np.testing.assert_allclose, S['pval'], R_pvals, tol, tol, False, 'fixed lambda, sigma=%f pval' % s + yield np.testing.assert_allclose, S['pvalue'], R_pvals, tol, tol, False, 'fixed lambda, sigma=%f pval' % s yield np.testing.assert_allclose, S['sd'], sdvar, tol, tol, False, 'fixed lambda, sigma=%f sd ' % s yield np.testing.assert_allclose, S['onestep'], coef, tol, tol, False, 'fixed lambda, sigma=%f estimator' % s @@ -252,7 +252,7 @@ def test_coxph(): yield np.testing.assert_equal, np.array(L.active) + 1, selected_vars yield np.testing.assert_allclose, beta2, beta_hat, tol, tol, False, 'cox coeff' - yield np.testing.assert_allclose, L.summary('onesided')['pval'], R_pvals, tol, tol, False, 'cox pvalues' + yield np.testing.assert_allclose, L.summary('onesided')['pvalue'], R_pvals, tol, tol, False, 'cox pvalues' @np.testing.dec.skipif(not rpy2_available, msg="rpy2 not available, skipping test") def test_logistic(): @@ -311,7 +311,7 @@ def test_logistic(): yield 
np.testing.assert_equal, L.active[1:], selected_vars yield np.testing.assert_allclose, beta2, beta_hat, tol, tol, False, 'logistic coef' - yield np.testing.assert_allclose, L.summary('onesided')['pval'][1:], R_pvals, tol, tol, False, 'logistic pvalues' + yield np.testing.assert_allclose, L.summary('onesided')['pvalue'][1:], R_pvals, tol, tol, False, 'logistic pvalues' @np.testing.dec.skipif(not rpy2_available, msg="rpy2 not available, skipping test") @@ -554,8 +554,8 @@ def test_liu_gaussian(): active_set = rpy.r('active_vars') print(pvalues) - print(S['pval']) - nt.assert_true(np.corrcoef(pvalues, S['pval'])[0,1] > 0.999) + print(S['pvalue']) + nt.assert_true(np.corrcoef(pvalues, S['pvalue'])[0,1] > 0.999) numpy2ri.deactivate() break @@ -610,8 +610,8 @@ def test_liu_logistic(): pvalues = pvalues[~np.isnan(pvalues)] active_set = rpy.r('active_vars') print(pvalues) - print(S['pval']) - nt.assert_true(np.corrcoef(pvalues, S['pval'])[0,1] > 0.999) + print(S['pvalue']) + nt.assert_true(np.corrcoef(pvalues, S['pvalue'])[0,1] > 0.999) numpy2ri.deactivate() break @@ -669,9 +669,9 @@ def test_ROSI_gaussian_JM(): active_set = rpy.r('active_vars') print(pvalues) - print(np.asarray(S['pval'])) + print(np.asarray(S['pvalue'])) - nt.assert_true(np.corrcoef(pvalues, S['pval'])[0,1] > 0.999) + nt.assert_true(np.corrcoef(pvalues, S['pvalue'])[0,1] > 0.999) numpy2ri.deactivate() break @@ -724,9 +724,9 @@ def test_ROSI_logistic_JM(): active_set = rpy.r('active_vars') print(pvalues) - print(np.asarray(S['pval'])) + print(np.asarray(S['pvalue'])) - nt.assert_true(np.corrcoef(pvalues, S['pval'])[0,1] > 0.999) + nt.assert_true(np.corrcoef(pvalues, S['pvalue'])[0,1] > 0.999) numpy2ri.deactivate() break @@ -790,9 +790,9 @@ def test_ROSI_gaussian_BN(): active_set = rpy.r('active_vars') print(pvalues) - print(np.asarray(S['pval'])) + print(np.asarray(S['pvalue'])) - nt.assert_true(np.corrcoef(pvalues, S['pval'])[0,1] > 0.999) + nt.assert_true(np.corrcoef(pvalues, S['pvalue'])[0,1] > 0.999) numpy2ri.deactivate() break @@ -846,9 +846,9 @@ def test_ROSI_logistic_BN(): active_set = rpy.r('active_vars') print(pvalues) - print(np.asarray(S['pval'])) + print(np.asarray(S['pvalue'])) - nt.assert_true(np.corrcoef(pvalues, S['pval'])[0,1] > 0.999) + nt.assert_true(np.corrcoef(pvalues, S['pvalue'])[0,1] > 0.999) numpy2ri.deactivate() break diff --git a/selectinf/algorithms/tests/test_lasso.py b/selectinf/algorithms/tests/test_lasso.py index 3b1a3186e..a64bd869d 100644 --- a/selectinf/algorithms/tests/test_lasso.py +++ b/selectinf/algorithms/tests/test_lasso.py @@ -115,7 +115,7 @@ def test_logistic(): np.dot(L.constraints.linear_part, L.onestep_estimator), L.constraints.offset) - P = L.summary()['pval'] + P = L.summary()['pvalue'] return L, C, P @@ -137,7 +137,7 @@ def test_poisson(): np.dot(L.constraints.linear_part, L.onestep_estimator), L.constraints.offset) - P = L.summary()['pval'] + P = L.summary()['pvalue'] return L, C, P @@ -162,7 +162,7 @@ def test_coxph(): np.dot(L.constraints.linear_part, L.onestep_estimator), L.constraints.offset) - P = L.summary()['pval'] + P = L.summary()['pvalue'] return L, C, P @@ -543,7 +543,7 @@ def test_gaussian_pvals(n=100, if set(true_active).issubset(L.active): S = L.summary('onesided') S = L.summary('twosided') - return S['pval'], [v in true_active for v in S['variable']] + return S['pvalue'], [v in true_active for v in S['variable']] @wait_for_return_value() def test_sqrt_lasso_pvals(n=100, @@ -572,7 +572,7 @@ def test_sqrt_lasso_pvals(n=100, if set(true_active).issubset(L.active): S = 
L.summary('onesided') S = L.summary('twosided') - return S['pval'], [v in true_active for v in S['variable']] + return S['pvalue'], [v in true_active for v in S['variable']] @wait_for_return_value() @@ -604,7 +604,7 @@ def test_sqrt_lasso_sandwich_pvals(n=200, if set(true_active).issubset(L_SQ.active): S = L_SQ.summary('twosided') - return S['pval'], [v in true_active for v in S['variable']] + return S['pvalue'], [v in true_active for v in S['variable']] @wait_for_return_value() def test_gaussian_sandwich_pvals(n=200, @@ -654,13 +654,13 @@ def test_gaussian_sandwich_pvals(n=200, if set(true_active).issubset(L_P.active): S = L_P.summary('twosided') - P_P = [p for p, v in zip(S['pval'], S['variable']) if v not in true_active] + P_P = [p for p, v in zip(S['pvalue'], S['variable']) if v not in true_active] L_S = lasso.gaussian(X, y, feature_weights, covariance_estimator=sandwich) L_S.fit() S = L_S.summary('twosided') - P_S = [p for p, v in zip(S['pval'], S['variable']) if v not in true_active] + P_S = [p for p, v in zip(S['pvalue'], S['variable']) if v not in true_active] return P_P, P_S, [v in true_active for v in S['variable']] @@ -693,7 +693,7 @@ def test_logistic_pvals(n=500, print(true_active, L.active) if set(true_active).issubset(L.active): - return S['pval'], [v in true_active for v in S['variable']] + return S['pvalue'], [v in true_active for v in S['variable']] @set_seed_iftrue(True) def test_adding_quadratic_lasso(): From 408f26efb020fcf2df0afac2f6d158e0582bc06d Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 24 Jun 2020 23:21:45 -0700 Subject: [PATCH 051/187] change of lower/upper to lower_confidence/upper_confidence --- selectinf/randomized/tests/test_BH.py | 10 ++++++---- selectinf/randomized/tests/test_drop_losers.py | 12 ++++++------ .../randomized/tests/test_marginal_screening.py | 4 ++-- .../randomized/tests/test_selective_MLE_high.py | 6 +++--- selectinf/randomized/tests/test_slope.py | 4 ++-- selectinf/randomized/tests/test_topK.py | 6 +++--- 6 files changed, 22 insertions(+), 20 deletions(-) diff --git a/selectinf/randomized/tests/test_BH.py b/selectinf/randomized/tests/test_BH.py index e581c6350..34c26ac5f 100644 --- a/selectinf/randomized/tests/test_BH.py +++ b/selectinf/randomized/tests/test_BH.py @@ -158,13 +158,15 @@ def test_BH(n=500, parameter=beta_target) pivots = np.asarray(result['pivot']) pval = np.asarray(result['pvalue']) - lower = np.asarray(result['lower']) - upper = np.asarray(result['upper']) + lower = np.asarray(result['lower_confidence']) + upper = np.asarray(result['upper_confidence']) print(pval) - print("beta_target and intervals", beta_target, result[['lower', 'upper']]) + print("beta_target and intervals", beta_target, result[['lower_confidence', + 'upper_confidence']]) coverage = (beta_target > lower) * (beta_target < upper) print("coverage for selected target", coverage.sum()/float(nonzero.sum())) - return pivots[beta_target == 0], pivots[beta_target != 0], coverage, result[['lower', 'upper']], pivots + return (pivots[beta_target == 0], pivots[beta_target != 0], coverage, + result[['lower_confidence', 'upper_confidence']], pivots) else: return [], [], [], [], [] diff --git a/selectinf/randomized/tests/test_drop_losers.py b/selectinf/randomized/tests/test_drop_losers.py index 6322f5e66..46f4b8395 100644 --- a/selectinf/randomized/tests/test_drop_losers.py +++ b/selectinf/randomized/tests/test_drop_losers.py @@ -45,8 +45,8 @@ def test_drop_losers(p=50, else: result = dtl.selective_MLE()[0] pvalue = np.asarray(result['pvalue']) - lower = 
np.asarray(result['lower']) - upper = np.asarray(result['upper']) + lower = np.asarray(result['lower_confidence']) + upper = np.asarray(result['upper_confidence']) cover = (lower < 0) * (upper > 0) return pvalue, cover @@ -155,8 +155,8 @@ def test_compare_topK(p=20, np.testing.assert_allclose(summary1['pvalue'], summary2['pvalue'], rtol=1.e-3) np.testing.assert_allclose(summary1['target'], summary2['target'], rtol=1.e-3) - np.testing.assert_allclose(summary1['lower'], summary2['lower'], rtol=1.e-3) - np.testing.assert_allclose(summary1['upper'], summary2['upper'], rtol=1.e-3) + np.testing.assert_allclose(summary1['lower_confidence'], summary2['lower_confidence'], rtol=1.e-3) + np.testing.assert_allclose(summary1['upper_confidence'], summary2['upper_confidence'], rtol=1.e-3) np.random.seed(0) degenerate_topK.fit(perturb=perturb2) @@ -170,8 +170,8 @@ def test_compare_topK(p=20, np.testing.assert_allclose(summary1['pvalue'], summary3['pvalue'], rtol=1.e-3) np.testing.assert_allclose(summary1['target'], summary3['target'], rtol=1.e-3) - np.testing.assert_allclose(summary1['lower'], summary3['lower'], rtol=1.e-3) - np.testing.assert_allclose(summary1['upper'], summary3['upper'], rtol=1.e-3) + np.testing.assert_allclose(summary1['lower_confidence'], summary3['lower_confidence'], rtol=1.e-3) + np.testing.assert_allclose(summary1['upper_confidence'], summary3['upper_confidence'], rtol=1.e-3) def main(nsim=100, use_MLE=True): diff --git a/selectinf/randomized/tests/test_marginal_screening.py b/selectinf/randomized/tests/test_marginal_screening.py index e416cdade..6db0fbdf2 100644 --- a/selectinf/randomized/tests/test_marginal_screening.py +++ b/selectinf/randomized/tests/test_marginal_screening.py @@ -68,7 +68,7 @@ def test_marginal(n=500, alternatives, compute_intervals=True) - intervals = np.asarray(result[['lower', 'upper']]) + intervals = np.asarray(result[['lower_confidence', 'upper_confidence']]) pval = result['pvalue'] print(pval) if marginal: @@ -152,7 +152,7 @@ def test_simple(n=100, compute_intervals=True) pval = result['pvalue'] - intervals = np.asarray(result[['lower', 'upper']]) + intervals = np.asarray(result[['lower_confidence', 'upper_confidence']]) print(pval) beta_target = cov_target.dot(true_mean[nonzero]) print("beta_target and intervals", beta_target, intervals) diff --git a/selectinf/randomized/tests/test_selective_MLE_high.py b/selectinf/randomized/tests/test_selective_MLE_high.py index 01df0630e..578ae66ec 100644 --- a/selectinf/randomized/tests/test_selective_MLE_high.py +++ b/selectinf/randomized/tests/test_selective_MLE_high.py @@ -74,7 +74,7 @@ def test_full_targets(n=200, cov_target_score)[0] pval = result['pvalue'] estimate = result['MLE'] - intervals = np.asarray(result[['lower', 'upper']]) + intervals = np.asarray(result[['lower_confidence', 'upper_confidence']]) print("estimate, intervals", estimate, intervals) coverage = (beta[nonzero] > intervals[:, 0]) * (beta[nonzero] < intervals[:, 1]) @@ -142,7 +142,7 @@ def test_selected_targets(n=2000, cov_target_score)[0] estimate = result['MLE'] pval = result['pvalue'] - intervals = np.asarray(result[['lower', 'upper']]) + intervals = np.asarray(result[['lower_confidence', 'upper_confidence']]) beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) @@ -210,7 +210,7 @@ def test_instance(): cov_target_score)[0] estimate = result['MLE'] pval = result['pvalue'] - intervals = np.asarray(result[['lower', 'upper']]) + intervals = np.asarray(result[['lower_confidence', 'upper_confidence']]) beta_target = np.linalg.pinv(X[:, 
M]).dot(X.dot(beta)) diff --git a/selectinf/randomized/tests/test_slope.py b/selectinf/randomized/tests/test_slope.py index 5c31a848f..bc3a475a7 100644 --- a/selectinf/randomized/tests/test_slope.py +++ b/selectinf/randomized/tests/test_slope.py @@ -183,8 +183,8 @@ def test_randomized_slope(n=2000, compute_intervals=True, ndraw=150000) pval = np.asarray(result['pvalue']) - lower = np.asarray(result['lower']) - upper = np.asarray(result['upper']) + lower = np.asarray(result['lower_confidence']) + upper = np.asarray(result['upper_confidence']) print(pd.DataFrame({'target':beta_target, 'lower':lower, diff --git a/selectinf/randomized/tests/test_topK.py b/selectinf/randomized/tests/test_topK.py index 83c7a6ac0..000c45aba 100644 --- a/selectinf/randomized/tests/test_topK.py +++ b/selectinf/randomized/tests/test_topK.py @@ -67,10 +67,10 @@ def test_topK(n=500, crosscov_target_score, alternatives, compute_intervals=True) - lower = np.asarray(result['lower']) - upper = np.asarray(result['upper']) + lower = np.asarray(result['lower_confidence']) + upper = np.asarray(result['upper_confidence']) pval = result['pvalue'] - intervals = np.asarray(result[['lower', 'upper']]) + intervals = np.asarray(result[['lower_confidence', 'upper_confidence']]) print(pval) if marginal: beta_target = true_mean[nonzero] From 915904378afdb93de6e2460b9cfe77d754341bc6 Mon Sep 17 00:00:00 2001 From: snigdhagit Date: Thu, 25 Jun 2020 13:44:49 -0400 Subject: [PATCH 052/187] commit changes to test_mle --- doc/adjusted_MLE/tests/comparison_metrics.py | 20 ++++++++++++++++++++ selectinf/randomized/tests/test_lasso.py | 2 ++ 2 files changed, 22 insertions(+) diff --git a/doc/adjusted_MLE/tests/comparison_metrics.py b/doc/adjusted_MLE/tests/comparison_metrics.py index c902ec879..15a003d0e 100644 --- a/doc/adjusted_MLE/tests/comparison_metrics.py +++ b/doc/adjusted_MLE/tests/comparison_metrics.py @@ -1,6 +1,26 @@ +<<<<<<< HEAD:doc/adjusted_MLE/tests/comparison_metrics.py from __future__ import division, print_function import numpy as np, sys, time from scipy.stats import norm as ndist +======= +import numpy as np, os, itertools +import pandas as pd + +import rpy2.robjects as rpy +from rpy2.robjects import numpy2ri +rpy.numpy2ri.activate() +from scipy.stats import norm as ndist + +from ..lasso import lasso, full_targets, selected_targets, debiased_targets +from ...algorithms.lasso import lasso as lasso_full + +def sim_xy(n, p, nval, rho=0, s=5, beta_type=2, snr=1): + + rpy.r(''' + source('~/best-subset/bestsubset/R/sim.R') + sim_xy = sim.xy + ''') +>>>>>>> commit changes to test_mle:selectinf/randomized/tests/test_cv_mle.py from rpy2 import robjects import rpy2.robjects.numpy2ri diff --git a/selectinf/randomized/tests/test_lasso.py b/selectinf/randomized/tests/test_lasso.py index 01b5b110a..507a80d63 100644 --- a/selectinf/randomized/tests/test_lasso.py +++ b/selectinf/randomized/tests/test_lasso.py @@ -394,3 +394,5 @@ def main(nsim=500, n=500, p=200, sqrt=False, target='full', sigma=3, AR=True): plt.show() +if __name__ == "__main__": + main() \ No newline at end of file From 6f546e96f897d1dc46a763dc6ee4e19a600cf61f Mon Sep 17 00:00:00 2001 From: snigdhagit Date: Mon, 29 Jun 2020 02:21:17 -0400 Subject: [PATCH 053/187] add approx log reference --- selectinf/randomized/approx_reference.py | 139 +++++++++++++++++++++++ selectinf/randomized/query.py | 22 +++- 2 files changed, 160 insertions(+), 1 deletion(-) create mode 100644 selectinf/randomized/approx_reference.py diff --git a/selectinf/randomized/approx_reference.py 
b/selectinf/randomized/approx_reference.py new file mode 100644 index 000000000..79096b0c4 --- /dev/null +++ b/selectinf/randomized/approx_reference.py @@ -0,0 +1,139 @@ +from __future__ import division, print_function + +import numpy as np, sys +from scipy.stats import norm as ndist + +from .selective_MLE_utils import solve_barrier_affine as solve_barrier_affine_C + + +class approximate_grid_inference(): + + def __init__(self, + query, + observed_target, + cov_target, + cov_target_score, + grid, + dispersion=1, + level=0.9, + solve_args={'tol':1.e-12}): + + self.solve_args = solve_args + + self.linear_part = query.sampler.affine_con.linear_part + self.offset = query.sampler.affine_con.offset + + self.logdens_linear = query.sampler.logdens_transform[0] + self.cond_mean = query.cond_mean + self.prec_opt = np.linalg.inv(query.cond_cov) + self.cond_cov = query.cond_cov + + self.observed_target = observed_target + self.cov_target_score = cov_target_score + self.cov_target = cov_target + + self.init_soln = query.observed_opt_state + self.grid = grid + + self.ntarget = cov_target.shape[0] + self.level = level + + def approx_log_reference(self, + observed_target, + cov_target, + cov_target_score): + + + if np.asarray(observed_target).shape in [(), (0,)]: + raise ValueError('no target specified') + + + observed_target = np.atleast_1d(observed_target) + prec_target = np.linalg.inv(cov_target) + target_lin = - self.logdens_linear.dot(cov_target_score.T.dot(prec_target)) + + ref_hat = [] + solver = solve_barrier_affine_C + for k in range(self.grid.shape[0]): + cond_mean_grid = target_lin.dot(np.asarray([self.grid[k]])) + ( + self.cond_mean - target_lin.dot(observed_target)) + conjugate_arg = self.prec_opt.dot(cond_mean_grid) + + val, _, _ = solver(conjugate_arg, + self.prec_opt, + self.init_soln, + self.linear_part, + self.offset, + self.solve_args) + + ref_hat.append(-val - (conjugate_arg.T.dot(self.cond_cov).dot(conjugate_arg) / 2.)) + + return np.asarray(ref_hat) + + + def approx_density(self, + mean_parameter, + cov_target, + approx_log_ref): + + + _approx_density = [] + for k in range(self.grid.shape[0]): + _approx_density.append(np.exp( + -np.true_divide((self.grid[k] - mean_parameter) ** 2, 2 * cov_target) + approx_log_ref[k])) + + _approx_density_ = np.asarray(_approx_density) / (np.asarray(_approx_density).sum()) + + return np.cumsum(_approx_density_) + + + def approx_ci(self, + param_grid, + cov_target, + approx_log_ref, + indx_obsv): + + area = np.zeros(param_grid.shape[0]) + + for k in range(param_grid.shape[0]): + area_vec = approx_density(param_grid[k], + cov_target, + approx_log_ref) + + area[k] = area_vec[indx_obsv] + + alpha = 1 - self.level + region = param_grid[(area >= alpha / 2.) & (area <= (1 - alpha / 2.))] + + if region.size > 0: + return np.nanmin(region), np.nanmax(region) + else: + return 0., 0. + + def approx_pivot(self, + mean_parameter): + + pivot = [] + + for m in range(self.ntarget): + observed_target_uni = (self.observed_target[m]).reshape((1,)) + cov_target_uni = (np.diag(self.cov_target)[m]).reshape((1, 1)) + cov_target_score_uni = self.cov_target_score[m, :].reshape((1, p)) + grid_indx_obs = np.argmin(np.abs(self.grid - observed_target_uni)) + + approx_log_ref = self.approx_log_reference(self.grid, + observed_target_uni, + cov_target_uni, + cov_target_score_uni) + + area_cum = approx_density(self.grid, + mean_parameter, + cov_target_uni, + approx_log_ref) + + pivot.append(2 * np.minimum(area_cum[grid_indx_obs], 1. 
- area_cum[grid_indx_obs])) + + sys.stderr.write("variable completed " + str(m + 1)+ "\n") + + return pivot + diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index 19fb677bb..557d6216b 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -14,7 +14,7 @@ constraints) from .posterior_inference import posterior from .selective_MLE_utils import solve_barrier_affine as solve_barrier_affine_C - +from .approx_reference import approximate_grid_inference class query(object): @@ -301,6 +301,25 @@ def prior(target_parameter): dispersion, solve_args=solve_args) + def approximate_grid_inference(self, + observed_target, + target_cov, + target_score_cov, + dispersion=None, + solve_args={'tol': 1.e-12}): + + if dispersion is None: + dispersion = 1 + print('Using dispersion parameter 1...') + + + return approximate_grid_inference(self, + observed_target, + target_cov, + target_score_cov, + dispersion, + solve_args=solve_args) + class gaussian_query(query): @@ -1570,6 +1589,7 @@ def _solve_barrier_nonneg(conjugate_arg, hess = np.linalg.inv(precision + np.diag(barrier_hessian(current))) return current_value, current, hess + def selective_MLE(observed_target, target_cov, target_score_cov, From 309306c74342a2cf0f154bb5163f8ae5082a9434 Mon Sep 17 00:00:00 2001 From: snigdhagit Date: Mon, 29 Jun 2020 02:52:37 -0400 Subject: [PATCH 054/187] added test for pivot b.o. approx reference --- selectinf/randomized/approx_reference.py | 138 +++++++++--------- selectinf/randomized/query.py | 4 + .../randomized/tests/test_approx_reference.py | 93 ++++++++++++ 3 files changed, 163 insertions(+), 72 deletions(-) create mode 100644 selectinf/randomized/tests/test_approx_reference.py diff --git a/selectinf/randomized/approx_reference.py b/selectinf/randomized/approx_reference.py index 79096b0c4..8fc0b731c 100644 --- a/selectinf/randomized/approx_reference.py +++ b/selectinf/randomized/approx_reference.py @@ -1,8 +1,6 @@ from __future__ import division, print_function import numpy as np, sys -from scipy.stats import norm as ndist - from .selective_MLE_utils import solve_barrier_affine as solve_barrier_affine_C @@ -38,102 +36,98 @@ def __init__(self, self.ntarget = cov_target.shape[0] self.level = level - def approx_log_reference(self, - observed_target, - cov_target, - cov_target_score): - + def approx_log_reference(self, + observed_target, + cov_target, + cov_target_score): - if np.asarray(observed_target).shape in [(), (0,)]: + if np.asarray(observed_target).shape in [(), (0,)]: raise ValueError('no target specified') + observed_target = np.atleast_1d(observed_target) + prec_target = np.linalg.inv(cov_target) + target_lin = - self.logdens_linear.dot(cov_target_score.T.dot(prec_target)) - observed_target = np.atleast_1d(observed_target) - prec_target = np.linalg.inv(cov_target) - target_lin = - self.logdens_linear.dot(cov_target_score.T.dot(prec_target)) + ref_hat = [] + solver = solve_barrier_affine_C + for k in range(self.grid.shape[0]): + cond_mean_grid = target_lin.dot(np.asarray([self.grid[k]])) + ( + self.cond_mean - target_lin.dot(observed_target)) + conjugate_arg = self.prec_opt.dot(cond_mean_grid) - ref_hat = [] - solver = solve_barrier_affine_C - for k in range(self.grid.shape[0]): - cond_mean_grid = target_lin.dot(np.asarray([self.grid[k]])) + ( - self.cond_mean - target_lin.dot(observed_target)) - conjugate_arg = self.prec_opt.dot(cond_mean_grid) + val, _, _ = solver(conjugate_arg, + self.prec_opt, + self.init_soln, + self.linear_part, + self.offset, + 
**self.solve_args) - val, _, _ = solver(conjugate_arg, - self.prec_opt, - self.init_soln, - self.linear_part, - self.offset, - self.solve_args) + ref_hat.append(-val - (conjugate_arg.T.dot(self.cond_cov).dot(conjugate_arg) / 2.)) - ref_hat.append(-val - (conjugate_arg.T.dot(self.cond_cov).dot(conjugate_arg) / 2.)) + return np.asarray(ref_hat) - return np.asarray(ref_hat) + def approx_density(self, + mean_parameter, + cov_target, + approx_log_ref): - def approx_density(self, - mean_parameter, - cov_target, - approx_log_ref): + _approx_density = [] + for k in range(self.grid.shape[0]): + _approx_density.append(np.exp(-np.true_divide((self.grid[k] - mean_parameter) ** 2, 2 * cov_target) + approx_log_ref[k])) + _approx_density_ = np.asarray(_approx_density) / (np.asarray(_approx_density).sum()) + return np.cumsum(_approx_density_) - _approx_density = [] - for k in range(self.grid.shape[0]): - _approx_density.append(np.exp( - -np.true_divide((self.grid[k] - mean_parameter) ** 2, 2 * cov_target) + approx_log_ref[k])) + def approx_ci(self, + param_grid, + cov_target, + approx_log_ref, + indx_obsv): - _approx_density_ = np.asarray(_approx_density) / (np.asarray(_approx_density).sum()) + area = np.zeros(param_grid.shape[0]) - return np.cumsum(_approx_density_) + for k in range(param_grid.shape[0]): + area_vec = self.approx_density(param_grid[k], + cov_target, + approx_log_ref) + area[k] = area_vec[indx_obsv] - def approx_ci(self, - param_grid, - cov_target, - approx_log_ref, - indx_obsv): - - area = np.zeros(param_grid.shape[0]) + alpha = 1 - self.level + region = param_grid[(area >= alpha / 2.) & (area <= (1 - alpha / 2.))] - for k in range(param_grid.shape[0]): - area_vec = approx_density(param_grid[k], - cov_target, - approx_log_ref) + if region.size > 0: + return np.nanmin(region), np.nanmax(region) + else: + return 0., 0. - area[k] = area_vec[indx_obsv] + def approx_pivot(self, + mean_parameter): - alpha = 1 - self.level - region = param_grid[(area >= alpha / 2.) & (area <= (1 - alpha / 2.))] + pivot = [] - if region.size > 0: - return np.nanmin(region), np.nanmax(region) - else: - return 0., 0. + for m in range(self.ntarget): + p = self.cov_target_score.shape[1] + observed_target_uni = (self.observed_target[m]).reshape((1,)) + cov_target_uni = (np.diag(self.cov_target)[m]).reshape((1, 1)) + cov_target_score_uni = self.cov_target_score[m, :].reshape((1, p)) + grid_indx_obs = np.argmin(np.abs(self.grid - observed_target_uni)) - def approx_pivot(self, - mean_parameter): + approx_log_ref = self.approx_log_reference(observed_target_uni, + cov_target_uni, + cov_target_score_uni) - pivot = [] + area_cum = self.approx_density(mean_parameter[m], + cov_target_uni, + approx_log_ref) - for m in range(self.ntarget): - observed_target_uni = (self.observed_target[m]).reshape((1,)) - cov_target_uni = (np.diag(self.cov_target)[m]).reshape((1, 1)) - cov_target_score_uni = self.cov_target_score[m, :].reshape((1, p)) - grid_indx_obs = np.argmin(np.abs(self.grid - observed_target_uni)) + pivot.append(2 * np.minimum(area_cum[grid_indx_obs], 1. - area_cum[grid_indx_obs])) - approx_log_ref = self.approx_log_reference(self.grid, - observed_target_uni, - cov_target_uni, - cov_target_score_uni) + sys.stderr.write("variable completed " + str(m + 1) + "\n") - area_cum = approx_density(self.grid, - mean_parameter, - cov_target_uni, - approx_log_ref) + return pivot - pivot.append(2 * np.minimum(area_cum[grid_indx_obs], 1. 
- area_cum[grid_indx_obs])) - sys.stderr.write("variable completed " + str(m + 1)+ "\n") - return pivot diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index 557d6216b..1185adebb 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -305,6 +305,7 @@ def approximate_grid_inference(self, observed_target, target_cov, target_score_cov, + grid = None, dispersion=None, solve_args={'tol': 1.e-12}): @@ -312,11 +313,14 @@ def approximate_grid_inference(self, dispersion = 1 print('Using dispersion parameter 1...') + if grid is None: + grid = np.linspace(- 20., 20., num=401) return approximate_grid_inference(self, observed_target, target_cov, target_score_cov, + grid, dispersion, solve_args=solve_args) diff --git a/selectinf/randomized/tests/test_approx_reference.py b/selectinf/randomized/tests/test_approx_reference.py new file mode 100644 index 000000000..c817582d7 --- /dev/null +++ b/selectinf/randomized/tests/test_approx_reference.py @@ -0,0 +1,93 @@ +import numpy as np + +from ...tests.instance import gaussian_instance +from ..lasso import lasso, selected_targets + + +def test_approx_pivot(n=500, + p=100, + signal_fac=1., + s=5, + sigma=2., + rho=0.4, + randomizer_scale=1.): + + inst, const = gaussian_instance, lasso.gaussian + signal = np.sqrt(signal_fac * 2 * np.log(p)) + + X, Y, beta = inst(n=n, + p=p, + signal=signal, + s=s, + equicorrelated=False, + rho=rho, + sigma=sigma, + random_signs=True)[:3] + + n, p = X.shape + + sigma_ = np.std(Y) + dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) + + W = 1 * np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ + + conv = const(X, + Y, + W, + randomizer_scale=randomizer_scale * dispersion) + + signs = conv.fit() + nonzero = signs != 0 + + if nonzero.sum()>0: + beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) + + (observed_target, + cov_target, + cov_target_score, + alternatives) = selected_targets(conv.loglike, + conv._W, + nonzero, + dispersion=dispersion) + + grid = np.linspace(- 20., 20., num=401) + + approximate_grid_inf = conv.approximate_grid_inference(observed_target, + cov_target, + cov_target_score, + grid=grid, + dispersion=dispersion) + + pivot = approximate_grid_inf.approx_pivot(beta_target) + + return pivot + + +import matplotlib.pyplot as plt +from statsmodels.distributions.empirical_distribution import ECDF + + +def main(nsim=300): + + _pivot=[] + for i in range(nsim): + + _pivot.extend(test_approx_pivot(n=100, + p=50, + signal_fac=0.5, + s=5, + sigma=2., + rho=0.20, + randomizer_scale=1.)) + + print("iteration completed ", i) + + plt.clf() + ecdf_MLE = ECDF(np.asarray(_pivot)) + grid = np.linspace(0, 1, 101) + plt.plot(grid, ecdf_MLE(grid), c='blue', marker='^') + plt.plot(grid, grid, 'k--') + plt.show() + +if __name__ =="__main__": + main(nsim=50) \ No newline at end of file From b35377e90b8432fc09d152750f4c26f8c2067fc6 Mon Sep 17 00:00:00 2001 From: snigdhagit Date: Tue, 30 Jun 2020 01:17:38 -0400 Subject: [PATCH 055/187] added approximate ci b.o. 
approx reference --- selectinf/randomized/approx_reference.py | 29 ++++ .../randomized/tests/test_approx_reference.py | 145 +++++++++++++++--- 2 files changed, 152 insertions(+), 22 deletions(-) diff --git a/selectinf/randomized/approx_reference.py b/selectinf/randomized/approx_reference.py index 8fc0b731c..3c1b1b8fa 100644 --- a/selectinf/randomized/approx_reference.py +++ b/selectinf/randomized/approx_reference.py @@ -128,6 +128,35 @@ def approx_pivot(self, return pivot + def approx_intervals(self, + param_grid): + + intervals_lci =[] + intervals_uci =[] + + for m in range(self.ntarget): + p = self.cov_target_score.shape[1] + observed_target_uni = (self.observed_target[m]).reshape((1,)) + cov_target_uni = (np.diag(self.cov_target)[m]).reshape((1, 1)) + cov_target_score_uni = self.cov_target_score[m, :].reshape((1, p)) + grid_indx_obs = np.argmin(np.abs(self.grid - observed_target_uni)) + + approx_log_ref = self.approx_log_reference(observed_target_uni, + cov_target_uni, + cov_target_score_uni) + + approx_lci, approx_uci = self.approx_ci(param_grid[m,:], + cov_target_uni, + approx_log_ref, + grid_indx_obs) + + intervals_lci.append(approx_lci) + intervals_uci.append(approx_uci) + + sys.stderr.write("variable completed " + str(m + 1) + "\n") + + return np.asarray(intervals_lci), np.asarray(intervals_uci) + diff --git a/selectinf/randomized/tests/test_approx_reference.py b/selectinf/randomized/tests/test_approx_reference.py index c817582d7..c394559d6 100644 --- a/selectinf/randomized/tests/test_approx_reference.py +++ b/selectinf/randomized/tests/test_approx_reference.py @@ -50,7 +50,14 @@ def test_approx_pivot(n=500, nonzero, dispersion=dispersion) - grid = np.linspace(- 20., 20., num=401) + inverse_info = conv.selective_MLE(observed_target, + cov_target, + cov_target_score)[1] + + scale_ = 4 * np.max(np.sqrt(np.diag(inverse_info))) + ngrid = 2 * scale_/0.1 + + grid = np.linspace(- scale_, scale_, num=ngrid) approximate_grid_inf = conv.approximate_grid_inference(observed_target, cov_target, @@ -63,31 +70,125 @@ def test_approx_pivot(n=500, return pivot -import matplotlib.pyplot as plt -from statsmodels.distributions.empirical_distribution import ECDF +def test_approx_ci(n=500, + p=100, + signal_fac=1., + s=5, + sigma=2., + rho=0.4, + randomizer_scale=1.): + inst, const = gaussian_instance, lasso.gaussian + signal = np.sqrt(signal_fac * 2 * np.log(p)) -def main(nsim=300): + X, Y, beta = inst(n=n, + p=p, + signal=signal, + s=s, + equicorrelated=False, + rho=rho, + sigma=sigma, + random_signs=True)[:3] - _pivot=[] - for i in range(nsim): + n, p = X.shape + + sigma_ = np.std(Y) + dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) - _pivot.extend(test_approx_pivot(n=100, - p=50, - signal_fac=0.5, - s=5, - sigma=2., - rho=0.20, - randomizer_scale=1.)) + W = 1 * np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ + + conv = const(X, + Y, + W, + randomizer_scale=randomizer_scale * dispersion) + + signs = conv.fit() + nonzero = signs != 0 - print("iteration completed ", i) + if nonzero.sum()>0: + + (observed_target, + cov_target, + cov_target_score, + alternatives) = selected_targets(conv.loglike, + conv._W, + nonzero, + dispersion=dispersion) + + ntarget = observed_target.shape[0] + result, inverse_info = conv.selective_MLE(observed_target, + cov_target, + cov_target_score)[:2] + + _scale = 4 * np.sqrt(np.diag(inverse_info)) + scale_ = np.max(_scale) + ngrid = int(2 * scale_/0.1) + + grid = np.linspace(-scale_, scale_, num=ngrid) + + approximate_grid_inf = 
conv.approximate_grid_inference(observed_target, + cov_target, + cov_target_score, + grid=grid, + dispersion=dispersion) + + + param_grid = np.zeros((ntarget, ngrid)) + mle = np.asarray(result['MLE']) + for j in range(ntarget): + param_grid[j,:] = np.linspace(mle[j]-_scale[j], mle[j]+_scale[j], num=ngrid) + + lci, uci = approximate_grid_inf.approx_intervals(param_grid) + + beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) + coverage = (lci < beta_target) * (uci > beta_target) + length = uci - lci + + return np.mean(coverage), np.mean(length) + +import matplotlib.pyplot as plt +from statsmodels.distributions.empirical_distribution import ECDF - plt.clf() - ecdf_MLE = ECDF(np.asarray(_pivot)) - grid = np.linspace(0, 1, 101) - plt.plot(grid, ecdf_MLE(grid), c='blue', marker='^') - plt.plot(grid, grid, 'k--') - plt.show() -if __name__ =="__main__": - main(nsim=50) \ No newline at end of file +def main(nsim=300, CI = False): + + if CI is False: + _pivot = [] + for i in range(nsim): + _pivot.extend(test_approx_pivot(n=200, + p=100, + signal_fac=0.5, + s=5, + sigma=3., + rho=0.20, + randomizer_scale=1.)) + + print("iteration completed ", i) + + plt.clf() + ecdf_MLE = ECDF(np.asarray(_pivot)) + grid = np.linspace(0, 1, 101) + plt.plot(grid, ecdf_MLE(grid), c='blue', marker='^') + plt.plot(grid, grid, 'k--') + plt.show() + + if CI is True: + coverage_ = 0. + length_ = 0. + for n in range(nsim): + cov, len = test_approx_ci(n=500, + p=100, + signal_fac=1., + s=5, + sigma=2., + rho=0.4, + randomizer_scale=1.) + + coverage_ += cov + length_ += len + print("coverage so far ", coverage_ / (n + 1.)) + print("lengths so far ", length_ / (n + 1.)) + print("iteration completed ", n + 1) + +if __name__ == "__main__": + main(nsim=20, CI = True) \ No newline at end of file From 9223cccf077fe3d9eb56744ed6dd59e225986442 Mon Sep 17 00:00:00 2001 From: snigdhagit Date: Tue, 30 Jun 2020 18:03:05 -0400 Subject: [PATCH 056/187] sigma instead of sigma_sq while setting scale parameter of posterior samplers --- selectinf/randomized/posterior_inference.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index 309779fa0..355bab0e0 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -174,12 +174,12 @@ def langevin_sampler(selective_posterior, selective_posterior.log_posterior, proposal_scale, stepsize, - selective_posterior.dispersion) + np.sqrt(selective_posterior.dispersion)) samples = np.zeros((nsample, selective_posterior.ntarget)) for i, sample in enumerate(sampler): - sampler.scaling = selective_posterior.dispersion + sampler.scaling = np.sqrt(selective_posterior.dispersion) samples[i,:] = sample.copy() if i == nsample - 1: break @@ -202,22 +202,22 @@ def gibbs_sampler(selective_posterior, selective_posterior.log_posterior, proposal_scale, stepsize, - selective_posterior.dispersion) + np.sqrt(selective_posterior.dispersion)) samples = np.zeros((nsample, selective_posterior.ntarget)) scale_samples = np.zeros(nsample) - scale_update = selective_posterior.dispersion + scale_update = np.sqrt(selective_posterior.dispersion) for i in range(nsample): sample = sampler.__next__() samples[i, :] = sample - scale_update = invgamma.rvs(a=(0.1 + + scale_update_sq = invgamma.rvs(a=(0.1 + selective_posterior.ntarget + selective_posterior.ntarget/2), - scale=0.1-(scale_update * sampler.grad_posterior[1]), - size=1) - scale_samples[i] = 
scale_update - sampler.scaling = scale_update + scale=0.1-((scale_update**2) * sampler.grad_posterior[1]), + size=1) + scale_samples[i] = np.sqrt(scale_update_sq) + sampler.scaling = np.sqrt(scale_update_sq) return samples[nburnin:, :], scale_samples[nburnin:] From 4b77d8a8129096909d5adac389be15a428533178 Mon Sep 17 00:00:00 2001 From: snigdhagit Date: Thu, 2 Jul 2020 12:54:50 -0400 Subject: [PATCH 057/187] added hiv test: carved posterior interval estimates --- selectinf/randomized/posterior_inference.py | 4 +- selectinf/randomized/tests/test_posterior.py | 182 ++++++++++++++++--- 2 files changed, 156 insertions(+), 30 deletions(-) diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index 355bab0e0..d8be08029 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -48,7 +48,7 @@ def __init__(self, offset = query.sampler.affine_con.offset logdens_linear = query.sampler.logdens_transform[0] - _, self.inverse_info, log_ref = query.selective_MLE(observed_target, + result, self.inverse_info, log_ref = query.selective_MLE(observed_target, cov_target, cov_target_score) @@ -69,7 +69,7 @@ def __init__(self, self.linear_part = linear_part self.offset = offset - self.initial_estimate = observed_target + self.initial_estimate = np.asarray(result['MLE']) self.dispersion = dispersion self.log_ref = log_ref diff --git a/selectinf/randomized/tests/test_posterior.py b/selectinf/randomized/tests/test_posterior.py index 039c72a90..bba39845a 100644 --- a/selectinf/randomized/tests/test_posterior.py +++ b/selectinf/randomized/tests/test_posterior.py @@ -1,9 +1,11 @@ import numpy as np +import pandas as pd +import statsmodels.api as sm +from scipy.stats import norm as ndist from ...tests.instance import gaussian_instance -from ..lasso import lasso, selected_targets -from ..posterior_inference import (posterior, - langevin_sampler, +from ..lasso import lasso, selected_targets, split_lasso +from ..posterior_inference import (langevin_sampler, gibbs_sampler) def test_Langevin(n=500, @@ -13,8 +15,8 @@ def test_Langevin(n=500, sigma=3., rho=0.4, randomizer_scale=1., - nsample=100, - nburnin=50): + nsample=1500, + nburnin=100): inst, const = gaussian_instance, lasso.gaussian signal = np.sqrt(signal_fac * 2 * np.log(p)) @@ -62,9 +64,9 @@ def test_Langevin(n=500, nsample=nsample, nburnin=nburnin) - gibbs_samples = gibbs_sampler(posterior_inf, - nsample=nsample, - nburnin=nburnin) + # gibbs_samples = gibbs_sampler(posterior_inf, + # nsample=nsample, + # nburnin=nburnin) lci = np.percentile(samples, 5, axis=0) uci = np.percentile(samples, 95, axis=0) @@ -109,11 +111,12 @@ def test_instance(nsample=100, nburnin=50): gibbs_samples = gibbs_sampler(posterior_inf, nsample=nsample, - nburnin=nburnin) + nburnin=nburnin)[0] lci = np.percentile(samples, 5, axis=0) uci = np.percentile(samples, 95, axis=0) + beta_target = np.linalg.pinv(X[:, M]).dot(X.dot(beta)) coverage = (lci < beta_target) * (uci > beta_target) length = uci - lci @@ -225,30 +228,153 @@ def prior(target_parameter): return samples -def main(ndraw=10): +def test_hiv_data(nsample=1000, + nburnin=100, + alpha =0.10, + split_proportion=0.50, + seedn = 1): + + np.random.seed(seedn) + + level = 1 - alpha / 2. 
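+    # with the default alpha = 0.10, level = 0.95 is the one-sided probability used
+    # for each endpoint, so every interval built below (Langevin/Gibbs credible,
+    # naive and split Wald) is a two-sided 90% interval; Z_quantile is the matching
+    # standard-normal quantile for the Wald-type intervals.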
+ Z_quantile = ndist.ppf(level) + + NRTI = pd.read_csv("http://hivdb.stanford.edu/pages/published_analysis/genophenoPNAS2006/DATA/NRTI_DATA.txt", + na_values="NA", sep='\t') + + NRTI_specific = [] + NRTI_muts = [] + + for i in range(1, 241): + d = NRTI['P%d' % i] + for mut in np.unique(d): + if mut not in ['-', '.'] and len(mut) == 1: + test = np.equal(d, mut) + if test.sum() > 10: + NRTI_specific.append(np.array(np.equal(d, mut))) + NRTI_muts.append("P%d%s" % (i, mut)) + + NRTI_specific = NRTI.from_records(np.array(NRTI_specific).T, columns=NRTI_muts) + + X_NRTI = np.array(NRTI_specific, np.float) + Y = NRTI['3TC'] # shorthand + keep = ~np.isnan(Y).astype(np.bool) + X_NRTI = X_NRTI[np.nonzero(keep)] + + Y = Y[keep] + Y = np.array(np.log(Y), np.float) + Y -= Y.mean() + X_NRTI -= X_NRTI.mean(0)[None, :] + X_NRTI /= X_NRTI.std(0)[None, :] + X = X_NRTI + n, p = X.shape + X /= np.sqrt(n) + - coverage_ = 0. - length_ = 0. - for n in range(ndraw): - # cov, len = test_Langevin(n=500, - # p=200, - # signal_fac=1.5, - # s=5, - # sigma=2., - # rho=0.2, - # randomizer_scale=1. - # ) + ols_fit = sm.OLS(Y, X).fit() + _sigma = np.linalg.norm(ols_fit.resid) / np.sqrt(n - p - 1) - cov, len = test_instance(nsample=2000, - nburnin=100) + const = split_lasso.gaussian - coverage_ += cov - length_ += len + dispersion = _sigma ** 2 - print("coverage so far ", coverage_ / (n + 1.)) - print("lengths so far ", length_ / (n + 1.)) - print("iteration completed ", n + 1) + W = 1 * np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * _sigma + conv = const(X, + Y, + W, + proportion=split_proportion) + + signs = conv.fit() + nonzero = signs != 0 + + (observed_target, + cov_target, + cov_target_score, + alternatives) = selected_targets(conv.loglike, + conv._W, + nonzero, + dispersion=dispersion) + + + posterior_inf = conv.posterior(observed_target, + cov_target, + cov_target_score, + dispersion=dispersion) + + samples_langevin = langevin_sampler(posterior_inf, + nsample=nsample, + nburnin=nburnin, + step=1.) 
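+    # samples_langevin holds draws from the posterior returned by conv.posterior above;
+    # the percentiles taken next give equal-tailed credible intervals at the same
+    # 90% level as the Wald-type intervals computed further below.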
+ + lci_langevin = np.percentile(samples_langevin, int((1-level)*100), axis=0) + uci_langevin = np.percentile(samples_langevin, int((level)*100), axis=0) + + samples_gibbs = gibbs_sampler(posterior_inf, + nsample=nsample, + nburnin=nburnin)[0] + + lci_gibbs = np.percentile(samples_gibbs, int((1 - level) * 100), axis=0) + uci_gibbs = np.percentile(samples_gibbs, int((level) * 100), axis=0) + + naive_est = np.linalg.pinv(X[:, nonzero]).dot(Y) + naive_cov = _sigma * np.linalg.inv(X[:, nonzero].T.dot(X[:, nonzero])) + naive_intervals = np.vstack([naive_est - Z_quantile * np.sqrt(np.diag(naive_cov)), + naive_est + Z_quantile * np.sqrt(np.diag(naive_cov))]).T + + X_split = X[~conv._selection_idx, :] + Y_split = Y[~conv._selection_idx] + split_est = np.linalg.pinv(X_split[:, nonzero]).dot(Y_split) + split_cov = _sigma * np.linalg.inv(X_split[:, nonzero].T.dot(X_split[:, nonzero])) + split_intervals = np.vstack([split_est - Z_quantile * np.sqrt(np.diag(split_cov)), + split_est + Z_quantile * np.sqrt(np.diag(split_cov))]).T + + print("lengths: adjusted intervals Langevin, Gibbs, MLE ", np.mean(uci_langevin - lci_langevin), np.mean(uci_gibbs - lci_gibbs), + np.mean((2* Z_quantile )* np.sqrt(np.diag(posterior_inf.inverse_info)))) + + print("lengths: naive intervals ", np.mean(naive_intervals[:,1]-naive_intervals[:,0])) + + print("lengths: split intervals ", np.mean(split_intervals[:, 1] - split_intervals[:, 0])) + + output = pd.DataFrame({'Langevin_lower_confidence': lci_langevin, + 'Langevin_upper_confidence': uci_langevin, + 'Gibbs_lower_confidence': lci_gibbs, + 'Gibbs_upper_confidence': uci_gibbs, + 'Split_lower_confidence': split_intervals[:,0], + 'Split_upper_confidence': split_intervals[:, 1], + 'Naive_lower_confidence': naive_intervals[:, 0], + 'Naive_upper_confidence': naive_intervals[:, 1] + }) + + return output + +# def main(ndraw=10): +# +# coverage_ = 0. +# length_ = 0. +# for n in range(ndraw): +# cov, len = test_Langevin(n=500, +# p=200, +# signal_fac=1., +# s=5, +# sigma=3., +# rho=0.2, +# randomizer_scale=1. 
+# ) +# +# # cov, len = test_instance(nsample=2000, +# # nburnin=100) +# +# coverage_ += cov +# length_ += len +# +# print("coverage so far ", coverage_ / (n + 1.)) +# print("lengths so far ", length_ / (n + 1.)) +# print("iteration completed ", n + 1) + + +def main(): + test_hiv_data(split_proportion=0.50) if __name__ == "__main__": main() From 8f4e6128af512fa25cedca2ddaa18e311fddcd91 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 8 Jul 2020 12:13:49 -0700 Subject: [PATCH 058/187] changing order of output of log posterior --- doc/adjusted_MLE/tests/comparison_metrics.py | 935 ------------------- selectinf/randomized/posterior_inference.py | 40 +- selectinf/randomized/query.py | 59 +- 3 files changed, 56 insertions(+), 978 deletions(-) delete mode 100644 doc/adjusted_MLE/tests/comparison_metrics.py diff --git a/doc/adjusted_MLE/tests/comparison_metrics.py b/doc/adjusted_MLE/tests/comparison_metrics.py deleted file mode 100644 index 15a003d0e..000000000 --- a/doc/adjusted_MLE/tests/comparison_metrics.py +++ /dev/null @@ -1,935 +0,0 @@ -<<<<<<< HEAD:doc/adjusted_MLE/tests/comparison_metrics.py -from __future__ import division, print_function -import numpy as np, sys, time -from scipy.stats import norm as ndist -======= -import numpy as np, os, itertools -import pandas as pd - -import rpy2.robjects as rpy -from rpy2.robjects import numpy2ri -rpy.numpy2ri.activate() -from scipy.stats import norm as ndist - -from ..lasso import lasso, full_targets, selected_targets, debiased_targets -from ...algorithms.lasso import lasso as lasso_full - -def sim_xy(n, p, nval, rho=0, s=5, beta_type=2, snr=1): - - rpy.r(''' - source('~/best-subset/bestsubset/R/sim.R') - sim_xy = sim.xy - ''') ->>>>>>> commit changes to test_mle:selectinf/randomized/tests/test_cv_mle.py - -from rpy2 import robjects -import rpy2.robjects.numpy2ri - -from ...randomized.lasso import lasso, full_targets, selected_targets, debiased_targets -from ...algorithms.lasso import ROSI -from ...tests.instance import gaussian_instance - -def BHfilter(pval, q=0.2): - pval = np.asarray(pval) - pval_sort = np.sort(pval) - comparison = q * np.arange(1, pval.shape[0] + 1.) 
/ pval.shape[0] - passing = pval_sort < comparison - if passing.sum(): - thresh = comparison[np.nonzero(passing)[0].max()] - return np.nonzero(pval <= thresh)[0] - return [] - -def sim_xy(n, - p, - nval, - rho=0, - s=5, - beta_type=2, - snr=1): - try: - rpy2.robjects.numpy2ri.activate() - robjects.r(''' - #library(bestsubset) - source('~/best-subset/bestsubset/R/sim.R') - sim_xy = sim.xy - ''') - - r_simulate = robjects.globalenv['sim_xy'] - sim = r_simulate(n, p, nval, rho, s, beta_type, snr) - X = np.array(sim.rx2('x')) - y = np.array(sim.rx2('y')) - X_val = np.array(sim.rx2('xval')) - y_val = np.array(sim.rx2('yval')) - Sigma = np.array(sim.rx2('Sigma')) - beta = np.array(sim.rx2('beta')) - sigma = np.array(sim.rx2('sigma')) - rpy2.robjects.numpy2ri.deactivate() - return X, y, X_val, y_val, Sigma, beta, sigma - except: - X, y, beta, _, sigma, Sigma = gaussian_instance(n=n, - p=p, - s=s, - signal=snr, - equicorrelated=False, - rho=rho) - X_val = gaussian_instance(n=n, - p=p, - s=s, - signal=snr, - equicorrelated=False, - rho=rho)[0] - y_val = X_val.dot(beta) + sigma * np.random.standard_normal(X_val.shape[0]) - return X, y, X_val, y_val, Sigma, beta, sigma - -def selInf_R(X, y, beta, lam, sigma, Type, alpha=0.1): - robjects.r(''' - library("selectiveInference") - selInf = function(X, y, beta, lam, sigma, Type, alpha= 0.1){ - y = as.matrix(y) - X = as.matrix(X) - beta = as.matrix(beta) - lam = as.matrix(lam)[1,1] - sigma = as.matrix(sigma)[1,1] - Type = as.matrix(Type)[1,1] - if(Type == 1){ - type = "full"} else{ - type = "partial"} - inf = fixedLassoInf(x = X, y = y, beta = beta, lambda=lam, family = "gaussian", - intercept=FALSE, sigma=sigma, alpha=alpha, type=type) - return(list(ci = inf$ci, pvalue = inf$pv))} - ''') - - inf_R = robjects.globalenv['selInf'] - n, p = X.shape - r_X = robjects.r.matrix(X, nrow=n, ncol=p) - r_y = robjects.r.matrix(y, nrow=n, ncol=1) - r_beta = robjects.r.matrix(beta, nrow=p, ncol=1) - r_lam = robjects.r.matrix(lam, nrow=1, ncol=1) - r_sigma = robjects.r.matrix(sigma, nrow=1, ncol=1) - r_Type = robjects.r.matrix(Type, nrow=1, ncol=1) - output = inf_R(r_X, r_y, r_beta, r_lam, r_sigma, r_Type) - ci = np.array(output.rx2('ci')) - pvalue = np.array(output.rx2('pvalue')) - return ci, pvalue - - -def glmnet_lasso(X, y, lambda_val): - robjects.r(''' - library(glmnet) - glmnet_LASSO = function(X,y, lambda){ - y = as.matrix(y) - X = as.matrix(X) - lam = as.matrix(lambda)[1,1] - n = nrow(X) - - fit = glmnet(X, y, standardize=TRUE, intercept=FALSE, thresh=1.e-10) - estimate = coef(fit, s=lam, exact=TRUE, x=X, y=y)[-1] - fit.cv = cv.glmnet(X, y, standardize=TRUE, intercept=FALSE, thresh=1.e-10) - estimate.1se = coef(fit.cv, s='lambda.1se', exact=TRUE, x=X, y=y)[-1] - estimate.min = coef(fit.cv, s='lambda.min', exact=TRUE, x=X, y=y)[-1] - return(list(estimate = estimate, estimate.1se = estimate.1se, - estimate.min = estimate.min, - lam.min = fit.cv$lambda.min, - lam.1se = fit.cv$lambda.1se)) - }''') - - lambda_R = robjects.globalenv['glmnet_LASSO'] - n, p = X.shape - r_X = robjects.r.matrix(X, nrow=n, ncol=p) - r_y = robjects.r.matrix(y, nrow=n, ncol=1) - r_lam = robjects.r.matrix(lambda_val, nrow=1, ncol=1) - - estimate = np.array(lambda_R(r_X, r_y, r_lam).rx2('estimate')) - estimate_1se = np.array(lambda_R(r_X, r_y, r_lam).rx2('estimate.1se')) - estimate_min = np.array(lambda_R(r_X, r_y, r_lam).rx2('estimate.min')) - lam_min = np.asscalar(np.array(lambda_R(r_X, r_y, r_lam).rx2('lam.min'))) - lam_1se = np.asscalar(np.array(lambda_R(r_X, r_y, r_lam).rx2('lam.1se'))) - 
return estimate, estimate_1se, estimate_min, lam_min, lam_1se - -def coverage(intervals, pval, target, truth): - pval_alt = (pval[truth != 0]) < 0.1 - if pval_alt.sum() > 0: - avg_power = np.mean(pval_alt) - else: - avg_power = 0. - return np.mean((target > intervals[:, 0]) * (target < intervals[:, 1])), avg_power - -def relative_risk(est, truth, Sigma): - if (truth != 0).sum > 0: - return (est - truth).T.dot(Sigma).dot(est - truth) / truth.T.dot(Sigma).dot(truth) - else: - return (est - truth).T.dot(Sigma).dot(est - truth) - - -def comparison_cvmetrics_selected(n=500, - p=100, - nval=500, - rho=0.35, - s=5, - beta_type=1, - snr=0.20, - randomizer_scale=np.sqrt(0.50), - full_dispersion=True, - tuning_nonrand="lambda.min", - tuning_rand="lambda.1se"): - - (X, y, _, _, Sigma, beta, sigma) = sim_xy(n=n, - p=p, - nval=nval, - rho=rho, - s=s, - beta_type=beta_type, - snr=snr) - - true_mean = X.dot(beta) - print("snr", snr) - X -= X.mean(0)[None, :] - X /= (X.std(0)[None, :] * np.sqrt(n / (n - 1.))) - y = y - y.mean() - true_set = np.asarray([u for u in range(p) if beta[u] != 0]) - - if full_dispersion: - dispersion = np.linalg.norm(y - X.dot(np.linalg.pinv(X).dot(y))) ** 2 / (n - p) - sigma_ = np.sqrt(dispersion) - else: - dispersion = None - sigma_ = np.std(y) - print("estimated and true sigma", sigma, sigma_) - - lam_theory = (sigma_ * 1. * np.mean(np.fabs(np.dot(X.T, - np.random.standard_normal((n, 2000)))).max(0))) - (glm_LASSO_theory, - glm_LASSO_1se, - glm_LASSO_min, - lam_min, - lam_1se) = glmnet_lasso(X, y, lam_theory / n) - - if tuning_nonrand == "lambda.min": - lam_LASSO = lam_min - glm_LASSO = glm_LASSO_min - elif tuning_nonrand == "lambda.1se": - lam_LASSO = lam_1se - glm_LASSO = glm_LASSO_1se - else: - lam_LASSO = lam_theory/float(n) - glm_LASSO = glm_LASSO_theory - active_LASSO = (glm_LASSO != 0) - nactive_LASSO = active_LASSO.sum() - active_set_LASSO = np.asarray([r for r in range(p) if active_LASSO[r]]) - active_LASSO_bool = np.asarray([(np.in1d(active_set_LASSO[z], true_set).sum() > 0) for - z in range(nactive_LASSO)], np.bool) - - rel_LASSO = np.zeros(p) - Lee_nreport = 0 - bias_Lee = 0. - bias_naive = 0. - - if nactive_LASSO > 0: - post_LASSO_OLS = np.linalg.pinv(X[:, active_LASSO]).dot(y) - rel_LASSO[active_LASSO] = post_LASSO_OLS - Lee_target = np.linalg.pinv(X[:, active_LASSO]).dot(X.dot(beta)) - Lee_intervals, Lee_pval = selInf_R(X, - y, - glm_LASSO, - n * lam_LASSO, - sigma_, - Type=0, - alpha=0.1) - - if (Lee_pval.shape[0] == Lee_target.shape[0]): - - cov_Lee, selective_Lee_power = coverage(Lee_intervals, - Lee_pval, - Lee_target, - beta[active_LASSO]) - - inf_entries_bool = np.isinf(Lee_intervals[:, 1] - Lee_intervals[:, 0]) - inf_entries = np.mean(inf_entries_bool) - if inf_entries == 1.: - length_Lee = 0. - else: - length_Lee = (np.mean((Lee_intervals[:, 1] - Lee_intervals[:, 0]) - [~inf_entries_bool])) - power_Lee = ((active_LASSO_bool) * (np.logical_or((0. < Lee_intervals[:, 0]), - (0. 
> Lee_intervals[:, 1])))) \ - .sum() / float((beta != 0).sum()) - Lee_discoveries = BHfilter(Lee_pval, q=0.1) - power_Lee_BH = ((Lee_discoveries * active_LASSO_bool).sum() / - float((beta != 0).sum())) - fdr_Lee_BH = ((Lee_discoveries * ~active_LASSO_bool).sum() / - float(max(Lee_discoveries.sum(), 1.))) - bias_Lee = np.mean(glm_LASSO[active_LASSO] - Lee_target) - - naive_sd = sigma_ * np.sqrt(np.diag( - (np.linalg.inv(X[:, active_LASSO].T.dot(X[:, active_LASSO]))))) - naive_intervals = np.vstack([post_LASSO_OLS - 1.65 * naive_sd, - post_LASSO_OLS + 1.65 * naive_sd]).T - naive_pval = 2 * ndist.cdf(np.abs(post_LASSO_OLS) / naive_sd) - - cov_naive, selective_naive_power = coverage(naive_intervals, - naive_pval, - Lee_target, - beta[active_LASSO]) - - length_naive = np.mean(naive_intervals[:, 1] - naive_intervals[:, 0]) - power_naive = ((active_LASSO_bool) * ( - np.logical_or((0. < naive_intervals[:, 0]), - (0. > naive_intervals[:, 1])))).sum() / float( - (beta != 0).sum()) - - naive_discoveries = BHfilter(naive_pval, q=0.1) - - power_naive_BH = ((naive_discoveries * active_LASSO_bool).sum() / - float((beta != 0).sum())) - fdr_naive_BH = ((naive_discoveries * ~active_LASSO_bool).sum() / - float(max(naive_discoveries.sum(), 1.))) - - bias_naive = np.mean(rel_LASSO[active_LASSO] - Lee_target) - - partial_Lasso_risk = (glm_LASSO[active_LASSO]-Lee_target).T.dot( - glm_LASSO[active_LASSO]-Lee_target) - partial_relLasso_risk = (post_LASSO_OLS - Lee_target).T.dot( - post_LASSO_OLS - Lee_target) - - else: - Lee_nreport = 1 - (cov_Lee, - length_Lee, - inf_entries, - power_Lee, - power_Lee_BH, - fdr_Lee_BH, - selective_Lee_power) = [0., 0., 0., 0., 0., 0., 0.] - - (cov_naive, - length_naive, - power_naive, - power_naive_BH, - fdr_naive_BH, - selective_naive_power) = [0., 0., 0., 0., 0., 0.] - - naive_discoveries = np.zeros(1) - Lee_discoveries = np.zeros(1) - partial_Lasso_risk, partial_relLasso_risk = [0., 0.] - - elif nactive_LASSO == 0: - Lee_nreport = 1 - (cov_Lee, - length_Lee, - inf_entries, - power_Lee, - power_Lee_BH, - fdr_Lee_BH, - selective_Lee_power) = [0., 0., 0., 0., 0., 0., 0.] - - (cov_naive, - length_naive, - power_naive, - power_naive_BH, - fdr_naive_BH, - selective_naive_power) = [0., 0., 0., 0., 0., 0.] - - naive_discoveries = np.zeros(1) - Lee_discoveries = np.zeros(1) - partial_Lasso_risk, partial_relLasso_risk = [0., 0.] 
- - if tuning_rand == "lambda.min": - randomized_lasso = lasso.gaussian(X, - y, - feature_weights=n * lam_min * np.ones(p), - randomizer_scale= np.sqrt(n) * - randomizer_scale * sigma_) - elif tuning_rand == "lambda.1se": - randomized_lasso = lasso.gaussian(X, - y, - feature_weights=n * lam_1se * np.ones(p), - randomizer_scale= np.sqrt(n) * - randomizer_scale * sigma_) - else: - randomized_lasso = lasso.gaussian(X, - y, - feature_weights= lam_theory * np.ones(p), - randomizer_scale=np.sqrt(n) * - randomizer_scale * sigma_) - signs = randomized_lasso.fit() - nonzero = signs != 0 - active_set_rand = np.asarray([t for t in range(p) if nonzero[t]]) - active_rand_bool = np.asarray([(np.in1d(active_set_rand[x], true_set).sum() > 0) for x in range(nonzero.sum())], np.bool) - sel_MLE = np.zeros(p) - ind_est = np.zeros(p) - randomized_lasso_est = np.zeros(p) - randomized_rel_lasso_est = np.zeros(p) - MLE_nreport = 0 - - sys.stderr.write("active variables selected by cv LASSO " + str(nactive_LASSO) + "\n") - sys.stderr.write("active variables selected by randomized LASSO " + str(nonzero.sum()) + "\n" + "\n") - - if nonzero.sum() > 0: - target_randomized = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) - (observed_target, - cov_target, - cov_target_score, - alternatives) = selected_targets(randomized_lasso.loglike, - randomized_lasso._W, - nonzero, - dispersion=dispersion) - - MLE_estimate, _, _, MLE_pval, MLE_intervals, ind_unbiased_estimator = randomized_lasso.selective_MLE(observed_target, - cov_target, - cov_target_score, - alternatives) - sel_MLE[nonzero] = MLE_estimate - ind_est[nonzero] = ind_unbiased_estimator - randomized_lasso_est = randomized_lasso.initial_soln - randomized_rel_lasso_est = randomized_lasso._beta_full - - cov_MLE, selective_MLE_power = coverage(MLE_intervals, MLE_pval, target_randomized, beta[nonzero]) - length_MLE = np.mean(MLE_intervals[:, 1] - MLE_intervals[:, 0]) - power_MLE = ((active_rand_bool) * ( - np.logical_or((0. < MLE_intervals[:, 0]), (0. > MLE_intervals[:, 1])))).sum() / float((beta != 0).sum()) - MLE_discoveries = BHfilter(MLE_pval, q=0.1) - power_MLE_BH = (MLE_discoveries * active_rand_bool).sum() / float((beta != 0).sum()) - fdr_MLE_BH = (MLE_discoveries * ~active_rand_bool).sum() / float(max(MLE_discoveries.sum(), 1.)) - bias_MLE = np.mean(MLE_estimate - target_randomized) - - partial_MLE_risk = (MLE_estimate - target_randomized).T.dot(MLE_estimate - target_randomized) - partial_ind_risk = (ind_unbiased_estimator - target_randomized).T.dot(ind_unbiased_estimator - target_randomized) - partial_randLasso_risk = (randomized_lasso_est[nonzero] - target_randomized).T.dot(randomized_lasso_est[nonzero] - target_randomized) - partial_relrandLasso_risk = (randomized_rel_lasso_est[nonzero] - target_randomized).T.dot(randomized_rel_lasso_est[nonzero] - target_randomized) - else: - MLE_nreport = 1 - cov_MLE, length_MLE, power_MLE, power_MLE_BH, fdr_MLE_BH, bias_MLE, selective_MLE_power = [0., 0., 0., 0., 0., 0., 0.] - MLE_discoveries = np.zeros(1) - partial_MLE_risk, partial_ind_risk, partial_randLasso_risk, partial_relrandLasso_risk = [0., 0., 0., 0.] 
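The `*_discoveries` counts in the block above all come from the `BHfilter` helper defined near the top of this file, which is the Benjamini-Hochberg step-up rule applied to a vector of p-values. A quick standalone trace of that rule on made-up p-values (the values are purely illustrative):

    import numpy as np

    # hypothetical p-values; with q = 0.1 the step-up comparisons are
    # 0.1 * [1, 2, 3, 4] / 4 = [0.025, 0.05, 0.075, 0.1]
    pval = np.array([0.001, 0.2, 0.03, 0.5])
    q = 0.1
    pval_sort = np.sort(pval)                          # [0.001, 0.03, 0.2, 0.5]
    comparison = q * np.arange(1, pval.shape[0] + 1.) / pval.shape[0]
    passing = pval_sort < comparison                   # [True, True, False, False]
    thresh = comparison[np.nonzero(passing)[0].max()]  # 0.05
    print(np.nonzero(pval <= thresh)[0])               # [0 2]: two discoveries
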
- - risks = np.vstack((relative_risk(sel_MLE, beta, Sigma), - relative_risk(ind_est, beta, Sigma), - relative_risk(randomized_lasso_est, beta, Sigma), - relative_risk(randomized_rel_lasso_est, beta, Sigma), - relative_risk(rel_LASSO, beta, Sigma), - relative_risk(glm_LASSO, beta, Sigma))) - - partial_risks = np.vstack((partial_MLE_risk, - partial_ind_risk, - partial_randLasso_risk, - partial_relrandLasso_risk, - partial_relLasso_risk, - partial_Lasso_risk)) - - naive_inf = np.vstack((cov_naive, length_naive, 0., nactive_LASSO, bias_naive, selective_naive_power, power_naive, power_naive_BH, fdr_naive_BH, - naive_discoveries.sum())) - Lee_inf = np.vstack((cov_Lee, length_Lee, inf_entries, nactive_LASSO, bias_Lee, selective_Lee_power, power_Lee, power_Lee_BH, fdr_Lee_BH, - Lee_discoveries.sum())) - Liu_inf = np.zeros((10, 1)) - MLE_inf = np.vstack((cov_MLE, length_MLE, 0., nonzero.sum(), bias_MLE, selective_MLE_power, power_MLE, power_MLE_BH, fdr_MLE_BH, - MLE_discoveries.sum())) - nreport = np.vstack((Lee_nreport, 0., MLE_nreport)) - return np.vstack((risks, naive_inf, Lee_inf, Liu_inf, MLE_inf, partial_risks, nreport)) - - -def comparison_cvmetrics_full(n=500, p=100, nval=500, rho=0.35, s=5, beta_type=1, snr=0.20, - randomizer_scale=np.sqrt(0.25), full_dispersion=True, - tuning_nonrand="lambda.min", tuning_rand="lambda.1se"): - - X, y, _, _, Sigma, beta, sigma = sim_xy(n=n, p=p, nval=nval, rho=rho, s=s, beta_type=beta_type, snr=snr) - print("snr", snr) - X -= X.mean(0)[None, :] - X /= (X.std(0)[None, :] * np.sqrt(n / (n - 1.))) - y = y - y.mean() - true_set = np.asarray([u for u in range(p) if beta[u] != 0]) - - if full_dispersion: - dispersion = np.linalg.norm(y - X.dot(np.linalg.pinv(X).dot(y))) ** 2 / (n - p) - sigma_ = np.sqrt(dispersion) - else: - dispersion = None - sigma_ = np.std(y) - print("estimated and true sigma", sigma, sigma_) - - lam_theory = sigma_ * 1. * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) - glm_LASSO_theory, glm_LASSO_1se, glm_LASSO_min, lam_min, lam_1se = glmnet_lasso(X, y, lam_theory/float(n)) - if tuning_nonrand == "lambda.min": - lam_LASSO = lam_min - glm_LASSO = glm_LASSO_min - elif tuning_nonrand == "lambda.1se": - lam_LASSO = lam_1se - glm_LASSO = glm_LASSO_1se - else: - lam_LASSO = lam_theory/float(n) - glm_LASSO = glm_LASSO_theory - - active_LASSO = (glm_LASSO != 0) - nactive_LASSO = active_LASSO.sum() - active_set_LASSO = np.asarray([r for r in range(p) if active_LASSO[r]]) - active_LASSO_bool = np.asarray([(np.in1d(active_set_LASSO[z], true_set).sum() > 0) for z in range(nactive_LASSO)], - np.bool) - - rel_LASSO = np.zeros(p) - Lee_nreport = 0 - bias_Lee = 0. - bias_naive = 0. - - if nactive_LASSO > 0: - rel_LASSO[active_LASSO] = np.linalg.pinv(X[:, active_LASSO]).dot(y) - Lee_target = beta[active_LASSO] - Lee_intervals, Lee_pval = selInf_R(X, y, glm_LASSO, n * lam_LASSO, sigma_, Type=1, alpha=0.1) - - if (Lee_pval.shape[0] == Lee_target.shape[0]): - - cov_Lee, selective_Lee_power = coverage(Lee_intervals, Lee_pval, Lee_target, beta[active_LASSO]) - inf_entries_bool = np.isinf(Lee_intervals[:, 1] - Lee_intervals[:, 0]) - inf_entries = np.mean(inf_entries_bool) - if inf_entries == 1.: - length_Lee = 0. - else: - length_Lee = np.mean((Lee_intervals[:, 1] - Lee_intervals[:, 0])[~inf_entries_bool]) - power_Lee = ((active_LASSO_bool) * ( - np.logical_or((0. < Lee_intervals[:, 0]), (0. 
> Lee_intervals[:, 1])))).sum() / float((beta != 0).sum()) - Lee_discoveries = BHfilter(Lee_pval, q=0.1) - power_Lee_BH = (Lee_discoveries * active_LASSO_bool).sum() / float((beta != 0).sum()) - fdr_Lee_BH = (Lee_discoveries * ~active_LASSO_bool).sum() / float(max(Lee_discoveries.sum(), 1.)) - bias_Lee = np.mean(glm_LASSO[active_LASSO] - Lee_target) - - post_LASSO_OLS = np.linalg.pinv(X[:, active_LASSO]).dot(y) - naive_sd = sigma_ * np.sqrt(np.diag((np.linalg.inv(X[:, active_LASSO].T.dot(X[:, active_LASSO]))))) - naive_intervals = np.vstack([post_LASSO_OLS - 1.65 * naive_sd, - post_LASSO_OLS + 1.65 * naive_sd]).T - naive_pval = 2 * ndist.cdf(np.abs(post_LASSO_OLS) / naive_sd) - cov_naive, selective_naive_power = coverage(naive_intervals, naive_pval, Lee_target, beta[active_LASSO]) - length_naive = np.mean(naive_intervals[:, 1] - naive_intervals[:, 0]) - power_naive = ((active_LASSO_bool) * ( - np.logical_or((0. < naive_intervals[:, 0]), (0. > naive_intervals[:, 1])))).sum() / float( - (beta != 0).sum()) - naive_discoveries = BHfilter(naive_pval, q=0.1) - power_naive_BH = (naive_discoveries * active_LASSO_bool).sum() / float((beta != 0).sum()) - fdr_naive_BH = (naive_discoveries * ~active_LASSO_bool).sum() / float(max(naive_discoveries.sum(), 1.)) - bias_naive = np.mean(rel_LASSO[active_LASSO] - Lee_target) - - partial_Lasso_risk = (glm_LASSO[active_LASSO] - Lee_target).T.dot(glm_LASSO[active_LASSO] - Lee_target) - partial_relLasso_risk = (post_LASSO_OLS - Lee_target).T.dot(post_LASSO_OLS - Lee_target) - else: - Lee_nreport = 1 - cov_Lee, length_Lee, inf_entries, power_Lee, power_Lee_BH, fdr_Lee_BH, selective_Lee_power = [0., 0., 0., 0., 0., 0., 0.] - cov_naive, length_naive, power_naive, power_naive_BH, fdr_naive_BH, selective_naive_power = [0., 0., 0., 0., 0., 0.] - naive_discoveries = np.zeros(1) - Lee_discoveries = np.zeros(1) - partial_Lasso_risk, partial_relLasso_risk = [0., 0.] - - elif nactive_LASSO == 0: - Lee_nreport = 1 - cov_Lee, length_Lee, inf_entries, power_Lee, power_Lee_BH, fdr_Lee_BH, selective_Lee_power = [0., 0., 0., 0., 0., 0., 0.] - cov_naive, length_naive, power_naive, power_naive_BH, fdr_naive_BH, selective_naive_power = [0., 0., 0., 0., 0., 0.] - naive_discoveries = np.zeros(1) - Lee_discoveries = np.zeros(1) - partial_Lasso_risk, partial_relLasso_risk = [0., 0.] - - lasso_Liu = ROSI.gaussian(X, y, n * lam_LASSO) - Lasso_soln_Liu = lasso_Liu.fit() - active_set_Liu = np.nonzero(Lasso_soln_Liu != 0)[0] - nactive_Liu = active_set_Liu.shape[0] - active_Liu_bool = np.asarray([(np.in1d(active_set_Liu[a], true_set).sum() > 0) for a in range(nactive_Liu)], np.bool) - Liu_nreport = 0 - - if nactive_Liu > 0: - Liu_target = beta[Lasso_soln_Liu != 0] - df = lasso_Liu.summary(level=0.90, compute_intervals=True, dispersion=dispersion) - Liu_lower, Liu_upper, Liu_pval = np.asarray(df['lower_confidence']), \ - np.asarray(df['upper_confidence']), \ - np.asarray(df['pval']) - Liu_intervals = np.vstack((Liu_lower, Liu_upper)).T - cov_Liu, selective_Liu_power = coverage(Liu_intervals, Liu_pval, Liu_target, beta[Lasso_soln_Liu != 0]) - length_Liu = np.mean(Liu_intervals[:, 1] - Liu_intervals[:, 0]) - power_Liu = ((active_Liu_bool) * (np.logical_or((0. < Liu_intervals[:, 0]), - (0. 
> Liu_intervals[:, 1])))).sum() / float((beta != 0).sum()) - Liu_discoveries = BHfilter(Liu_pval, q=0.1) - power_Liu_BH = (Liu_discoveries * active_Liu_bool).sum() / float((beta != 0).sum()) - fdr_Liu_BH = (Liu_discoveries * ~active_Liu_bool).sum() / float(max(Liu_discoveries.sum(), 1.)) - - else: - Liu_nreport = 1 - cov_Liu, length_Liu, power_Liu, power_Liu_BH, fdr_Liu_BH, selective_Liu_power = [0., 0., 0., 0., 0., 0.] - Liu_discoveries = np.zeros(1) - - if tuning_rand == "lambda.min": - randomized_lasso = lasso.gaussian(X, - y, - feature_weights= n * lam_min * np.ones(p), - randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_) - elif tuning_rand == "lambda.1se": - randomized_lasso = lasso.gaussian(X, - y, - feature_weights= n * lam_1se * np.ones(p), - randomizer_scale= np.sqrt(n) * randomizer_scale * sigma_) - else: - randomized_lasso = lasso.gaussian(X, - y, - feature_weights= lam_theory * np.ones(p), - randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_) - signs = randomized_lasso.fit() - nonzero = signs != 0 - active_set_rand = np.asarray([t for t in range(p) if nonzero[t]]) - active_rand_bool = np.asarray([(np.in1d(active_set_rand[x], true_set).sum() > 0) for x in range(nonzero.sum())], np.bool) - sel_MLE = np.zeros(p) - ind_est = np.zeros(p) - randomized_lasso_est = np.zeros(p) - randomized_rel_lasso_est = np.zeros(p) - MLE_nreport = 0 - - if nonzero.sum() > 0: - target_randomized = beta[nonzero] - (observed_target, - cov_target, - cov_target_score, - alternatives) = full_targets(randomized_lasso.loglike, - randomized_lasso._W, - nonzero, - dispersion=dispersion) - MLE_estimate, _, _, MLE_pval, MLE_intervals, ind_unbiased_estimator = randomized_lasso.selective_MLE(observed_target, - cov_target, - cov_target_score, - alternatives) - sel_MLE[nonzero] = MLE_estimate - ind_est[nonzero] = ind_unbiased_estimator - randomized_lasso_est = randomized_lasso.initial_soln - randomized_rel_lasso_est = randomized_lasso._beta_full - - cov_MLE, selective_MLE_power = coverage(MLE_intervals, MLE_pval, target_randomized, beta[nonzero]) - length_MLE = np.mean(MLE_intervals[:, 1] - MLE_intervals[:, 0]) - power_MLE = ((active_rand_bool) * (np.logical_or((0. < MLE_intervals[:, 0]), (0. > MLE_intervals[:, 1])))).sum() / float((beta != 0).sum()) - MLE_discoveries = BHfilter(MLE_pval, q=0.1) - power_MLE_BH = (MLE_discoveries * active_rand_bool).sum() / float((beta != 0).sum()) - fdr_MLE_BH = (MLE_discoveries * ~active_rand_bool).sum() / float(max(MLE_discoveries.sum(), 1.)) - bias_MLE = np.mean(MLE_estimate - target_randomized) - - partial_MLE_risk = (MLE_estimate - target_randomized).T.dot(MLE_estimate - target_randomized) - partial_ind_risk = (ind_unbiased_estimator - target_randomized).T.dot(ind_unbiased_estimator - target_randomized) - partial_randLasso_risk = (randomized_lasso_est[nonzero] - target_randomized).T.dot(randomized_lasso_est[nonzero] - target_randomized) - partial_relrandLasso_risk = (randomized_rel_lasso_est[nonzero] - target_randomized).T.dot(randomized_rel_lasso_est[nonzero] - target_randomized) - else: - MLE_nreport = 1 - cov_MLE, length_MLE, power_MLE, power_MLE_BH, fdr_MLE_BH, bias_MLE, selective_MLE_power = [0., 0., 0., 0., 0., 0., 0.] - MLE_discoveries = np.zeros(1) - partial_MLE_risk, partial_ind_risk, partial_randLasso_risk, partial_relrandLasso_risk = [0., 0., 0., 0.] 
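The first block of rows assembled just below uses the `relative_risk` metric defined earlier in this file: (est - truth)' Sigma (est - truth), scaled by truth' Sigma truth when the truth is nonzero. A small worked check with hypothetical numbers:

    import numpy as np

    # hypothetical estimate/truth pair under an identity covariance
    Sigma = np.identity(3)
    truth = np.array([1., 0., 0.])
    est = np.array([0.5, 0., 0.])
    num = (est - truth).T.dot(Sigma).dot(est - truth)  # 0.25
    den = truth.T.dot(Sigma).dot(truth)                # 1.0
    print(num / den)                                   # relative risk: 0.25
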
- - risks = np.vstack((relative_risk(sel_MLE, beta, Sigma), - relative_risk(ind_est, beta, Sigma), - relative_risk(randomized_lasso_est, beta, Sigma), - relative_risk(randomized_rel_lasso_est, beta, Sigma), - relative_risk(rel_LASSO, beta, Sigma), - relative_risk(glm_LASSO, beta, Sigma))) - - partial_risks = np.vstack((partial_MLE_risk, - partial_ind_risk, - partial_randLasso_risk, - partial_relrandLasso_risk, - partial_relLasso_risk, - partial_Lasso_risk)) - - naive_inf = np.vstack((cov_naive, length_naive, 0., nactive_LASSO, bias_naive, selective_naive_power, - power_naive, power_naive_BH, fdr_naive_BH, naive_discoveries.sum())) - Lee_inf = np.vstack((cov_Lee, length_Lee, inf_entries, nactive_LASSO, bias_Lee, selective_Lee_power, - power_Lee, power_Lee_BH, fdr_Lee_BH, Lee_discoveries.sum())) - Liu_inf = np.vstack((cov_Liu, length_Liu, 0., nactive_Liu, bias_Lee, selective_Liu_power, - power_Liu, power_Liu_BH, fdr_Liu_BH, Liu_discoveries.sum())) - MLE_inf = np.vstack((cov_MLE, length_MLE, 0., nonzero.sum(), bias_MLE, selective_MLE_power, - power_MLE, power_MLE_BH, fdr_MLE_BH, MLE_discoveries.sum())) - nreport = np.vstack((Lee_nreport, Liu_nreport, MLE_nreport)) - return np.vstack((risks, naive_inf, Lee_inf, Liu_inf, MLE_inf, partial_risks, nreport)) - -def comparison_cvmetrics_debiased(n=100, p=150, nval=500, rho=0.35, s=5, beta_type=1, snr=0.20, - randomizer_scale=np.sqrt(0.25), full_dispersion=False, - tuning_nonrand="lambda.min", tuning_rand="lambda.1se"): - - X, y, _, _, Sigma, beta, sigma = sim_xy(n=n, p=p, nval=nval, rho=rho, s=s, beta_type=beta_type, snr=snr) - print("snr", snr) - X -= X.mean(0)[None, :] - X /= (X.std(0)[None, :] * np.sqrt(n / (n - 1.))) - y = y - y.mean() - true_set = np.asarray([u for u in range(p) if beta[u] != 0]) - - if full_dispersion: - dispersion = np.linalg.norm(y - X.dot(np.linalg.pinv(X).dot(y))) ** 2 / (n - p) - sigma_ = np.sqrt(dispersion) - else: - dispersion = None - _sigma_ = np.std(y) - - lam_theory = _sigma_ * 1. * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) - glm_LASSO_theory, glm_LASSO_1se, glm_LASSO_min, lam_min, lam_1se = glmnet_lasso(X, y, lam_theory / float(n)) - - if full_dispersion is False: - dispersion = None - active_min = (glm_LASSO_min != 0) - if active_min.sum() > 0: - sigma_ = np.sqrt(np.linalg.norm(y - X[:, active_min].dot(np.linalg.pinv(X[:, active_min]).dot(y))) ** 2 - / (n - active_min.sum())) - else: - sigma_ = _sigma_ - print("estimated and true sigma", sigma, _sigma_, sigma_) - - if tuning_nonrand == "lambda.min": - lam_LASSO = lam_min - glm_LASSO = glm_LASSO_min - elif tuning_nonrand == "lambda.1se": - lam_LASSO = lam_1se - glm_LASSO = glm_LASSO_1se - else: - lam_LASSO = lam_theory / float(n) - glm_LASSO = glm_LASSO_theory - - active_LASSO = (glm_LASSO != 0) - nactive_LASSO = active_LASSO.sum() - active_set_LASSO = np.asarray([r for r in range(p) if active_LASSO[r]]) - active_LASSO_bool = np.asarray([(np.in1d(active_set_LASSO[z], true_set).sum() > 0) for z in range(nactive_LASSO)], - np.bool) - - rel_LASSO = np.zeros(p) - Lee_nreport = 0. - bias_naive = 0. 
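As in the other comparison functions, the hard-coded 1.65 in the naive intervals below is (approximately) the 0.95 Gaussian quantile, so these are nominal 90% two-sided intervals, in line with the alpha = 0.1 used elsewhere in these comparisons. A quick check:

    from scipy.stats import norm as ndist

    print(ndist.ppf(0.95))   # ~1.6449, the multiplier behind the "1.65" intervals
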
- - if nactive_LASSO > 0: - rel_LASSO[active_LASSO] = np.linalg.pinv(X[:, active_LASSO]).dot(y) - Lee_target = beta[active_LASSO] - post_LASSO_OLS = np.linalg.pinv(X[:, active_LASSO]).dot(y) - naive_sd = sigma_ * np.sqrt(np.diag((np.linalg.inv(X[:, active_LASSO].T.dot(X[:, active_LASSO]))))) - naive_intervals = np.vstack([post_LASSO_OLS - 1.65 * naive_sd, - post_LASSO_OLS + 1.65 * naive_sd]).T - naive_pval = 2 * ndist.cdf(np.abs(post_LASSO_OLS) / naive_sd) - cov_naive, selective_naive_power = coverage(naive_intervals, naive_pval, Lee_target, beta[active_LASSO]) - length_naive = np.mean(naive_intervals[:, 1] - naive_intervals[:, 0]) - power_naive = ((active_LASSO_bool) * ( - np.logical_or((0. < naive_intervals[:, 0]), (0. > naive_intervals[:, 1])))).sum() / float( - (beta != 0).sum()) - naive_discoveries = BHfilter(naive_pval, q=0.1) - power_naive_BH = (naive_discoveries * active_LASSO_bool).sum() / float((beta != 0).sum()) - fdr_naive_BH = (naive_discoveries * ~active_LASSO_bool).sum() / float(max(naive_discoveries.sum(), 1.)) - bias_naive = np.mean(rel_LASSO[active_LASSO] - Lee_target) - - partial_Lasso_risk = (glm_LASSO[active_LASSO] - Lee_target).T.dot(glm_LASSO[active_LASSO] - Lee_target) - partial_relLasso_risk = (post_LASSO_OLS - Lee_target).T.dot(post_LASSO_OLS - Lee_target) - - elif nactive_LASSO == 0: - Lee_nreport += 1 - cov_naive, length_naive, power_naive, power_naive_BH, fdr_naive_BH, selective_naive_power = [0., 0., 0., 0., 0., 0.] - naive_discoveries = np.zeros(1) - partial_Lasso_risk, partial_relLasso_risk = [0., 0.] - - if tuning_rand == "lambda.min": - randomized_lasso = lasso.gaussian(X, - y, - feature_weights=n * lam_min * np.ones(p), - randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_) - elif tuning_rand == "lambda.1se": - randomized_lasso = lasso.gaussian(X, - y, - feature_weights=n * lam_1se * np.ones(p), - randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_) - else: - randomized_lasso = lasso.gaussian(X, - y, - feature_weights=lam_theory * np.ones(p), - randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_) - signs = randomized_lasso.fit() - nonzero = signs != 0 - active_set_rand = np.asarray([t for t in range(p) if nonzero[t]]) - active_rand_bool = np.asarray([(np.in1d(active_set_rand[x], true_set).sum() > 0) for x in range(nonzero.sum())], - np.bool) - sel_MLE = np.zeros(p) - ind_est = np.zeros(p) - randomized_lasso_est = np.zeros(p) - randomized_rel_lasso_est = np.zeros(p) - MLE_nreport = 0 - - if nonzero.sum() > 0: - target_randomized = beta[nonzero] - (observed_target, - cov_target, - cov_target_score, - alternatives) = debiased_targets(randomized_lasso.loglike, - randomized_lasso._W, - nonzero, - penalty=randomized_lasso.penalty, - dispersion=dispersion) - MLE_estimate, _, _, MLE_pval, MLE_intervals, ind_unbiased_estimator = randomized_lasso.selective_MLE(observed_target, - cov_target, - cov_target_score, - alternatives) - sel_MLE[nonzero] = MLE_estimate - ind_est[nonzero] = ind_unbiased_estimator - randomized_lasso_est = randomized_lasso.initial_soln - randomized_rel_lasso_est = randomized_lasso._beta_full - - cov_MLE, selective_MLE_power = coverage(MLE_intervals, MLE_pval, target_randomized, beta[nonzero]) - length_MLE = np.mean(MLE_intervals[:, 1] - MLE_intervals[:, 0]) - power_MLE = ((active_rand_bool) * ( - np.logical_or((0. < MLE_intervals[:, 0]), (0. 
> MLE_intervals[:, 1])))).sum() / float((beta != 0).sum()) - MLE_discoveries = BHfilter(MLE_pval, q=0.1) - power_MLE_BH = (MLE_discoveries * active_rand_bool).sum() / float((beta != 0).sum()) - fdr_MLE_BH = (MLE_discoveries * ~active_rand_bool).sum() / float(max(MLE_discoveries.sum(), 1.)) - bias_MLE = np.mean(MLE_estimate - target_randomized) - - partial_MLE_risk = (MLE_estimate - target_randomized).T.dot(MLE_estimate - target_randomized) - partial_ind_risk = (ind_unbiased_estimator - target_randomized).T.dot( - ind_unbiased_estimator - target_randomized) - partial_randLasso_risk = (randomized_lasso_est[nonzero] - target_randomized).T.dot( - randomized_lasso_est[nonzero] - target_randomized) - partial_relrandLasso_risk = (randomized_rel_lasso_est[nonzero] - target_randomized).T.dot( - randomized_rel_lasso_est[nonzero] - target_randomized) - else: - MLE_nreport = 1 - cov_MLE, length_MLE, power_MLE, power_MLE_BH, fdr_MLE_BH, bias_MLE, selective_MLE_power = [0., 0., 0., 0., 0., - 0., 0.] - MLE_discoveries = np.zeros(1) - partial_MLE_risk, partial_ind_risk, partial_randLasso_risk, partial_relrandLasso_risk = [0., 0., 0., 0.] - - risks = np.vstack((relative_risk(sel_MLE, beta, Sigma), - relative_risk(ind_est, beta, Sigma), - relative_risk(randomized_lasso_est, beta, Sigma), - relative_risk(randomized_rel_lasso_est, beta, Sigma), - relative_risk(rel_LASSO, beta, Sigma), - relative_risk(glm_LASSO, beta, Sigma))) - - partial_risks = np.vstack((partial_MLE_risk, - partial_ind_risk, - partial_randLasso_risk, - partial_relrandLasso_risk, - partial_relLasso_risk, - partial_Lasso_risk)) - - naive_inf = np.vstack((cov_naive, length_naive, 0., nactive_LASSO, bias_naive, selective_naive_power, - power_naive, power_naive_BH, fdr_naive_BH, naive_discoveries.sum())) - Lee_inf = np.zeros((10,1)) - Liu_inf = np.zeros((10,1)) - MLE_inf = np.vstack((cov_MLE, length_MLE, 0., nonzero.sum(), bias_MLE, selective_MLE_power, - power_MLE, power_MLE_BH, fdr_MLE_BH, MLE_discoveries.sum())) - nreport = np.vstack((Lee_nreport, 0., MLE_nreport)) - return np.vstack((risks, naive_inf, Lee_inf, Liu_inf, MLE_inf, partial_risks, nreport)) - - -def compare_sampler_MLE(n=500, p=100, nval=500, rho=0.35, s=5, beta_type=1, snr=0.20, target= "selected", - randomizer_scale=np.sqrt(0.50), full_dispersion=True, tuning_rand="lambda.1se"): - - X, y, _, _, Sigma, beta, sigma = sim_xy(n=n, p=p, nval=nval, rho=rho, s=s, beta_type=beta_type, snr=snr) - print("snr", snr) - X -= X.mean(0)[None, :] - X /= (X.std(0)[None, :] * np.sqrt(n / (n - 1.))) - y = y - y.mean() - true_set = np.asarray([u for u in range(p) if beta[u] != 0]) - - if full_dispersion: - dispersion = np.linalg.norm(y - X.dot(np.linalg.pinv(X).dot(y))) ** 2 / (n - p) - sigma_ = np.sqrt(dispersion) - else: - dispersion = None - sigma_ = np.std(y) - print("estimated and true sigma", sigma, sigma_) - - lam_theory = sigma_ * 1. 
* np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) - _, _, _, lam_min, lam_1se = glmnet_lasso(X, y, lam_theory / float(n)) - - if tuning_rand == "lambda.min": - randomized_lasso = lasso.gaussian(X, - y, - feature_weights=n * lam_min * np.ones(p), - randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_) - elif tuning_rand == "lambda.1se": - randomized_lasso = lasso.gaussian(X, - y, - feature_weights=n * lam_1se * np.ones(p), - randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_) - elif tuning_rand == "lambda.theory": - randomized_lasso = lasso.gaussian(X, - y, - feature_weights=lam_theory * np.ones(p), - randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_) - - else: - raise ValueError('lambda choice not specified correctly') - - signs = randomized_lasso.fit() - nonzero = signs != 0 - sys.stderr.write("active variables selected by randomized LASSO " + str(nonzero.sum()) + "\n" + "\n") - active_set_rand = np.asarray([t for t in range(p) if nonzero[t]]) - active_rand_bool = np.asarray([(np.in1d(active_set_rand[x], true_set).sum() > 0) for x in range(nonzero.sum())], - np.bool) - nreport = 0. - - if nonzero.sum() > 0: - if target == "full": - target_randomized = beta[nonzero] - (observed_target, - cov_target, - cov_target_score, - alternatives) = full_targets(randomized_lasso.loglike, - randomized_lasso._W, - nonzero, - dispersion=dispersion) - elif target == "selected": - target_randomized = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) - (observed_target, - cov_target, - cov_target_score, - alternatives) = selected_targets(randomized_lasso.loglike, - randomized_lasso._W, - nonzero, - dispersion=dispersion) - else: - raise ValueError('not a valid specification of target') - toc = time.time() - MLE_estimate, _, _, MLE_pval, MLE_intervals, ind_unbiased_estimator = randomized_lasso.selective_MLE(observed_target, - cov_target, - cov_target_score, - alternatives) - tic = time.time() - time_MLE = tic - toc - - cov_MLE, selective_MLE_power = coverage(MLE_intervals, MLE_pval, target_randomized, beta[nonzero]) - length_MLE = np.mean(MLE_intervals[:, 1] - MLE_intervals[:, 0]) - power_MLE = ((active_rand_bool) * ( - np.logical_or((0. < MLE_intervals[:, 0]), (0. > MLE_intervals[:, 1])))).sum() / float((beta != 0).sum()) - MLE_discoveries = BHfilter(MLE_pval, q=0.1) - power_MLE_BH = (MLE_discoveries * active_rand_bool).sum() / float((beta != 0).sum()) - fdr_MLE_BH = (MLE_discoveries * ~active_rand_bool).sum() / float(max(MLE_discoveries.sum(), 1.)) - bias_MLE = np.mean(MLE_estimate - target_randomized) - - toc = time.time() - _, sampler_pval, sampler_intervals = randomized_lasso.summary(observed_target, - cov_target, - cov_target_score, - alternatives, - level=0.9, compute_intervals=True, ndraw=100000) - tic = time.time() - time_sampler = tic - toc - - cov_sampler, selective_sampler_power = coverage(sampler_intervals, sampler_pval, target_randomized, beta[nonzero]) - length_sampler = np.mean(sampler_intervals[:, 1] - sampler_intervals[:, 0]) - power_sampler = ((active_rand_bool) * (np.logical_or((0. < sampler_intervals[:, 0]), - (0. 
> sampler_intervals[:, 1])))).sum() / float((beta != 0).sum()) - sampler_discoveries = BHfilter(sampler_pval, q=0.1) - power_sampler_BH = (sampler_discoveries * active_rand_bool).sum() / float((beta != 0).sum()) - fdr_sampler_BH = (sampler_discoveries * ~active_rand_bool).sum() / float(max(sampler_discoveries.sum(), 1.)) - bias_randLASSO = np.mean(randomized_lasso.initial_soln[nonzero] - target_randomized) - - else: - nreport += 1 - cov_MLE, length_MLE, power_MLE, power_MLE_BH, fdr_MLE_BH, bias_MLE, selective_MLE_power, time_MLE = [0., 0., 0., 0., 0., 0., 0., 0.] - cov_sampler, length_sampler, power_sampler, power_sampler_BH, fdr_sampler_BH, bias_randLASSO, selective_sampler_power, time_sampler = [0., 0., 0., 0., 0., 0., 0., 0.] - MLE_discoveries = np.zeros(1) - sampler_discoveries = np.zeros(1) - - MLE_inf = np.vstack((cov_MLE, length_MLE, 0., nonzero.sum(), bias_MLE, selective_MLE_power, time_MLE, - power_MLE, power_MLE_BH, fdr_MLE_BH, MLE_discoveries.sum())) - - sampler_inf = np.vstack((cov_sampler, length_sampler, 0., nonzero.sum(), bias_randLASSO, selective_sampler_power, time_sampler, - power_sampler, power_sampler_BH, fdr_sampler_BH, sampler_discoveries.sum())) - - return np.vstack((MLE_inf, sampler_inf, nreport)) - - - - - - - - - diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index d8be08029..1acb23281 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -49,8 +49,8 @@ def __init__(self, logdens_linear = query.sampler.logdens_transform[0] result, self.inverse_info, log_ref = query.selective_MLE(observed_target, - cov_target, - cov_target_score) + cov_target, + cov_target_score) ### Note for an informative prior we might want to change this... @@ -97,7 +97,7 @@ def log_posterior(self, sigmasq = sigma**2 mean_marginal = self.linear_coef.dot(target_parameter) + self.offset_coef - prec_marginal = np.linalg.inv(self.cov_marginal) + prec_marginal = self.prec_marginal conjugate_marginal = prec_marginal.dot(mean_marginal) useC = True @@ -115,8 +115,8 @@ def log_posterior(self, log_normalizer = -val - mean_marginal.T.dot(prec_marginal).dot(mean_marginal)/2. - log_lik = -((self.observed_target - target_parameter).T.dot(self.prec_target).dot(self.observed_target - target_parameter)) / 2.\ - - log_normalizer + log_lik = -(((self.observed_target - target_parameter).T.dot(self.prec_target).dot(self.observed_target - target_parameter)) / 2. 
+ - log_normalizer) grad_lik = (self.prec_target.dot(self.observed_target) - self.prec_target.dot(target_parameter) \ @@ -124,9 +124,8 @@ def log_posterior(self, grad_prior, log_prior = self.prior(target_parameter) - return (self.dispersion * grad_lik/sigmasq + grad_prior, - self.dispersion * log_lik/sigmasq + log_prior - - (self.dispersion* self.log_ref/sigmasq)) + return (self.dispersion * (log_lik - self.log_ref) / sigmasq + log_prior, + self.dispersion * grad_lik/sigmasq + grad_prior) ### Private method @@ -140,21 +139,22 @@ def _set_marginal_parameters(self): target_linear = -self.logdens_linear.dot(self.cov_target_score.T.dot(self.prec_target)) implied_precision = np.zeros((self.ntarget + self.nopt, self.ntarget + self.nopt)) - implied_precision[:self.ntarget, :self.ntarget] = (self.prec_target + + implied_precision[:self.ntarget][:,:self.ntarget] = (self.prec_target + target_linear.T.dot(self.cond_precision.dot(target_linear))) - implied_precision[:self.ntarget, self.ntarget:] = -target_linear.T.dot(self.cond_precision) - implied_precision[self.ntarget:, :self.ntarget] = (-target_linear.T.dot(self.cond_precision)).T - implied_precision[self.ntarget:, self.ntarget:] = self.cond_precision + implied_precision[:self.ntarget][:,self.ntarget:] = -target_linear.T.dot(self.cond_precision) + implied_precision[self.ntarget:][:,:self.ntarget] = (-target_linear.T.dot(self.cond_precision)).T + implied_precision[self.ntarget:][:,self.ntarget:] = self.cond_precision implied_cov = np.linalg.inv(implied_precision) - self.linear_coef = implied_cov[self.ntarget:, :self.ntarget].dot(self.prec_target) + self.linear_coef = implied_cov[self.ntarget:][:,:self.ntarget].dot(self.prec_target) target_offset = self.cond_mean - target_linear.dot(self.observed_target) - M = implied_cov[self.ntarget:, self.ntarget:].dot(self.cond_precision.dot(target_offset)) + M = implied_cov[self.ntarget:][:,self.ntarget:].dot(self.cond_precision.dot(target_offset)) N = -target_linear.T.dot(self.cond_precision).dot(target_offset) - self.offset_coef = implied_cov[self.ntarget:, :self.ntarget].dot(N) + M + self.offset_coef = implied_cov[self.ntarget:][:,:self.ntarget].dot(N) + M - self.cov_marginal = implied_cov[self.ntarget:, self.ntarget:] + self.cov_marginal = implied_cov[self.ntarget:][:,self.ntarget:] + self.prec_marginal = np.linalg.inv(self.cov_marginal) ### sampling methods @@ -214,7 +214,7 @@ def gibbs_sampler(selective_posterior, scale_update_sq = invgamma.rvs(a=(0.1 + selective_posterior.ntarget + selective_posterior.ntarget/2), - scale=0.1-((scale_update**2) * sampler.grad_posterior[1]), + scale=0.1-((scale_update**2)*sampler.posterior_[0]), size=1) scale_samples[i] = np.sqrt(scale_update_sq) sampler.scaling = np.sqrt(scale_update_sq) @@ -252,11 +252,11 @@ def next(self): def __next__(self): while True: - self.grad_posterior = self.gradient_map(self.state, self.scaling) - candidate = (self.state + self.stepsize * self.proposal_scale.dot(self.grad_posterior[0]) + self.posterior_ = self.gradient_map(self.state, self.scaling) + candidate = (self.state + self.stepsize * self.proposal_scale.dot(self.posterior_[1]) + np.sqrt(2.)* (self.proposal_sqrt.dot(self._noise.rvs(self._shape))) * self._sqrt_step) - if not np.all(np.isfinite(self.gradient_map(candidate)[0])): + if not np.all(np.isfinite(self.gradient_map(candidate, self.scaling)[1])): self.stepsize *= 0.5 self._sqrt_step = np.sqrt(self.stepsize) else: diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index 1185adebb..40f506773 100644 --- 
a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -128,11 +128,18 @@ def summary(self, Parameters ---------- - target : one of ['selected', 'full'] + observed_target : ndarray + Observed estimate of target. + + target_cov : ndarray + Estimated covaraince of target. - features : np.bool - Binary encoding of which features to use in final - model and targets. + target_score_cov : ndarray + Estimated covariance of target and score of randomized query. + + alternatives : [str], optional + Sequence of strings describing the alternatives, + should be values of ['twosided', 'less', 'greater'] parameter : np.array Hypothesized value for parameter -- defaults to 0. @@ -288,10 +295,12 @@ def posterior(self, print('Using dispersion parameter 1...') if prior is None: + Di = 1. / (200 * np.diag(target_cov)) def prior(target_parameter): - grad_prior = -target_parameter / 100 - log_prior = -np.linalg.norm(target_parameter)**2 /(2. * 100) - return grad_prior, log_prior + grad_prior = -target_parameter * Di + log_prior = -0.5 * np.sum(target_parameter**2 * Di) + stop + return log_prior, grad_prior return posterior(self, observed_target, @@ -305,24 +314,28 @@ def approximate_grid_inference(self, observed_target, target_cov, target_score_cov, - grid = None, - dispersion=None, + grid=None, + alternatives=None, solve_args={'tol': 1.e-12}): - if dispersion is None: - dispersion = 1 - print('Using dispersion parameter 1...') - - if grid is None: - grid = np.linspace(- 20., 20., num=401) - - return approximate_grid_inference(self, - observed_target, - target_cov, - target_score_cov, - grid, - dispersion, - solve_args=solve_args) + # result, inverse_info = self.selective_MLE(observed_target, + # target_cov, + # target_score_cov)[:2] + + # if dispersion is None: + # dispersion = 1 + # print('Using dispersion parameter 1...') + + G = approximate_grid_inference(self, + observed_target, + target_cov, + target_score_cov, + #inverse_info, + #result['MLE'], + #dispersion, + grid=grid, + solve_args=solve_args) + return G.summary(alternatives=alternatives) class gaussian_query(query): From 56f9ba6e3562eefc9cb38201512fff2d00b3ef5a Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 8 Jul 2020 12:16:42 -0700 Subject: [PATCH 059/187] BF: changing prior as well --- selectinf/randomized/query.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index 40f506773..0b24ecc45 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -280,7 +280,7 @@ def posterior(self, prior : callable A callable object that takes a single argument `parameter` of the same shape as `observed_target` - and returns (gradient of log prior, value of log prior) + and returns (value of log prior, gradient of log prior) dispersion : float, optional Dispersion parameter for log-likelihood. 
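The changes just above alter the contract of the `prior` callable passed to `posterior`: it now returns the value of the log prior first and its gradient second (and the default prior is updated to match). A minimal sketch of a user-supplied mean-zero Gaussian prior in the new convention; the variance of 50 is purely an illustrative choice, and the commented call assumes a fitted randomized LASSO `conv` as in the tests:

    import numpy as np

    prior_var = 50.   # hypothetical prior variance

    def gaussian_prior(target_parameter):
        # log density of N(0, prior_var * I), up to a constant, and its gradient
        log_prior = -np.linalg.norm(target_parameter) ** 2 / (2. * prior_var)
        grad_prior = -target_parameter / prior_var
        return log_prior, grad_prior   # (value, gradient) in the new ordering

    # posterior_inf = conv.posterior(observed_target, cov_target,
    #                                cov_target_score, prior=gaussian_prior)
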
@@ -299,7 +299,6 @@ def posterior(self, def prior(target_parameter): grad_prior = -target_parameter * Di log_prior = -0.5 * np.sum(target_parameter**2 * Di) - stop return log_prior, grad_prior return posterior(self, From ae211d7a91591e2eebd047a0b822885c8c2cf3eb Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 8 Jul 2020 12:22:53 -0700 Subject: [PATCH 060/187] BF: two more priors needed changing --- selectinf/randomized/tests/test_posterior.py | 134 +++++++------------ 1 file changed, 49 insertions(+), 85 deletions(-) diff --git a/selectinf/randomized/tests/test_posterior.py b/selectinf/randomized/tests/test_posterior.py index bba39845a..6e79e44e6 100644 --- a/selectinf/randomized/tests/test_posterior.py +++ b/selectinf/randomized/tests/test_posterior.py @@ -3,7 +3,7 @@ import statsmodels.api as sm from scipy.stats import norm as ndist -from ...tests.instance import gaussian_instance +from ...tests.instance import gaussian_instance, HIV_NRTI from ..lasso import lasso, selected_targets, split_lasso from ..posterior_inference import (langevin_sampler, gibbs_sampler) @@ -153,7 +153,7 @@ def test_flexible_prior1(nsample=100, nburnin=50): def prior(target_parameter): grad_prior = -target_parameter / 100 log_prior = -np.linalg.norm(target_parameter)**2 /(2. * 100) - return grad_prior, log_prior + return log_prior, grad_prior seed_state = np.random.get_state() np.random.set_state(seed_state) @@ -212,7 +212,7 @@ def test_flexible_prior2(nsample=1000, nburnin=50): def prior(target_parameter): grad_prior = -target_parameter / prior_var log_prior = -np.linalg.norm(target_parameter)**2 /(2. * prior_var) - return grad_prior, log_prior + return log_prior, grad_prior posterior_inf = L.posterior(observed_target, cov_target, @@ -227,50 +227,22 @@ def prior(target_parameter): nburnin=nburnin) return samples - -def test_hiv_data(nsample=1000, - nburnin=100, - alpha =0.10, +def test_hiv_data(nsample=10000, + nburnin=500, + level=0.90, split_proportion=0.50, seedn = 1): np.random.seed(seedn) - level = 1 - alpha / 2. 
- Z_quantile = ndist.ppf(level) - - NRTI = pd.read_csv("http://hivdb.stanford.edu/pages/published_analysis/genophenoPNAS2006/DATA/NRTI_DATA.txt", - na_values="NA", sep='\t') - - NRTI_specific = [] - NRTI_muts = [] - - for i in range(1, 241): - d = NRTI['P%d' % i] - for mut in np.unique(d): - if mut not in ['-', '.'] and len(mut) == 1: - test = np.equal(d, mut) - if test.sum() > 10: - NRTI_specific.append(np.array(np.equal(d, mut))) - NRTI_muts.append("P%d%s" % (i, mut)) - - NRTI_specific = NRTI.from_records(np.array(NRTI_specific).T, columns=NRTI_muts) - - X_NRTI = np.array(NRTI_specific, np.float) - Y = NRTI['3TC'] # shorthand - keep = ~np.isnan(Y).astype(np.bool) - X_NRTI = X_NRTI[np.nonzero(keep)] - - Y = Y[keep] - Y = np.array(np.log(Y), np.float) - Y -= Y.mean() - X_NRTI -= X_NRTI.mean(0)[None, :] - X_NRTI /= X_NRTI.std(0)[None, :] - X = X_NRTI + alpha = (1 - level) / 2 + Z_quantile = ndist.ppf(1 - alpha) + + X, Y, _ = HIV_NRTI(standardize=True) + Y *= 15 n, p = X.shape X /= np.sqrt(n) - - + ols_fit = sm.OLS(Y, X).fit() _sigma = np.linalg.norm(ols_fit.resid) / np.sqrt(n - p - 1) @@ -296,6 +268,15 @@ def test_hiv_data(nsample=1000, nonzero, dispersion=dispersion) + mle, inverse_info = conv.selective_MLE(observed_target, + cov_target, + cov_target_score, + level=level, + solve_args={'tol':1.e-12})[:2] + + # approx_inf = conv.approximate_grid_inference(observed_target, + # cov_target, + # cov_target_score) posterior_inf = conv.posterior(observed_target, cov_target, @@ -307,75 +288,58 @@ def test_hiv_data(nsample=1000, nburnin=nburnin, step=1.) - lci_langevin = np.percentile(samples_langevin, int((1-level)*100), axis=0) - uci_langevin = np.percentile(samples_langevin, int((level)*100), axis=0) + lower_langevin = np.percentile(samples_langevin, int(alpha*100), axis=0) + upper_langevin = np.percentile(samples_langevin, int((1-alpha)*100), axis=0) - samples_gibbs = gibbs_sampler(posterior_inf, - nsample=nsample, - nburnin=nburnin)[0] + samples_gibbs, scale_gibbs = gibbs_sampler(posterior_inf, + nsample=nsample, + nburnin=nburnin) - lci_gibbs = np.percentile(samples_gibbs, int((1 - level) * 100), axis=0) - uci_gibbs = np.percentile(samples_gibbs, int((level) * 100), axis=0) + lower_gibbs = np.percentile(samples_gibbs, int(alpha* 100), axis=0) + upper_gibbs = np.percentile(samples_gibbs, int((1-alpha)*100), axis=0) naive_est = np.linalg.pinv(X[:, nonzero]).dot(Y) - naive_cov = _sigma * np.linalg.inv(X[:, nonzero].T.dot(X[:, nonzero])) + naive_cov = dispersion * np.linalg.inv(X[:, nonzero].T.dot(X[:, nonzero])) naive_intervals = np.vstack([naive_est - Z_quantile * np.sqrt(np.diag(naive_cov)), naive_est + Z_quantile * np.sqrt(np.diag(naive_cov))]).T X_split = X[~conv._selection_idx, :] Y_split = Y[~conv._selection_idx] split_est = np.linalg.pinv(X_split[:, nonzero]).dot(Y_split) - split_cov = _sigma * np.linalg.inv(X_split[:, nonzero].T.dot(X_split[:, nonzero])) + split_cov = dispersion * np.linalg.inv(X_split[:, nonzero].T.dot(X_split[:, nonzero])) split_intervals = np.vstack([split_est - Z_quantile * np.sqrt(np.diag(split_cov)), split_est + Z_quantile * np.sqrt(np.diag(split_cov))]).T - print("lengths: adjusted intervals Langevin, Gibbs, MLE ", np.mean(uci_langevin - lci_langevin), np.mean(uci_gibbs - lci_gibbs), - np.mean((2* Z_quantile )* np.sqrt(np.diag(posterior_inf.inverse_info)))) + print("lengths: adjusted intervals Langevin, Gibbs, MLE1, MLE2, approx ", + np.mean(upper_langevin - lower_langevin), + np.mean(upper_gibbs - lower_gibbs), + 
np.mean((2*Z_quantile)*np.sqrt(np.diag(posterior_inf.inverse_info))), + np.mean(mle['upper_confidence'] - mle['lower_confidence']), + #np.mean(approx_inf['upper_confidence'] - approx_inf['lower_confidence']) + ) print("lengths: naive intervals ", np.mean(naive_intervals[:,1]-naive_intervals[:,0])) print("lengths: split intervals ", np.mean(split_intervals[:, 1] - split_intervals[:, 0])) - output = pd.DataFrame({'Langevin_lower_confidence': lci_langevin, - 'Langevin_upper_confidence': uci_langevin, - 'Gibbs_lower_confidence': lci_gibbs, - 'Gibbs_upper_confidence': uci_gibbs, + scale_interval = np.percentile(scale_gibbs, [alpha*100, (1-alpha)*100]) + output = pd.DataFrame({'Langevin_lower_credible': lower_langevin, + 'Langevin_upper_credible': upper_langevin, + 'Gibbs_lower_credible': lower_gibbs, + 'Gibbs_upper_credible': upper_gibbs, + 'MLE_lower_confidence': mle['lower_confidence'], + 'MLE_upper_confidence': mle['upper_confidence'], + #'approx_lower_confidence': approx_inf['lower_confidence'], + #'approx_upper_confidence': approx_inf['upper_confidence'], 'Split_lower_confidence': split_intervals[:,0], 'Split_upper_confidence': split_intervals[:, 1], 'Naive_lower_confidence': naive_intervals[:, 0], 'Naive_upper_confidence': naive_intervals[:, 1] }) - return output - -# def main(ndraw=10): -# -# coverage_ = 0. -# length_ = 0. -# for n in range(ndraw): -# cov, len = test_Langevin(n=500, -# p=200, -# signal_fac=1., -# s=5, -# sigma=3., -# rho=0.2, -# randomizer_scale=1. -# ) -# -# # cov, len = test_instance(nsample=2000, -# # nburnin=100) -# -# coverage_ += cov -# length_ += len -# -# print("coverage so far ", coverage_ / (n + 1.)) -# print("lengths so far ", length_ / (n + 1.)) -# print("iteration completed ", n + 1) - - -def main(): - test_hiv_data(split_proportion=0.50) + return output, scale_interval, _sigma if __name__ == "__main__": - main() + test_hiv_data(split_proportion=0.50) + From 93bfded0bfd4cfbbb46e0f4f900dfacd7deb022f Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 8 Jul 2020 12:25:56 -0700 Subject: [PATCH 061/187] matching default prior --- selectinf/randomized/tests/test_posterior.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/selectinf/randomized/tests/test_posterior.py b/selectinf/randomized/tests/test_posterior.py index 6e79e44e6..01d2b6769 100644 --- a/selectinf/randomized/tests/test_posterior.py +++ b/selectinf/randomized/tests/test_posterior.py @@ -150,9 +150,10 @@ def test_flexible_prior1(nsample=100, nburnin=50): M, dispersion=dispersion) + Di = 1. / (200 * np.diag(cov_target)) def prior(target_parameter): - grad_prior = -target_parameter / 100 - log_prior = -np.linalg.norm(target_parameter)**2 /(2. 
* 100) + grad_prior = -target_parameter * Di + log_prior = -np.sum(target_parameter**2 * Di) return log_prior, grad_prior seed_state = np.random.get_state() From c8ed7cbcde7b463d4f059189ce88da3576489a28 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 8 Jul 2020 14:14:51 -0700 Subject: [PATCH 062/187] using discrete family for approximate grid inference --- selectinf/randomized/approx_reference.py | 274 ++++++++++++++---- selectinf/randomized/posterior_inference.py | 4 +- .../randomized/tests/test_approx_reference.py | 91 ++++-- selectinf/randomized/tests/test_posterior.py | 12 +- 4 files changed, 294 insertions(+), 87 deletions(-) diff --git a/selectinf/randomized/approx_reference.py b/selectinf/randomized/approx_reference.py index 3c1b1b8fa..b68e3de0b 100644 --- a/selectinf/randomized/approx_reference.py +++ b/selectinf/randomized/approx_reference.py @@ -1,23 +1,57 @@ from __future__ import division, print_function -import numpy as np, sys -from .selective_MLE_utils import solve_barrier_affine as solve_barrier_affine_C +import numpy as np, pandas as pd +from scipy.interpolate import interp1d +from .selective_MLE_utils import solve_barrier_affine as solve_barrier_affine_C +from ..distributions.discrete_family import discrete_family -class approximate_grid_inference(): +class approximate_grid_inference(object): def __init__(self, query, observed_target, cov_target, cov_target_score, - grid, - dispersion=1, - level=0.9, + grid=None, solve_args={'tol':1.e-12}): + """ + Produce p-values and confidence intervals for targets + of model including selected features + + Parameters + ---------- + + observed_target : ndarray + Observed estimate of target. + + target_cov : ndarray + Estimated covaraince of target. + + target_score_cov : ndarray + Estimated covariance of target and score of randomized query. + + grid : ndarray + Grid on which to evaluate the approximate + probability of selection. + + mle : ndarray + Selective MLE as initial guess. + + inverse_info : ndarray + Selective inverse information to guide grid search. + + """ + self.solve_args = solve_args + result, inverse_info = query.selective_MLE(observed_target, + cov_target, + cov_target_score, + solve_args=solve_args)[:2] + mle = result['MLE'] + self.linear_part = query.sampler.affine_con.linear_part self.offset = query.sampler.affine_con.offset @@ -31,28 +65,47 @@ def __init__(self, self.cov_target = cov_target self.init_soln = query.observed_opt_state - self.grid = grid - self.ntarget = cov_target.shape[0] - self.level = level + self.ntarget = ntarget = cov_target.shape[0] + _scale = 4 * np.sqrt(np.diag(inverse_info)) + ngrid = 40 + + scale_ = 4 * np.max(np.sqrt(np.diag(inverse_info))) + + self.stat_grid = np.zeros((ntarget, ngrid)) + for j in range(ntarget): + self.stat_grid[j,:] = np.linspace(observed_target[j] - 1.5*_scale[j], + observed_target[j] + 1.5*_scale[j], + num=ngrid) + - def approx_log_reference(self, + def _approx_log_reference(self, observed_target, cov_target, - cov_target_score): + cov_target_score, + grid): + """ + Approximate the log of the reference density on a grid. 
+ + """ if np.asarray(observed_target).shape in [(), (0,)]: raise ValueError('no target specified') - observed_target = np.atleast_1d(observed_target) prec_target = np.linalg.inv(cov_target) target_lin = - self.logdens_linear.dot(cov_target_score.T.dot(prec_target)) ref_hat = [] solver = solve_barrier_affine_C - for k in range(self.grid.shape[0]): - cond_mean_grid = target_lin.dot(np.asarray([self.grid[k]])) + ( - self.cond_mean - target_lin.dot(observed_target)) + for k in range(grid.shape[0]): + # in the usual D = N + Gamma theta.hat, + # target_lin is "something" times Gamma, + # where "something" comes from implied Gaussian + # cond_mean is "something" times D + # Gamma is cov_target_score.T.dot(prec_target) + + cond_mean_grid = (target_lin.dot(np.atleast_1d(grid[k] - observed_target)) + + self.cond_mean) conjugate_arg = self.prec_opt.dot(cond_mean_grid) val, _, _ = solver(conjugate_arg, @@ -66,35 +119,40 @@ def approx_log_reference(self, return np.asarray(ref_hat) - - def approx_density(self, - mean_parameter, - cov_target, - approx_log_ref): + def approx_CDF(self, + mean_parameter, + cov_target, + approx_log_ref, + grid): _approx_density = [] - for k in range(self.grid.shape[0]): - _approx_density.append(np.exp(-np.true_divide((self.grid[k] - mean_parameter) ** 2, 2 * cov_target) + approx_log_ref[k])) + for k in range(grid.shape[0]): + # approx_log_ref[k] = P(selection | D = N + Gamma * grid[k]) + _approx_density.append(np.exp(-np.true_divide((grid[k] - mean_parameter)**2, + 2 * cov_target) + approx_log_ref[k])) _approx_density_ = np.asarray(_approx_density) / (np.asarray(_approx_density).sum()) return np.cumsum(_approx_density_) def approx_ci(self, param_grid, + stat_grid, cov_target, approx_log_ref, - indx_obsv): + indx_obsv, + level): area = np.zeros(param_grid.shape[0]) for k in range(param_grid.shape[0]): - area_vec = self.approx_density(param_grid[k], - cov_target, - approx_log_ref) + area_vec = self.approx_CDF(param_grid[k], + cov_target, + approx_log_ref, + stat_grid) area[k] = area_vec[indx_obsv] - alpha = 1 - self.level + alpha = 1 - level region = param_grid[(area >= alpha / 2.) & (area <= (1 - alpha / 2.))] if region.size > 0: @@ -102,61 +160,155 @@ def approx_ci(self, else: return 0., 0. 
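The hand-rolled CDF and interval search being removed here is replaced by the `discrete_family` helper (see the new `_construct_families` below): the grid of statistic values plays the role of the sufficient statistic and the approximate selective density supplies the weights. A rough sketch of that pattern with a stand-in weight function:

    import numpy as np
    from selectinf.distributions.discrete_family import discrete_family

    # stand-in weights: a Gaussian-shaped log-density on a grid
    grid = np.linspace(-5., 5., 1000)
    logW = -0.5 * grid ** 2
    logW -= logW.max()                  # stabilize before exponentiating

    family = discrete_family(grid, np.exp(logW))
    cdf_val = family.cdf(0., x=1.5)     # CDF of the tilted family at the observed value
    lower, upper = family.equal_tailed_interval(1.5, alpha=0.1)
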
- def approx_pivot(self, - mean_parameter): - - pivot = [] + def _construct_families(self): + self._families = [] for m in range(self.ntarget): p = self.cov_target_score.shape[1] observed_target_uni = (self.observed_target[m]).reshape((1,)) cov_target_uni = (np.diag(self.cov_target)[m]).reshape((1, 1)) + var_target = cov_target_uni[0, 0] cov_target_score_uni = self.cov_target_score[m, :].reshape((1, p)) - grid_indx_obs = np.argmin(np.abs(self.grid - observed_target_uni)) - approx_log_ref = self.approx_log_reference(observed_target_uni, - cov_target_uni, - cov_target_score_uni) + approx_log_ref = self._approx_log_reference(observed_target_uni, + cov_target_uni, + cov_target_score_uni, + self.stat_grid[m]) + + approx_fn = interp1d(self.stat_grid[m], + approx_log_ref, + kind='quadratic', + bounds_error=False, + fill_value='extrapolate') + + grid = np.linspace(self.stat_grid[m].min(), self.stat_grid[m].max(), 1000) + logW = (approx_fn(grid) - + 0.5 * (grid - self.observed_target[m])**2 / var_target) + logW -= logW.max() + + self._families.append(discrete_family(grid, + np.exp(logW))) + + logG = - 0.5 * grid**2 / var_target + logG -= logG.max() + import matplotlib.pyplot as plt + + # plt.plot(self.stat_grid[m][10:30], approx_log_ref[10:30]) + # plt.plot(self.stat_grid[m][:10], approx_log_ref[:10], 'r', linewidth=4) + # plt.plot(self.stat_grid[m][30:], approx_log_ref[30:], 'r', linewidth=4) + # plt.plot(self.stat_grid[m]*1.5, fapprox(self.stat_grid[m]*1.5), 'k--') + # plt.show() + + # plt.plot(grid, logW) + # plt.plot(grid, logG) + + # stop + + def approx_pivots(self, + mean_parameter, + alternatives=None): + + if not hasattr(self, "_families"): + self._construct_families() + + if alternatives is None: + alternatives = ['twosided'] * self.ntarget - area_cum = self.approx_density(mean_parameter[m], - cov_target_uni, - approx_log_ref) - - pivot.append(2 * np.minimum(area_cum[grid_indx_obs], 1. 
- area_cum[grid_indx_obs])) - - sys.stderr.write("variable completed " + str(m + 1) + "\n") + pivot = [] + for m in range(self.ntarget): + family = self._families[m] + observed_target = self.observed_target[m] + var_target = self.cov_target[m, m] + _cdf = family.cdf((mean_parameter[m] - observed_target) / var_target, + x=observed_target) + if alternatives[m] == 'twosided': + pivot.append(2 * min(_cdf, 1 - _cdf)) + elif alternatives[m] == 'greater': + pivot.append(1 - _cdf) + elif alternatives[m] == 'less': + pivot.append(_cdf) + else: + raise ValueError('alternative should be in ["twosided", "less", "greater"]') return pivot def approx_intervals(self, - param_grid): + level=0.9): - intervals_lci =[] - intervals_uci =[] + if not hasattr(self, "_families"): + self._construct_families() + + lower, upper = [], [] for m in range(self.ntarget): - p = self.cov_target_score.shape[1] - observed_target_uni = (self.observed_target[m]).reshape((1,)) - cov_target_uni = (np.diag(self.cov_target)[m]).reshape((1, 1)) - cov_target_score_uni = self.cov_target_score[m, :].reshape((1, p)) - grid_indx_obs = np.argmin(np.abs(self.grid - observed_target_uni)) + family = self._families[m] + observed_target = self.observed_target[m] + l, u = family.equal_tailed_interval(observed_target, + alpha=1-level) + var_target = self.cov_target[m, m] + lower.append(l * var_target + observed_target) + upper.append(u * var_target + observed_target) + + return np.asarray(lower), np.asarray(upper) + + def summary(self, + alternatives=None, + parameter=None, + level=0.9): + """ + Produce p-values and confidence intervals for targets + of model including selected features + + Parameters + ---------- + + alternatives : [str], optional + Sequence of strings describing the alternatives, + should be values of ['twosided', 'less', 'greater'] + + parameter : np.array + Hypothesized value for parameter -- defaults to 0. + + level : float + Confidence level. + + """ + + if parameter is not None: + pivots = self.approx_pivots(parameter, + alternatives=alternatives) + else: + pivots = None - approx_log_ref = self.approx_log_reference(observed_target_uni, - cov_target_uni, - cov_target_score_uni) + pvalues = self.approx_pivots(np.zeros_like(self.observed_target), + alternatives=alternatives) + lower, upper = self.approx_intervals(level=level) - approx_lci, approx_uci = self.approx_ci(param_grid[m,:], - cov_target_uni, - approx_log_ref, - grid_indx_obs) + result = pd.DataFrame({'target':self.observed_target, + 'pvalue':pvalues, + 'lower_confidence':lower, + 'upper_confidence':upper}) - intervals_lci.append(approx_lci) - intervals_uci.append(approx_uci) + if not np.all(parameter == 0): + result.insert(4, 'pivot', pivots) + result.insert(5, 'parameter', parameter) - sys.stderr.write("variable completed " + str(m + 1) + "\n") + return result - return np.asarray(intervals_lci), np.asarray(intervals_uci) +def _log_concave_approx(xval, yval): + """ + Approximate a log-concave function + to full line based on sample. 
+ Assumes `xval` is sorted + """ + nu, nl = 10, 10 + n = xval.shape[0] + D = np.vstack([np.ones(n), xval, xval**2]).T + Du = D[-nu:] + Qu = np.linalg(Du).dot(yval[-nu:]) + Dl = D[:nl] + Ql = np.linalg(Dl).dot(yval[:nl]) diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index 1acb23281..403a5a1f0 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -24,7 +24,7 @@ class posterior(object): prior : callable A callable object that takes a single argument `parameter` of the same shape as `observed_target` - and returns (gradient of log prior, value of log prior) + and returns (value of log prior, gradient of log prior) dispersion : float, optional A dispersion parameter for likelihood. @@ -122,7 +122,7 @@ def log_posterior(self, self.prec_target.dot(target_parameter) \ - self.linear_coef.T.dot(prec_marginal.dot(soln)- conjugate_marginal)) - grad_prior, log_prior = self.prior(target_parameter) + log_prior, grad_prior = self.prior(target_parameter) return (self.dispersion * (log_lik - self.log_ref) / sigmasq + log_prior, self.dispersion * grad_lik/sigmasq + grad_prior) diff --git a/selectinf/randomized/tests/test_approx_reference.py b/selectinf/randomized/tests/test_approx_reference.py index c394559d6..5045e46d5 100644 --- a/selectinf/randomized/tests/test_approx_reference.py +++ b/selectinf/randomized/tests/test_approx_reference.py @@ -2,7 +2,58 @@ from ...tests.instance import gaussian_instance from ..lasso import lasso, selected_targets +from ..approx_reference import approximate_grid_inference +def test_summary(n=500, + p=100, + signal_fac=1., + s=5, + sigma=2., + rho=0.4, + randomizer_scale=1.): + + inst, const = gaussian_instance, lasso.gaussian + signal = np.sqrt(signal_fac * 2 * np.log(p)) + + X, Y, beta = inst(n=n, + p=p, + signal=signal, + s=s, + equicorrelated=False, + rho=rho, + sigma=sigma, + random_signs=True)[:3] + + n, p = X.shape + + sigma_ = np.std(Y) + dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) + + W = 1 * np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ + + conv = const(X, + Y, + W, + randomizer_scale=randomizer_scale * dispersion) + + signs = conv.fit() + nonzero = signs != 0 + + if nonzero.sum()>0: + beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) + + (observed_target, + cov_target, + cov_target_score, + alternatives) = selected_targets(conv.loglike, + conv._W, + nonzero, + dispersion=dispersion) + + S = conv.approximate_grid_inference(observed_target, + cov_target, + cov_target_score, + alternatives=alternatives) def test_approx_pivot(n=500, p=100, @@ -59,13 +110,13 @@ def test_approx_pivot(n=500, grid = np.linspace(- scale_, scale_, num=ngrid) - approximate_grid_inf = conv.approximate_grid_inference(observed_target, - cov_target, - cov_target_score, - grid=grid, - dispersion=dispersion) + approximate_grid_inf = approximate_grid_inference(conv, + observed_target, + cov_target, + cov_target_score, + grid=grid) - pivot = approximate_grid_inf.approx_pivot(beta_target) + pivot = approximate_grid_inf.approx_pivots(beta_target) return pivot @@ -76,7 +127,8 @@ def test_approx_ci(n=500, s=5, sigma=2., rho=0.4, - randomizer_scale=1.): + randomizer_scale=1., + level=0.9): inst, const = gaussian_instance, lasso.gaussian signal = np.sqrt(signal_fac * 2 * np.log(p)) @@ -126,19 +178,22 @@ def test_approx_ci(n=500, grid = np.linspace(-scale_, scale_, num=ngrid) - approximate_grid_inf = 
conv.approximate_grid_inference(observed_target, - cov_target, - cov_target_score, - grid=grid, - dispersion=dispersion) - + approximate_grid_inf = approximate_grid_inference(conv, + observed_target, + cov_target, + cov_target_score, + grid=grid) param_grid = np.zeros((ntarget, ngrid)) mle = np.asarray(result['MLE']) for j in range(ntarget): param_grid[j,:] = np.linspace(mle[j]-_scale[j], mle[j]+_scale[j], num=ngrid) - lci, uci = approximate_grid_inf.approx_intervals(param_grid) + lci, uci = approximate_grid_inf.approx_intervals(level) + + S = conv.approximate_grid_inference(observed_target, + cov_target, + cov_target_score) beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) coverage = (lci < beta_target) * (uci > beta_target) @@ -146,18 +201,18 @@ def test_approx_ci(n=500, return np.mean(coverage), np.mean(length) -import matplotlib.pyplot as plt -from statsmodels.distributions.empirical_distribution import ECDF def main(nsim=300, CI = False): + import matplotlib.pyplot as plt + from statsmodels.distributions.empirical_distribution import ECDF if CI is False: _pivot = [] for i in range(nsim): _pivot.extend(test_approx_pivot(n=200, p=100, - signal_fac=0.5, + signal_fac=1., s=5, sigma=3., rho=0.20, @@ -191,4 +246,4 @@ def main(nsim=300, CI = False): print("iteration completed ", n + 1) if __name__ == "__main__": - main(nsim=20, CI = True) \ No newline at end of file + main(nsim=20, CI = True) diff --git a/selectinf/randomized/tests/test_posterior.py b/selectinf/randomized/tests/test_posterior.py index 01d2b6769..2b93b0422 100644 --- a/selectinf/randomized/tests/test_posterior.py +++ b/selectinf/randomized/tests/test_posterior.py @@ -275,9 +275,9 @@ def test_hiv_data(nsample=10000, level=level, solve_args={'tol':1.e-12})[:2] - # approx_inf = conv.approximate_grid_inference(observed_target, - # cov_target, - # cov_target_score) + approx_inf = conv.approximate_grid_inference(observed_target, + cov_target, + cov_target_score) posterior_inf = conv.posterior(observed_target, cov_target, @@ -316,7 +316,7 @@ def test_hiv_data(nsample=10000, np.mean(upper_gibbs - lower_gibbs), np.mean((2*Z_quantile)*np.sqrt(np.diag(posterior_inf.inverse_info))), np.mean(mle['upper_confidence'] - mle['lower_confidence']), - #np.mean(approx_inf['upper_confidence'] - approx_inf['lower_confidence']) + np.mean(approx_inf['upper_confidence'] - approx_inf['lower_confidence']) ) print("lengths: naive intervals ", np.mean(naive_intervals[:,1]-naive_intervals[:,0])) @@ -330,8 +330,8 @@ def test_hiv_data(nsample=10000, 'Gibbs_upper_credible': upper_gibbs, 'MLE_lower_confidence': mle['lower_confidence'], 'MLE_upper_confidence': mle['upper_confidence'], - #'approx_lower_confidence': approx_inf['lower_confidence'], - #'approx_upper_confidence': approx_inf['upper_confidence'], + 'approx_lower_confidence': approx_inf['lower_confidence'], + 'approx_upper_confidence': approx_inf['upper_confidence'], 'Split_lower_confidence': split_intervals[:,0], 'Split_upper_confidence': split_intervals[:, 1], 'Naive_lower_confidence': naive_intervals[:, 0], From 28fcb50f6c9ccd4785c802337ad5c35a809e04d9 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 8 Jul 2020 14:41:25 -0700 Subject: [PATCH 063/187] code cleanup for readability --- selectinf/distributions/discrete_family.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/selectinf/distributions/discrete_family.py b/selectinf/distributions/discrete_family.py index 5c6e6fc23..6bdf10f55 100644 --- a/selectinf/distributions/discrete_family.py +++ 
b/selectinf/distributions/discrete_family.py @@ -83,7 +83,7 @@ def __init__(self, sufficient_stat, weights, theta=0.): xw = np.array(sorted(zip(sufficient_stat, weights)), np.float) self._x = xw[:,0] self._w = xw[:,1] - self._lw = np.array([np.log(v) for v in xw[:,1]]) + self._lw = np.log(xw[:,1]) self._w /= self._w.sum() # make sure they are a pmf self.n = len(xw) self._theta = np.nan @@ -479,7 +479,12 @@ def interval(self, observed, alpha=0.05, randomize=True, auxVar=None, tol=1e-6): lower = self._inter2Lower(observed, 0., alpha, tol) return lower, upper - def equal_tailed_interval(self, observed, alpha=0.05, randomize=True, auxVar=None, tol=1e-6): + def equal_tailed_interval(self, + observed, + alpha=0.05, + randomize=True, + auxVar=None, + tol=1e-6): """ Form interval by inverting equal-tailed test with $\alpha/2$ in each tail. From 7042db4706c39e7f378819a92ac90df73005d291 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 8 Jul 2020 14:42:03 -0700 Subject: [PATCH 064/187] rename selective MLE method due to signature conflict --- selectinf/randomized/drop_losers.py | 2 +- selectinf/randomized/tests/test_drop_losers.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/selectinf/randomized/drop_losers.py b/selectinf/randomized/drop_losers.py index ffe2804ca..7c2a7bce6 100644 --- a/selectinf/randomized/drop_losers.py +++ b/selectinf/randomized/drop_losers.py @@ -66,7 +66,7 @@ def __init__(self, self._setup_sampler(A, b, linear, offset) - def selective_MLE(self, + def MLE_inference(self, level=0.9, solve_args={'tol':1.e-12}): """ diff --git a/selectinf/randomized/tests/test_drop_losers.py b/selectinf/randomized/tests/test_drop_losers.py index 46f4b8395..45bd3595d 100644 --- a/selectinf/randomized/tests/test_drop_losers.py +++ b/selectinf/randomized/tests/test_drop_losers.py @@ -39,11 +39,11 @@ def test_drop_losers(p=50, dtl = drop_losers(df, K=K) - dtl.selective_MLE() + dtl.MLE_inference() if not use_MLE: result = dtl.summary(ndraw=20000, burnin=5000) else: - result = dtl.selective_MLE()[0] + result = dtl.MLE_inference()[0] pvalue = np.asarray(result['pvalue']) lower = np.asarray(result['lower_confidence']) upper = np.asarray(result['upper_confidence']) From 1da37cf61273cd5ed662a64914a46e9cb6b41388 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 8 Jul 2020 14:42:26 -0700 Subject: [PATCH 065/187] unused grid arguments --- .../randomized/tests/test_approx_reference.py | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/selectinf/randomized/tests/test_approx_reference.py b/selectinf/randomized/tests/test_approx_reference.py index 5045e46d5..fb1d94828 100644 --- a/selectinf/randomized/tests/test_approx_reference.py +++ b/selectinf/randomized/tests/test_approx_reference.py @@ -105,16 +105,10 @@ def test_approx_pivot(n=500, cov_target, cov_target_score)[1] - scale_ = 4 * np.max(np.sqrt(np.diag(inverse_info))) - ngrid = 2 * scale_/0.1 - - grid = np.linspace(- scale_, scale_, num=ngrid) - approximate_grid_inf = approximate_grid_inference(conv, observed_target, cov_target, - cov_target_score, - grid=grid) + cov_target_score) pivot = approximate_grid_inf.approx_pivots(beta_target) @@ -176,18 +170,10 @@ def test_approx_ci(n=500, scale_ = np.max(_scale) ngrid = int(2 * scale_/0.1) - grid = np.linspace(-scale_, scale_, num=ngrid) - approximate_grid_inf = approximate_grid_inference(conv, observed_target, cov_target, - cov_target_score, - grid=grid) - - param_grid = np.zeros((ntarget, ngrid)) - mle = np.asarray(result['MLE']) - for j 
in range(ntarget): - param_grid[j,:] = np.linspace(mle[j]-_scale[j], mle[j]+_scale[j], num=ngrid) + cov_target_score) lci, uci = approximate_grid_inf.approx_intervals(level) From 283026212042f53a181f87e6cc6f9bc8054ecc63 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 8 Jul 2020 14:44:21 -0700 Subject: [PATCH 066/187] moved gaussian query specific methods to that class --- selectinf/randomized/approx_reference.py | 64 +++---- selectinf/randomized/query.py | 230 +++++++++++------------ 2 files changed, 143 insertions(+), 151 deletions(-) diff --git a/selectinf/randomized/approx_reference.py b/selectinf/randomized/approx_reference.py index b68e3de0b..abee9bfbc 100644 --- a/selectinf/randomized/approx_reference.py +++ b/selectinf/randomized/approx_reference.py @@ -11,9 +11,8 @@ class approximate_grid_inference(object): def __init__(self, query, observed_target, - cov_target, - cov_target_score, - grid=None, + target_cov, + target_score_cov, solve_args={'tol':1.e-12}): """ @@ -23,6 +22,10 @@ def __init__(self, Parameters ---------- + query : `gaussian_query` + A Gaussian query which has information + to describe implied Gaussian. + observed_target : ndarray Observed estimate of target. @@ -32,23 +35,16 @@ def __init__(self, target_score_cov : ndarray Estimated covariance of target and score of randomized query. - grid : ndarray - Grid on which to evaluate the approximate - probability of selection. - - mle : ndarray - Selective MLE as initial guess. - - inverse_info : ndarray - Selective inverse information to guide grid search. + solve_args : dict, optional + Arguments passed to solver. """ self.solve_args = solve_args result, inverse_info = query.selective_MLE(observed_target, - cov_target, - cov_target_score, + target_cov, + target_score_cov, solve_args=solve_args)[:2] mle = result['MLE'] @@ -61,12 +57,12 @@ def __init__(self, self.cond_cov = query.cond_cov self.observed_target = observed_target - self.cov_target_score = cov_target_score - self.cov_target = cov_target + self.target_score_cov = target_score_cov + self.target_cov = target_cov self.init_soln = query.observed_opt_state - self.ntarget = ntarget = cov_target.shape[0] + self.ntarget = ntarget = target_cov.shape[0] _scale = 4 * np.sqrt(np.diag(inverse_info)) ngrid = 40 @@ -81,8 +77,8 @@ def __init__(self, def _approx_log_reference(self, observed_target, - cov_target, - cov_target_score, + target_cov, + target_score_cov, grid): """ @@ -92,8 +88,8 @@ def _approx_log_reference(self, if np.asarray(observed_target).shape in [(), (0,)]: raise ValueError('no target specified') - prec_target = np.linalg.inv(cov_target) - target_lin = - self.logdens_linear.dot(cov_target_score.T.dot(prec_target)) + prec_target = np.linalg.inv(target_cov) + target_lin = - self.logdens_linear.dot(target_score_cov.T.dot(prec_target)) ref_hat = [] solver = solve_barrier_affine_C @@ -102,7 +98,7 @@ def _approx_log_reference(self, # target_lin is "something" times Gamma, # where "something" comes from implied Gaussian # cond_mean is "something" times D - # Gamma is cov_target_score.T.dot(prec_target) + # Gamma is target_score_cov.T.dot(prec_target) cond_mean_grid = (target_lin.dot(np.atleast_1d(grid[k] - observed_target)) + self.cond_mean) @@ -121,7 +117,7 @@ def _approx_log_reference(self, def approx_CDF(self, mean_parameter, - cov_target, + target_cov, approx_log_ref, grid): @@ -129,7 +125,7 @@ def approx_CDF(self, for k in range(grid.shape[0]): # approx_log_ref[k] = P(selection | D = N + Gamma * grid[k]) 
_approx_density.append(np.exp(-np.true_divide((grid[k] - mean_parameter)**2, - 2 * cov_target) + approx_log_ref[k])) + 2 * target_cov) + approx_log_ref[k])) _approx_density_ = np.asarray(_approx_density) / (np.asarray(_approx_density).sum()) return np.cumsum(_approx_density_) @@ -137,7 +133,7 @@ def approx_CDF(self, def approx_ci(self, param_grid, stat_grid, - cov_target, + target_cov, approx_log_ref, indx_obsv, level): @@ -146,7 +142,7 @@ def approx_ci(self, for k in range(param_grid.shape[0]): area_vec = self.approx_CDF(param_grid[k], - cov_target, + target_cov, approx_log_ref, stat_grid) @@ -164,15 +160,15 @@ def _construct_families(self): self._families = [] for m in range(self.ntarget): - p = self.cov_target_score.shape[1] + p = self.target_score_cov.shape[1] observed_target_uni = (self.observed_target[m]).reshape((1,)) - cov_target_uni = (np.diag(self.cov_target)[m]).reshape((1, 1)) - var_target = cov_target_uni[0, 0] - cov_target_score_uni = self.cov_target_score[m, :].reshape((1, p)) + target_cov_uni = (np.diag(self.target_cov)[m]).reshape((1, 1)) + var_target = target_cov_uni[0, 0] + target_score_cov_uni = self.target_score_cov[m, :].reshape((1, p)) approx_log_ref = self._approx_log_reference(observed_target_uni, - cov_target_uni, - cov_target_score_uni, + target_cov_uni, + target_score_cov_uni, self.stat_grid[m]) approx_fn = interp1d(self.stat_grid[m], @@ -219,7 +215,7 @@ def approx_pivots(self, for m in range(self.ntarget): family = self._families[m] observed_target = self.observed_target[m] - var_target = self.cov_target[m, m] + var_target = self.target_cov[m, m] _cdf = family.cdf((mean_parameter[m] - observed_target) / var_target, x=observed_target) if alternatives[m] == 'twosided': @@ -245,7 +241,7 @@ def approx_intervals(self, observed_target = self.observed_target[m] l, u = family.equal_tailed_interval(observed_target, alpha=1-level) - var_target = self.cov_target[m, m] + var_target = self.target_cov[m, m] lower.append(l * var_target + observed_target) upper.append(u * var_target + observed_target) diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index 0b24ecc45..df890a2ef 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -97,17 +97,97 @@ def solve(self): raise NotImplementedError('abstract method') - def setup_sampler(self): - """ - Setup query to prepare for sampling. 
- Should set a few key attributes: +class gaussian_query(query): - - observed_score_state - - observed_opt_state - - opt_transform + useC = True - """ - raise NotImplementedError('abstract method -- only keyword arguments') + """ + A class with Gaussian perturbation to the objective -- + easy to apply CLT to such things + """ + + def fit(self, perturb=None): + + p = self.nfeature + + # take a new perturbation if supplied + if perturb is not None: + self._initial_omega = perturb + if self._initial_omega is None: + self._initial_omega = self.randomizer.sample() + + # Private methods + + def _setup_sampler(self, + linear_part, + offset, + opt_linear, + opt_offset, + # optional dispersion parameter + # for covariance of randomization + dispersion=1): + + A, b = linear_part, offset + if not np.all(A.dot(self.observed_opt_state) - b <= 0): + raise ValueError('constraints not satisfied') + + (cond_mean, + cond_cov, + cond_precision, + logdens_linear) = self._setup_implied_gaussian(opt_linear, + opt_offset, + dispersion) + + def log_density(logdens_linear, offset, cond_prec, opt, score): + if score.ndim == 1: + mean_term = logdens_linear.dot(score.T + offset).T + else: + mean_term = logdens_linear.dot(score.T + offset[:, None]).T + arg = opt + mean_term + return - 0.5 * np.sum(arg * cond_prec.dot(arg.T).T, 1) + + log_density = functools.partial(log_density, + logdens_linear, + opt_offset, + cond_precision) + + self.cond_mean, self.cond_cov = cond_mean, cond_cov + + affine_con = constraints(A, + b, + mean=cond_mean, + covariance=cond_cov) + + self.sampler = affine_gaussian_sampler(affine_con, + self.observed_opt_state, + self.observed_score_state, + log_density, + (logdens_linear, opt_offset), + selection_info=self.selection_variable, + useC=self.useC) + + def _setup_implied_gaussian(self, + opt_linear, + opt_offset, + # optional dispersion parameter + # for covariance of randomization + dispersion=1): + + _, prec = self.randomizer.cov_prec + prec = prec / dispersion + + if np.asarray(prec).shape in [(), (0,)]: + cond_precision = opt_linear.T.dot(opt_linear) * prec + cond_cov = np.linalg.inv(cond_precision) + logdens_linear = cond_cov.dot(opt_linear.T) * prec + else: + cond_precision = opt_linear.T.dot(prec.dot(opt_linear)) + cond_cov = np.linalg.inv(cond_precision) + logdens_linear = cond_cov.dot(opt_linear.T).dot(prec) + + cond_mean = -logdens_linear.dot(self.observed_score_state + opt_offset) + + return cond_mean, cond_cov, cond_precision, logdens_linear def summary(self, observed_target, @@ -197,10 +277,9 @@ def summary(self, if compute_intervals: - MLE = query.selective_MLE(self, - observed_target, - target_cov, - target_score_cov)[0] + MLE = self.selective_MLE(observed_target, + target_cov, + target_score_cov)[0] MLE_intervals = np.asarray(MLE[['lower_confidence', 'upper_confidence']]) intervals = self.sampler.confidence_intervals( @@ -313,121 +392,38 @@ def approximate_grid_inference(self, observed_target, target_cov, target_score_cov, - grid=None, alternatives=None, solve_args={'tol': 1.e-12}): - # result, inverse_info = self.selective_MLE(observed_target, - # target_cov, - # target_score_cov)[:2] - - # if dispersion is None: - # dispersion = 1 - # print('Using dispersion parameter 1...') - - G = approximate_grid_inference(self, - observed_target, - target_cov, - target_score_cov, - #inverse_info, - #result['MLE'], - #dispersion, - grid=grid, - solve_args=solve_args) - return G.summary(alternatives=alternatives) - - -class gaussian_query(query): - - useC = True - - """ - A class with 
Gaussian perturbation to the objective -- - easy to apply CLT to such things - """ - - def fit(self, perturb=None): - - p = self.nfeature - - # take a new perturbation if supplied - if perturb is not None: - self._initial_omega = perturb - if self._initial_omega is None: - self._initial_omega = self.randomizer.sample() - - # Private methods - - def _setup_sampler(self, - linear_part, - offset, - opt_linear, - opt_offset, - # optional dispersion parameter - # for covariance of randomization - dispersion=1): - - A, b = linear_part, offset - if not np.all(A.dot(self.observed_opt_state) - b <= 0): - raise ValueError('constraints not satisfied') - - (cond_mean, - cond_cov, - cond_precision, - logdens_linear) = self._setup_implied_gaussian(opt_linear, - opt_offset, - dispersion) - - def log_density(logdens_linear, offset, cond_prec, opt, score): - if score.ndim == 1: - mean_term = logdens_linear.dot(score.T + offset).T - else: - mean_term = logdens_linear.dot(score.T + offset[:, None]).T - arg = opt + mean_term - return - 0.5 * np.sum(arg * cond_prec.dot(arg.T).T, 1) - - log_density = functools.partial(log_density, - logdens_linear, - opt_offset, - cond_precision) + """ - self.cond_mean, self.cond_cov = cond_mean, cond_cov + Parameters + ---------- - affine_con = constraints(A, - b, - mean=cond_mean, - covariance=cond_cov) + observed_target : ndarray + Observed estimate of target. - self.sampler = affine_gaussian_sampler(affine_con, - self.observed_opt_state, - self.observed_score_state, - log_density, - (logdens_linear, opt_offset), - selection_info=self.selection_variable, - useC=self.useC) + target_cov : ndarray + Estimated covaraince of target. - def _setup_implied_gaussian(self, - opt_linear, - opt_offset, - # optional dispersion parameter - # for covariance of randomization - dispersion=1): + target_score_cov : ndarray + Estimated covariance of target and score of randomized query. - _, prec = self.randomizer.cov_prec - prec = prec / dispersion + alternatives : [str], optional + Sequence of strings describing the alternatives, + should be values of ['twosided', 'less', 'greater'] - if np.asarray(prec).shape in [(), (0,)]: - cond_precision = opt_linear.T.dot(opt_linear) * prec - cond_cov = np.linalg.inv(cond_precision) - logdens_linear = cond_cov.dot(opt_linear.T) * prec - else: - cond_precision = opt_linear.T.dot(prec.dot(opt_linear)) - cond_cov = np.linalg.inv(cond_precision) - logdens_linear = cond_cov.dot(opt_linear.T).dot(prec) + solve_args : dict, optional + Arguments passed to solver. 
- cond_mean = -logdens_linear.dot(self.observed_score_state + opt_offset) + """ - return cond_mean, cond_cov, cond_precision, logdens_linear + G = approximate_grid_inference(self, + observed_target, + target_cov, + target_score_cov, + solve_args=solve_args) + return G.summary(alternatives=alternatives) class multiple_queries(object): From 2da4d06ea7b55d31c5a9a465955311d09c67d322 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 8 Jul 2020 14:59:29 -0700 Subject: [PATCH 067/187] cleanup of approx reference code --- selectinf/randomized/approx_reference.py | 169 ++++++------------ .../randomized/tests/test_approx_reference.py | 4 +- 2 files changed, 59 insertions(+), 114 deletions(-) diff --git a/selectinf/randomized/approx_reference.py b/selectinf/randomized/approx_reference.py index abee9bfbc..af8b936c8 100644 --- a/selectinf/randomized/approx_reference.py +++ b/selectinf/randomized/approx_reference.py @@ -74,6 +74,49 @@ def __init__(self, observed_target[j] + 1.5*_scale[j], num=ngrid) + def summary(self, + alternatives=None, + parameter=None, + level=0.9): + """ + Produce p-values and confidence intervals for targets + of model including selected features + + Parameters + ---------- + + alternatives : [str], optional + Sequence of strings describing the alternatives, + should be values of ['twosided', 'less', 'greater'] + + parameter : np.array + Hypothesized value for parameter -- defaults to 0. + + level : float + Confidence level. + + """ + + if parameter is not None: + pivots = self.approx_pivots(parameter, + alternatives=alternatives) + else: + pivots = None + + pvalues = self._approx_pivots(np.zeros_like(self.observed_target), + alternatives=alternatives) + lower, upper = self._approx_intervals(level=level) + + result = pd.DataFrame({'target':self.observed_target, + 'pvalue':pvalues, + 'lower_confidence':lower, + 'upper_confidence':upper}) + + if not np.all(parameter == 0): + result.insert(4, 'pivot', pivots) + result.insert(5, 'parameter', parameter) + + return result def _approx_log_reference(self, observed_target, @@ -115,47 +158,6 @@ def _approx_log_reference(self, return np.asarray(ref_hat) - def approx_CDF(self, - mean_parameter, - target_cov, - approx_log_ref, - grid): - - _approx_density = [] - for k in range(grid.shape[0]): - # approx_log_ref[k] = P(selection | D = N + Gamma * grid[k]) - _approx_density.append(np.exp(-np.true_divide((grid[k] - mean_parameter)**2, - 2 * target_cov) + approx_log_ref[k])) - - _approx_density_ = np.asarray(_approx_density) / (np.asarray(_approx_density).sum()) - return np.cumsum(_approx_density_) - - def approx_ci(self, - param_grid, - stat_grid, - target_cov, - approx_log_ref, - indx_obsv, - level): - - area = np.zeros(param_grid.shape[0]) - - for k in range(param_grid.shape[0]): - area_vec = self.approx_CDF(param_grid[k], - target_cov, - approx_log_ref, - stat_grid) - - area[k] = area_vec[indx_obsv] - - alpha = 1 - level - region = param_grid[(area >= alpha / 2.) & (area <= (1 - alpha / 2.))] - - if region.size > 0: - return np.nanmin(region), np.nanmax(region) - else: - return 0., 0. 
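The `_construct_families` / `_approx_pivots` code that follows replaces the hand-rolled grid CDF removed above with `selectinf.distributions.discrete_family`: the interpolated `logW` weights define a one-parameter exponential family supported on the grid, and tilting it by `(mean_parameter - observed) / var` before evaluating its CDF at the observed value yields the pivot. A toy illustration of that mechanism, with made-up weights standing in for `np.exp(logW)`:

```python
import numpy as np
from selectinf.distributions.discrete_family import discrete_family

grid = np.linspace(-5, 5, 1001)
weights = np.exp(-0.5 * grid ** 2)        # placeholder for np.exp(logW)
family = discrete_family(grid, weights)

observed, var_target, hypothesized = 1.3, 1.0, 0.0
_cdf = family.cdf((hypothesized - observed) / var_target, x=observed)
pivot = 2 * min(_cdf, 1 - _cdf)           # two-sided pivot, as in _approx_pivots
```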
- def _construct_families(self): self._families = [] @@ -182,12 +184,14 @@ def _construct_families(self): 0.5 * (grid - self.observed_target[m])**2 / var_target) logW -= logW.max() + # construction of families follows `selectinf.learning.core` + self._families.append(discrete_family(grid, np.exp(logW))) - logG = - 0.5 * grid**2 / var_target - logG -= logG.max() - import matplotlib.pyplot as plt + # logG = - 0.5 * grid**2 / var_target + # logG -= logG.max() + # import matplotlib.pyplot as plt # plt.plot(self.stat_grid[m][10:30], approx_log_ref[10:30]) # plt.plot(self.stat_grid[m][:10], approx_log_ref[:10], 'r', linewidth=4) @@ -198,11 +202,9 @@ def _construct_families(self): # plt.plot(grid, logW) # plt.plot(grid, logG) - # stop - - def approx_pivots(self, - mean_parameter, - alternatives=None): + def _approx_pivots(self, + mean_parameter, + alternatives=None): if not hasattr(self, "_families"): self._construct_families() @@ -216,6 +218,9 @@ def approx_pivots(self, family = self._families[m] observed_target = self.observed_target[m] var_target = self.target_cov[m, m] + + # construction of pivot from families follows `selectinf.learning.core` + _cdf = family.cdf((mean_parameter[m] - observed_target) / var_target, x=observed_target) if alternatives[m] == 'twosided': @@ -228,8 +233,8 @@ def approx_pivots(self, raise ValueError('alternative should be in ["twosided", "less", "greater"]') return pivot - def approx_intervals(self, - level=0.9): + def _approx_intervals(self, + level=0.9): if not hasattr(self, "_families"): self._construct_families() @@ -237,6 +242,7 @@ def approx_intervals(self, lower, upper = [], [] for m in range(self.ntarget): + # construction of intervals from families follows `selectinf.learning.core` family = self._families[m] observed_target = self.observed_target[m] l, u = family.equal_tailed_interval(observed_target, @@ -247,64 +253,3 @@ def approx_intervals(self, return np.asarray(lower), np.asarray(upper) - def summary(self, - alternatives=None, - parameter=None, - level=0.9): - """ - Produce p-values and confidence intervals for targets - of model including selected features - - Parameters - ---------- - - alternatives : [str], optional - Sequence of strings describing the alternatives, - should be values of ['twosided', 'less', 'greater'] - - parameter : np.array - Hypothesized value for parameter -- defaults to 0. - - level : float - Confidence level. - - """ - - if parameter is not None: - pivots = self.approx_pivots(parameter, - alternatives=alternatives) - else: - pivots = None - - pvalues = self.approx_pivots(np.zeros_like(self.observed_target), - alternatives=alternatives) - lower, upper = self.approx_intervals(level=level) - - result = pd.DataFrame({'target':self.observed_target, - 'pvalue':pvalues, - 'lower_confidence':lower, - 'upper_confidence':upper}) - - if not np.all(parameter == 0): - result.insert(4, 'pivot', pivots) - result.insert(5, 'parameter', parameter) - - return result - -def _log_concave_approx(xval, yval): - """ - Approximate a log-concave function - to full line based on sample. 
- - Assumes `xval` is sorted - """ - - nu, nl = 10, 10 - n = xval.shape[0] - D = np.vstack([np.ones(n), xval, xval**2]).T - - Du = D[-nu:] - Qu = np.linalg(Du).dot(yval[-nu:]) - - Dl = D[:nl] - Ql = np.linalg(Dl).dot(yval[:nl]) diff --git a/selectinf/randomized/tests/test_approx_reference.py b/selectinf/randomized/tests/test_approx_reference.py index fb1d94828..fbf57dd13 100644 --- a/selectinf/randomized/tests/test_approx_reference.py +++ b/selectinf/randomized/tests/test_approx_reference.py @@ -110,7 +110,7 @@ def test_approx_pivot(n=500, cov_target, cov_target_score) - pivot = approximate_grid_inf.approx_pivots(beta_target) + pivot = approximate_grid_inf._approx_pivots(beta_target) return pivot @@ -175,7 +175,7 @@ def test_approx_ci(n=500, cov_target, cov_target_score) - lci, uci = approximate_grid_inf.approx_intervals(level) + lci, uci = approximate_grid_inf._approx_intervals(level) S = conv.approximate_grid_inference(observed_target, cov_target, From ae7c0de630f8a92ab807427a47c751168ea4dc3e Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 8 Jul 2020 15:02:15 -0700 Subject: [PATCH 068/187] BF: remove statsmodells dependency --- selectinf/randomized/tests/test_posterior.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/selectinf/randomized/tests/test_posterior.py b/selectinf/randomized/tests/test_posterior.py index 2b93b0422..c9e3fc118 100644 --- a/selectinf/randomized/tests/test_posterior.py +++ b/selectinf/randomized/tests/test_posterior.py @@ -1,6 +1,5 @@ import numpy as np import pandas as pd -import statsmodels.api as sm from scipy.stats import norm as ndist from ...tests.instance import gaussian_instance, HIV_NRTI @@ -244,8 +243,8 @@ def test_hiv_data(nsample=10000, n, p = X.shape X /= np.sqrt(n) - ols_fit = sm.OLS(Y, X).fit() - _sigma = np.linalg.norm(ols_fit.resid) / np.sqrt(n - p - 1) + ols_fit = np.linalg.pinv(X).dot(Y) + _sigma = np.linalg.norm(Y - X.dot(ols_fit)) / np.sqrt(n - p - 1) const = split_lasso.gaussian From 07cfc9e0c9ae4847d3c3ecb21c6e9af2ed25f5f6 Mon Sep 17 00:00:00 2001 From: snigdhagit Date: Sat, 11 Jul 2020 18:34:20 -0400 Subject: [PATCH 069/187] commit before switch --- selectinf/randomized/tests/test_approx_reference.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/selectinf/randomized/tests/test_approx_reference.py b/selectinf/randomized/tests/test_approx_reference.py index fbf57dd13..1832b7cbe 100644 --- a/selectinf/randomized/tests/test_approx_reference.py +++ b/selectinf/randomized/tests/test_approx_reference.py @@ -221,7 +221,7 @@ def main(nsim=300, CI = False): p=100, signal_fac=1., s=5, - sigma=2., + sigma=3., rho=0.4, randomizer_scale=1.) 
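For context on why `main` accumulates `_pivot` across simulations: when the grid approximation is accurate, the selective pivots evaluated at the true `beta_target` should be close to Uniform(0, 1), which is what the `ECDF` imported in `main` is used to check. A small sketch of that check (the placeholder sample below stands in for the collected pivots):

```python
import numpy as np
from statsmodels.distributions.empirical_distribution import ECDF

pivots = np.random.uniform(size=500)       # placeholder for the collected _pivot values
U = np.linspace(0, 1, 101)
max_gap = np.max(np.abs(ECDF(pivots)(U) - U))  # small gap indicates approximate uniformity
```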
@@ -232,4 +232,4 @@ def main(nsim=300, CI = False): print("iteration completed ", n + 1) if __name__ == "__main__": - main(nsim=20, CI = True) + main(nsim=40, CI = False) From 69713fd3f9bff88daadd8eb9125daf08a700cf5b Mon Sep 17 00:00:00 2001 From: snigdhagit Date: Mon, 13 Jul 2020 19:17:40 -0400 Subject: [PATCH 070/187] test bias --- selectinf/randomized/query.py | 6 +- .../tests/test_selective_MLE_high.py | 75 ++++++----- selectinf/randomized/tests/test_topK.py | 124 ++++++++++++++---- 3 files changed, 146 insertions(+), 59 deletions(-) diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index df890a2ef..b46aab9a7 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -1681,11 +1681,13 @@ def selective_MLE(observed_target, conjugate_arg = prec_opt.dot(cond_mean) + useC= False + print("useC", useC) if useC: solver = solve_barrier_affine_C else: solver = _solve_barrier_affine_py - + val, soln, hess = solver(conjugate_arg, prec_opt, init_soln, @@ -1696,6 +1698,8 @@ def selective_MLE(observed_target, final_estimator = observed_target + target_cov.dot(target_lin.T.dot(prec_opt.dot(cond_mean - soln))) ind_unbiased_estimator = observed_target + target_cov.dot(target_lin.T.dot(prec_opt.dot(cond_mean - init_soln))) + + print("check within MLE ", soln, init_soln) L = target_lin.T.dot(prec_opt) observed_info_natural = prec_target + L.dot(target_lin) - L.dot(hess.dot(L.T)) observed_info_mean = target_cov.dot(observed_info_natural.dot(target_cov)) diff --git a/selectinf/randomized/tests/test_selective_MLE_high.py b/selectinf/randomized/tests/test_selective_MLE_high.py index 578ae66ec..e38bde4fa 100644 --- a/selectinf/randomized/tests/test_selective_MLE_high.py +++ b/selectinf/randomized/tests/test_selective_MLE_high.py @@ -83,7 +83,7 @@ def test_full_targets(n=200, def test_selected_targets(n=2000, p=200, - signal_fac=10., + signal_fac=1., s=5, sigma=3, rho=0.4, @@ -147,43 +147,18 @@ def test_selected_targets(n=2000, beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) coverage = (beta_target > intervals[:, 0]) * (beta_target < intervals[:, 1]) - return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0], coverage, intervals - -def main(nsim=500, full=False): - P0, PA, cover, length_int = [], [], [], [] - from statsmodels.distributions import ECDF - - n, p, s = 500, 100, 5 - - for i in range(nsim): - if full: - if n > p: - full_dispersion = True - else: - full_dispersion = False - p0, pA, cover_, intervals = test_full_targets(n=n, p=p, s=s, full_dispersion=full_dispersion) - avg_length = intervals[:, 1] - intervals[:, 0] - else: - full_dispersion = True - p0, pA, cover_, intervals = test_selected_targets(n=n, p=p, s=s, - full_dispersion=full_dispersion) - avg_length = intervals[:, 1] - intervals[:, 0] - - cover.extend(cover_) - P0.extend(p0) - PA.extend(pA) - print( - np.array(PA) < 0.1, np.mean(P0), np.std(P0), np.mean(np.array(P0) < 0.1), np.mean(np.array(PA) < 0.1), np.mean(cover), - np.mean(avg_length), 'null pvalue + power + length') + print("observed_opt_state ", conv.observed_opt_state) + # print("check ", np.asarray(result['MLE']), np.asarray(result['unbiased'])) + return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0], coverage, intervals def test_instance(): n, p, s = 500, 100, 5 X = np.random.standard_normal((n, p)) beta = np.zeros(p) - #beta[:s] = np.sqrt(2 * np.log(p) / n) + beta[:s] = np.sqrt(2 * np.log(p) / n) Y = X.dot(beta) + np.random.standard_normal(n) scale_ = np.std(Y) @@ -215,17 +190,47 @@ def test_instance(): 
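        # Annotation (not from the original patch): `result` is the summary
        # DataFrame returned by selective_MLE; the columns used in this test
        # are 'MLE', 'lower_confidence' and 'upper_confidence'.  The lines
        # below form the selected-model target pinv(X[:, M]) X beta and check
        # whether it falls inside those confidence limits.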
beta_target = np.linalg.pinv(X[:, M]).dot(X.dot(beta)) coverage = (beta_target > intervals[:, 0]) * (beta_target < intervals[:, 1]) + print("observed_opt_state ", L.observed_opt_state) + #print("check ", np.asarray(result['MLE']), np.asarray(result['unbiased'])) return coverage -def main(nsim=500): +# def main(nsim=500): +# +# cover = [] +# for i in range(nsim): +# +# cover_ = test_instance() +# cover.extend(cover_) +# print(np.mean(cover), 'coverage so far ') + +def main(nsim=500, full=False): + P0, PA, cover, length_int = [], [], [], [] + from statsmodels.distributions import ECDF + + n, p, s = 500, 100, 5 - cover = [] for i in range(nsim): + if full: + if n > p: + full_dispersion = True + else: + full_dispersion = False + p0, pA, cover_, intervals = test_full_targets(n=n, p=p, s=s, full_dispersion=full_dispersion) + avg_length = intervals[:, 1] - intervals[:, 0] + else: + full_dispersion = True + p0, pA, cover_, intervals = test_selected_targets(n=n, p=p, s=s, + full_dispersion=full_dispersion) + avg_length = intervals[:, 1] - intervals[:, 0] - cover_ = test_instance() cover.extend(cover_) - print(np.mean(cover), 'coverage so far ') + P0.extend(p0) + PA.extend(pA) + print( + np.array(PA) < 0.1, np.mean(P0), np.std(P0), np.mean(np.array(P0) < 0.1), np.mean(np.array(PA) < 0.1), np.mean(cover), + np.mean(avg_length), 'null pvalue + power + length') + if __name__ == "__main__": - main(nsim=500) + main(nsim=100) diff --git a/selectinf/randomized/tests/test_topK.py b/selectinf/randomized/tests/test_topK.py index 000c45aba..8091f8ac3 100644 --- a/selectinf/randomized/tests/test_topK.py +++ b/selectinf/randomized/tests/test_topK.py @@ -10,7 +10,7 @@ def test_topK(n=500, s=5, sigma=3, rho=0.4, - randomizer_scale=0.25, + randomizer_scale=0.50, use_MLE=True, marginal=False): @@ -85,29 +85,107 @@ def test_both(): test_topK(marginal=True) test_topK(marginal=False) -def main(nsim=5000, use_MLE=False): +def test_bias_topK(n=500, + p=50, + s=5, + sigma=3, + rho=0.4, + randomizer_scale=0.50, + K=5, + marginal=False): - import matplotlib.pyplot as plt - import statsmodels.api as sm - U = np.linspace(0, 1, 101) + while True: + X = gaussian_instance(n=n, + p=p, + equicorrelated=False, + rho=rho)[0] + W = rho**(np.fabs(np.subtract.outer(np.arange(p), np.arange(p)))) + sqrtW = np.linalg.cholesky(W) + sigma = 0.15 + Z = np.random.standard_normal(p).dot(sqrtW.T) * sigma + beta = (2 * np.random.binomial(1, 0.5, size=(p,)) - 1) * 5 * sigma + beta[s:] = 0 + np.random.shuffle(beta) + + true_mean = W.dot(beta) + score = Z + true_mean + idx = np.arange(p) + + n, p = X.shape + + randomizer = randomization.isotropic_gaussian(p, randomizer_scale * sigma) + topK_select = topK(score, + W * sigma**2, + randomizer, + K) + + boundary = topK_select.fit() + nonzero = boundary != 0 + + if nonzero.sum() > 0: + + if marginal: + beta_target = true_mean[nonzero] + (observed_target, + cov_target, + crosscov_target_score, + alternatives) = topK_select.marginal_targets(nonzero) + else: + beta_target = beta[nonzero] + (observed_target, + cov_target, + crosscov_target_score, + alternatives) = topK_select.multivariate_targets(nonzero, dispersion=sigma**2) + + result = topK_select.selective_MLE(observed_target, + cov_target, + crosscov_target_score)[0] + + bias_mle = np.asarray(result['MLE'])-beta_target + bias_indest = np.asarray(result['unbiased'])-beta_target + print("check ", np.asarray(result['MLE']), np.asarray(result['unbiased'])) + + return bias_mle, bias_indest + + +# def main(nsim=5000, use_MLE=False): +# +# import 
matplotlib.pyplot as plt +# import statsmodels.api as sm +# U = np.linspace(0, 1, 101) +# +# P0, PA, cover, length_int = [], [], [], [] +# for i in range(nsim): +# p0, pA, cover_, intervals = test_topK(use_MLE=use_MLE) +# +# cover.extend(cover_) +# P0.extend(p0) +# PA.extend(pA) +# print(np.mean(cover),'coverage so far') +# +# period = 10 +# if use_MLE: +# period = 50 +# if i % period == 0 and i > 0: +# plt.clf() +# plt.plot(U, sm.distributions.ECDF(P0)(U), 'b', label='null') +# plt.plot(U, sm.distributions.ECDF(PA)(U), 'r', label='alt') +# plt.plot([0, 1], [0, 1], 'k--') +# plt.legend() +# plt.savefig('topK_pvals.pdf') + + +def main(nsim=500): + _bias_mle = [] + _bias_indest = [] - P0, PA, cover, length_int = [], [], [], [] for i in range(nsim): - p0, pA, cover_, intervals = test_topK(use_MLE=use_MLE) - - cover.extend(cover_) - P0.extend(p0) - PA.extend(pA) - print(np.mean(cover),'coverage so far') - - period = 10 - if use_MLE: - period = 50 - if i % period == 0 and i > 0: - plt.clf() - plt.plot(U, sm.distributions.ECDF(P0)(U), 'b', label='null') - plt.plot(U, sm.distributions.ECDF(PA)(U), 'r', label='alt') - plt.plot([0, 1], [0, 1], 'k--') - plt.legend() - plt.savefig('topK_pvals.pdf') + bias_mle, bias_indest = test_bias_topK() + _bias_mle.extend(bias_mle) + _bias_indest.extend(bias_indest) + + print(np.mean(_bias_mle), np.mean(_bias_indest), 'bias so far: mle and independent estimate ') + +if __name__ == "__main__": + main(nsim=500) From 54478ed23b24df31f801d5deb5cdecf85b986883 Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Sat, 9 Jan 2021 13:31:51 -0500 Subject: [PATCH 071/187] commit before switch --- selectinf/randomized/query.py | 1 - 1 file changed, 1 deletion(-) diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index b46aab9a7..aa1cbd8a6 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -1699,7 +1699,6 @@ def selective_MLE(observed_target, ind_unbiased_estimator = observed_target + target_cov.dot(target_lin.T.dot(prec_opt.dot(cond_mean - init_soln))) - print("check within MLE ", soln, init_soln) L = target_lin.T.dot(prec_opt) observed_info_natural = prec_target + L.dot(target_lin) - L.dot(hess.dot(L.T)) observed_info_mean = target_cov.dot(observed_info_natural.dot(target_cov)) From e45a42e54961823b9d842c31c8ce1c34d2122d49 Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Wed, 3 Feb 2021 13:56:21 -0500 Subject: [PATCH 072/187] MCMC free pivots for group lasso --- .../randomized/approx_reference_grouplasso.py | 776 ++++++++++++++++++ .../tests/test_approx_reference_grouplasso.py | 88 ++ selectinf/tests/instance.py | 94 +++ 3 files changed, 958 insertions(+) create mode 100644 selectinf/randomized/approx_reference_grouplasso.py create mode 100644 selectinf/randomized/tests/test_approx_reference_grouplasso.py diff --git a/selectinf/randomized/approx_reference_grouplasso.py b/selectinf/randomized/approx_reference_grouplasso.py new file mode 100644 index 000000000..f028fcbe3 --- /dev/null +++ b/selectinf/randomized/approx_reference_grouplasso.py @@ -0,0 +1,776 @@ +from __future__ import print_function +from scipy.linalg import block_diag +from scipy.stats import norm as ndist +from scipy.interpolate import interp1d + +import collections +import numpy as np +from numpy import log +from numpy.linalg import norm, qr, inv, eig +import pandas as pd + +import regreg.api as rr +from .randomization import randomization +from ..base import restricted_estimator +from .query import _solve_barrier_affine_py +from 
..distributions.discrete_family import discrete_family + +class group_lasso(object): + + def __init__(self, + loglike, + groups, + weights, + ridge_term, + randomizer, + use_lasso=True, # should lasso solver be used where applicable - defaults to True + perturb=None): + + _check_groups(groups) # make sure groups looks sensible + + # log likelihood : quadratic loss + self.loglike = loglike + self.nfeature = self.loglike.shape[0] + + # ridge parameter + self.ridge_term = ridge_term + + # group lasso penalty (from regreg) + # use regular lasso penalty if all groups are size 1 + if use_lasso and groups.size == np.unique(groups).size: + # need to provide weights an an np.array rather than a dictionary + weights_np = np.array([w[1] for w in sorted(weights.items())]) + self.penalty = rr.weighted_l1norm(weights=weights_np, + lagrange=1.) + else: + self.penalty = rr.group_lasso(groups, + weights=weights, + lagrange=1.) + + # store groups as a class variable since the non-group lasso doesn't + self.groups = groups + + self._initial_omega = perturb + + # gaussian randomization + self.randomizer = randomizer + + def fit(self, + solve_args={'tol': 1.e-12, 'min_its': 50}, + perturb=None): + + # solve the randomized version of group lasso + (self.initial_soln, + self.initial_subgrad) = self._solve_randomized_problem(perturb=perturb, + solve_args=solve_args) + + # initialize variables + active_groups = [] # active group labels + active_dirs = {} # dictionary: keys are group labels, values are unit-norm coefficients + unpenalized = [] # selected groups with no penalty + overall = np.ones(self.nfeature, np.bool) # mask of active features + ordered_groups = [] # active group labels sorted by label + ordered_opt = [] # gamma's ordered by group labels + ordered_vars = [] # indices "ordered" by sorting group labels + + tol = 1.e-20 + + # now we are collecting the directions and norms of the active groups + for g in sorted(np.unique(self.groups)): # g is group label + + group_mask = self.groups == g + soln = self.initial_soln # do not need to keep setting this + + if norm(soln[group_mask]) > tol * norm(soln): # is group g appreciably nonzero + ordered_groups.append(g) + + # variables in active group + ordered_vars.extend(np.flatnonzero(group_mask)) + + if self.penalty.weights[g] == 0: + unpenalized.append(g) + + else: + active_groups.append(g) + active_dirs[g] = soln[group_mask] / norm(soln[group_mask]) + + ordered_opt.append(norm(soln[group_mask])) + else: + overall[group_mask] = False + + self.selection_variable = {'directions': active_dirs, + 'active_groups': active_groups} # kind of redundant with keys of active_dirs + + self._ordered_groups = ordered_groups + + # exception if no groups are selected + if len(self.selection_variable['active_groups']) == 0: + return np.sign(soln), soln + + # otherwise continue as before + self.observed_opt_state = np.hstack(ordered_opt) # gammas as array + + _beta_unpenalized = restricted_estimator(self.loglike, # refit OLS on E + overall, + solve_args=solve_args) + + beta_bar = np.zeros(self.nfeature) + beta_bar[overall] = _beta_unpenalized # refit OLS beta with zeros + self._beta_full = beta_bar + + X, y = self.loglike.data + W = self._W = self.loglike.saturated_loss.hessian(X.dot(beta_bar)) # all 1's for LS + opt_linearNoU = np.dot(X.T, X[:, ordered_vars] * W[:, np.newaxis]) + + for i, var in enumerate(ordered_vars): + opt_linearNoU[var, i] += self.ridge_term + + opt_offset = self.initial_subgrad + + self.observed_score_state = -opt_linearNoU.dot(_beta_unpenalized) + 
self.observed_score_state[~overall] += self.loglike.smooth_objective(beta_bar, 'grad')[~overall] + + active_signs = np.sign(self.initial_soln) + active = np.flatnonzero(active_signs) + self.active = active + + def compute_Vg(ug): + pg = ug.size # figure out size of g'th group + if pg > 1: + Z = np.column_stack((ug, np.eye(pg, pg - 1))) + Q, _ = qr(Z) + Vg = Q[:, 1:] # drop the first column + else: + Vg = np.zeros((1, 0)) # if the group is size one, the orthogonal complement is empty + return Vg + + def compute_Lg(g): + pg = active_dirs[g].size + Lg = self.penalty.weights[g] * np.eye(pg) + return Lg + + sorted_active_dirs = collections.OrderedDict(sorted(active_dirs.items())) + + Vs = [compute_Vg(ug) for ug in sorted_active_dirs.values()] + V = block_diag(*Vs) # unpack the list + Ls = [compute_Lg(g) for g in sorted_active_dirs] + L = block_diag(*Ls) # unpack the list + XE = X[:, ordered_vars] # changed to ordered_vars + Q = XE.T.dot(self._W[:, None] * XE) + QI = inv(Q) + C = V.T.dot(QI).dot(L).dot(V) + + self.XE = XE + self.Q = Q + self.QI = QI + self.C = C + + U = block_diag(*[ug for ug in sorted_active_dirs.values()]).T + + self.opt_linear = opt_linearNoU.dot(U) + self.active_dirs = active_dirs + self.opt_offset = opt_offset + self.ordered_vars = ordered_vars + + self.linear_part = -np.eye(self.observed_opt_state.shape[0]) + self.offset = np.zeros(self.observed_opt_state.shape[0]) + + # print("K.K.T. map", np.allclose(self._initial_omega, self.observed_score_state + self.opt_linear.dot(self.observed_opt_state) + # + self.opt_offset, rtol=1e-03)) + return active_signs, soln + + def _solve_randomized_problem(self, + perturb=None, + solve_args={'tol': 1.e-15, 'min_its': 100}): + + # take a new perturbation if supplied + if perturb is not None: + self._initial_omega = perturb + if self._initial_omega is None: + self._initial_omega = self.randomizer.sample() + + quad = rr.identity_quadratic(self.ridge_term, + 0, + -self._initial_omega, + 0) + + problem = rr.simple_problem(self.loglike, self.penalty) + + # if all groups are size 1, set up lasso penalty and run usual lasso solver... (see existing code)... + + initial_soln = problem.solve(quad, **solve_args) + initial_subgrad = -(self.loglike.smooth_objective(initial_soln, + 'grad') + + quad.objective(initial_soln, 'grad')) + + return initial_soln, initial_subgrad + + @staticmethod + def gaussian(X, + Y, + groups, + weights, + sigma=1., + quadratic=None, + ridge_term=0., + perturb=None, + use_lasso=True, # should lasso solver be used when applicable - defaults to True + randomizer_scale=None): + + loglike = rr.glm.gaussian(X, Y, coef=1. 
/ sigma ** 2, quadratic=quadratic) + n, p = X.shape + + mean_diag = np.mean((X ** 2).sum(0)) + if ridge_term is None: + ridge_term = np.std(Y) * np.sqrt(mean_diag) / np.sqrt(n - 1) + + if randomizer_scale is None: + randomizer_scale = np.sqrt(mean_diag) * 0.5 * np.std(Y) * np.sqrt(n / (n - 1.)) + + randomizer = randomization.isotropic_gaussian((p,), randomizer_scale) + + return group_lasso(loglike, + groups, + weights, + ridge_term, + randomizer, + use_lasso, + perturb) + + def _setup_implied_gaussian(self): + + _, prec = self.randomizer.cov_prec + + if np.asarray(prec).shape in [(), (0,)]: + cond_precision = self.opt_linear.T.dot(self.opt_linear) * prec + cond_cov = inv(cond_precision) + logdens_linear = cond_cov.dot(self.opt_linear.T) * prec + else: + cond_precision = self.opt_linear.T.dot(prec.dot(self.opt_linear)) + cond_cov = inv(cond_precision) + logdens_linear = cond_cov.dot(self.opt_linear.T).dot(prec) + + cond_mean = -logdens_linear.dot(self.observed_score_state + self.opt_offset) + self.cond_mean = cond_mean + self.cond_cov = cond_cov + self.cond_precision = cond_precision + self.logdens_linear = logdens_linear + + return cond_mean, cond_cov, cond_precision, logdens_linear + + def selective_MLE(self, + solve_args={'tol': 1.e-12}, + level=0.9, + useJacobian=True, + dispersion=None): + + """Do selective_MLE for group_lasso + Note: this masks the selective_MLE inherited from query + because that is not adapted for the group_lasso. Also, assumes + you have already run the fit method since this uses results + from that method. + Parameters + ---------- + observed_target: from selected_targets + target_cov: from selected_targets + target_cov_score: from selected_targets + init_soln: (opt_state) initial (observed) value of optimization variables + cond_mean: conditional mean of optimization variables (model on _setup_implied_gaussian) + cond_cov: conditional variance of optimization variables (model on _setup_implied_gaussian) + logdens_linear: (model on _setup_implied_gaussian) + linear_part: like A_scaling (from lasso) + offset: like b_scaling (from lasso) + solve_args: passed on to solver + level: level of confidence intervals + useC: whether to use python or C solver + JacobianPieces: (use self.C defined in fitting) + """ + + self._setup_implied_gaussian() # Calculate useful quantities + (observed_target, target_cov, target_score_cov, alternatives) = self.selected_targets(dispersion) + + init_soln = self.observed_opt_state # just the gammas + cond_mean = self.cond_mean + cond_cov = self.cond_cov + logdens_linear = self.logdens_linear + linear_part = self.linear_part + offset = self.offset + + if np.asarray(observed_target).shape in [(), (0,)]: + raise ValueError('no target specified') + + observed_target = np.atleast_1d(observed_target) + prec_target = inv(target_cov) + + # target_lin determines how the conditional mean of optimization variables + # vary with target + # logdens_linear determines how the argument of the optimization density + # depends on the score, not how the mean depends on score, hence the minus sign + + target_lin = - logdens_linear.dot(target_score_cov.T.dot(prec_target)) + target_offset = cond_mean - target_lin.dot(observed_target) + + prec_opt = self.cond_precision + + conjugate_arg = prec_opt.dot(cond_mean) + + val, soln, hess = solve_barrier_affine_jacobian_py(conjugate_arg, + prec_opt, + init_soln, + linear_part, + offset, + self.C, + self.active_dirs, + useJacobian, + **solve_args) + + log_ref = val + conjugate_arg.T.dot(cond_cov).dot(conjugate_arg) / 2. 
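        # Annotation (not from the original patch): unlike the plain
        # solve_barrier_affine solvers used elsewhere, the objective of
        # solve_barrier_affine_jacobian_py above also includes the negative
        # log-Jacobian term -jacobian_grad_hess(gs, C, active_dirs)[0], with
        # C = V^T Q^{-1} Lambda V assembled in fit().  This extra term accounts
        # for the change of variables from the group-lasso solution to the
        # group norms `gamma` and their unit directions.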
+ + final_estimator = observed_target + target_cov.dot(target_lin.T.dot(prec_opt.dot(cond_mean - soln))) + ind_unbiased_estimator = observed_target + target_cov.dot(target_lin.T.dot(prec_opt.dot(cond_mean + - init_soln))) + L = target_lin.T.dot(prec_opt) + observed_info_natural = prec_target + L.dot(target_lin) - L.dot(hess.dot(L.T)) + observed_info_mean = target_cov.dot(observed_info_natural.dot(target_cov)) + + Z_scores = final_estimator / np.sqrt(np.diag(observed_info_mean)) + pvalues = ndist.cdf(Z_scores) + pvalues = 2 * np.minimum(pvalues, 1 - pvalues) + + alpha = 1. - level + quantile = ndist.ppf(1 - alpha / 2.) + intervals = np.vstack([final_estimator - quantile * np.sqrt(np.diag(observed_info_mean)), + final_estimator + quantile * np.sqrt(np.diag(observed_info_mean))]).T + + result = pd.DataFrame({'MLE': final_estimator, + 'SE': np.sqrt(np.diag(observed_info_mean)), + 'Zvalue': Z_scores, + 'pvalue': pvalues, + 'lower_confidence': intervals[:, 0], + 'upper_confidence': intervals[:, 1], + 'unbiased': ind_unbiased_estimator}) + + return result, observed_info_mean, log_ref + + def selected_targets(self, + dispersion=None, + solve_args={'tol': 1.e-12, 'min_its': 50}): + + X, y = self.loglike.data + n, p = X.shape + + XE = self.XE + Q = self.Q + observed_target = restricted_estimator(self.loglike, self.ordered_vars, solve_args=solve_args) + _score_linear = -XE.T.dot(self._W[:, None] * X).T + alternatives = ['twosided'] * len(self.active) + + if dispersion is None: # use Pearson's X^2 + dispersion = ((y - self.loglike.saturated_loss.mean_function( + XE.dot(observed_target))) ** 2 / self._W).sum() / (n - XE.shape[1]) + + cov_target = self.QI * dispersion + crosscov_target_score = _score_linear.dot(self.QI).T * dispersion + + return (observed_target, + cov_target, + crosscov_target_score, + alternatives) + + +class approximate_grid_inference(object): + + def __init__(self, + query, + dispersion, + solve_args={'tol': 1.e-12}): + + """ + Produce p-values and confidence intervals for targets + of model including selected features + Parameters + ---------- + query : `gaussian_query` + A Gaussian query which has information + to describe implied Gaussian. + observed_target : ndarray + Observed estimate of target. + target_cov : ndarray + Estimated covaraince of target. + target_score_cov : ndarray + Estimated covariance of target and score of randomized query. + solve_args : dict, optional + Arguments passed to solver. 
+ """ + + self.solve_args = solve_args + + result, inverse_info = query.selective_MLE(dispersion=dispersion)[:2] + + (observed_target, target_cov, target_score_cov, alternatives) = query.selected_targets(dispersion) + + self.observed_target = observed_target + self.target_score_cov = target_score_cov + self.target_cov = target_cov + + self.linear_part = query.linear_part + self.offset = query.offset + + self.logdens_linear = query.logdens_linear + self.cond_mean = query.cond_mean + self.prec_opt = np.linalg.inv(query.cond_cov) + self.cond_cov = query.cond_cov + self.C = query.C + self.active_dirs = query.active_dirs + + self.init_soln = query.observed_opt_state + + self.ntarget = ntarget = target_cov.shape[0] + _scale = 4 * np.sqrt(np.diag(inverse_info)) + ngrid = 40 + + self.stat_grid = np.zeros((ntarget, ngrid)) + for j in range(ntarget): + self.stat_grid[j, :] = np.linspace(observed_target[j] - 1.5 * _scale[j], + observed_target[j] + 1.5 * _scale[j], + num=ngrid) + def summary(self, + alternatives=None, + parameter=None, + level=0.9): + """ + Produce p-values and confidence intervals for targets + of model including selected features + Parameters + ---------- + alternatives : [str], optional + Sequence of strings describing the alternatives, + should be values of ['twosided', 'less', 'greater'] + parameter : np.array + Hypothesized value for parameter -- defaults to 0. + level : float + Confidence level. + """ + + if parameter is not None: + pivots = self.approx_pivots(parameter, + alternatives=alternatives) + else: + pivots = None + + pvalues = self._approx_pivots(np.zeros_like(self.observed_target), + alternatives=alternatives) + lower, upper = self._approx_intervals(level=level) + + result = pd.DataFrame({'target': self.observed_target, + 'pvalue': pvalues, + 'lower_confidence': lower, + 'upper_confidence': upper}) + + if not np.all(parameter == 0): + result.insert(4, 'pivot', pivots) + result.insert(5, 'parameter', parameter) + + return result + + def _approx_log_reference(self, + observed_target, + target_cov, + target_score_cov, + grid): + + """ + Approximate the log of the reference density on a grid. + """ + if np.asarray(observed_target).shape in [(), (0,)]: + raise ValueError('no target specified') + + prec_target = np.linalg.inv(target_cov) + target_lin = - self.logdens_linear.dot(target_score_cov.T.dot(prec_target)) + + ref_hat = [] + solver = _solve_barrier_affine_py + + for k in range(grid.shape[0]): + + cond_mean_grid = (target_lin.dot(np.atleast_1d(grid[k] - observed_target)) + + self.cond_mean) + conjugate_arg = self.prec_opt.dot(cond_mean_grid) + + val, soln, _ = solver(conjugate_arg, + self.prec_opt, + self.init_soln, + self.linear_part, + self.offset, + **self.solve_args) + + log_jacob = jacobian_grad_hess(soln, self.C, self.active_dirs) + + ref_hat.append(-val - (conjugate_arg.T.dot(self.cond_cov).dot(conjugate_arg) / 2.) 
+ log_jacob[0]) + + return np.asarray(ref_hat) + + def _construct_families(self): + + self._families = [] + for m in range(self.ntarget): + p = self.target_score_cov.shape[1] + observed_target_uni = (self.observed_target[m]).reshape((1,)) + target_cov_uni = (np.diag(self.target_cov)[m]).reshape((1, 1)) + var_target = target_cov_uni[0, 0] + target_score_cov_uni = self.target_score_cov[m, :].reshape((1, p)) + + approx_log_ref = self._approx_log_reference(observed_target_uni, + target_cov_uni, + target_score_cov_uni, + self.stat_grid[m]) + + approx_fn = interp1d(self.stat_grid[m], + approx_log_ref, + kind='quadratic', + bounds_error=False, + fill_value='extrapolate') + + grid = np.linspace(self.stat_grid[m].min(), self.stat_grid[m].max(), 1000) + logW = (approx_fn(grid) - + 0.5 * (grid - self.observed_target[m]) ** 2 / var_target) + logW -= logW.max() + + # construction of families follows `selectinf.learning.core` + + self._families.append(discrete_family(grid, + np.exp(logW))) + + + def _approx_pivots(self, + mean_parameter, + alternatives=None): + + if not hasattr(self, "_families"): + self._construct_families() + + if alternatives is None: + alternatives = ['twosided'] * self.ntarget + + pivot = [] + + for m in range(self.ntarget): + print("variable computed ", m) + family = self._families[m] + observed_target = self.observed_target[m] + var_target = self.target_cov[m, m] + + # construction of pivot from families follows `selectinf.learning.core` + + _cdf = family.cdf((mean_parameter[m] - observed_target) / var_target, + x=observed_target) + if alternatives[m] == 'twosided': + pivot.append(2 * min(_cdf, 1 - _cdf)) + elif alternatives[m] == 'greater': + pivot.append(1 - _cdf) + elif alternatives[m] == 'less': + pivot.append(_cdf) + else: + raise ValueError('alternative should be in ["twosided", "less", "greater"]') + return pivot + + def _approx_intervals(self, + level=0.9): + + if not hasattr(self, "_families"): + self._construct_families() + + lower, upper = [], [] + + for m in range(self.ntarget): + # construction of intervals from families follows `selectinf.learning.core` + family = self._families[m] + observed_target = self.observed_target[m] + l, u = family.equal_tailed_interval(observed_target, + alpha=1 - level) + var_target = self.target_cov[m, m] + lower.append(l * var_target + observed_target) + upper.append(u * var_target + observed_target) + + return np.asarray(lower), np.asarray(upper) + + +def solve_barrier_affine_jacobian_py(conjugate_arg, + precision, + feasible_point, + con_linear, + con_offset, + C, + active_dirs, + useJacobian=True, + step=1, + nstep=2000, + min_its=500, + tol=1.e-12): + """ + This needs to be updated to actually use the Jacobian information (in self.C) + arguments + conjugate_arg: \\bar{\\Sigma}^{-1} \bar{\\mu} + precision: \\bar{\\Sigma}^{-1} + feasible_point: gamma's from fitting + con_linear: linear part of affine constraint used for barrier function + con_offset: offset part of affine constraint used for barrier function + C: V^T Q^{-1} \\Lambda V + active_dirs: + """ + scaling = np.sqrt(np.diag(con_linear.dot(precision).dot(con_linear.T))) + + if feasible_point is None: + feasible_point = 1. / scaling + + def objective(gs): + p1 = -gs.T.dot(conjugate_arg) + p2 = gs.T.dot(precision).dot(gs) / 2. + if useJacobian: + p3 = - jacobian_grad_hess(gs, C, active_dirs)[0] + else: + p3 = 0 + p4 = log(1. + 1. 
/ ((con_offset - con_linear.dot(gs)) / scaling)).sum() + return p1 + p2 + p3 + p4 + + def grad(gs): + p1 = -conjugate_arg + precision.dot(gs) + p2 = -con_linear.T.dot(1. / (scaling + con_offset - con_linear.dot(gs))) + if useJacobian: + p3 = - jacobian_grad_hess(gs, C, active_dirs)[1] + else: + p3 = 0 + p4 = 1. / (con_offset - con_linear.dot(gs)) + return p1 + p2 + p3 + p4 + + def barrier_hessian(gs): # contribution of barrier and jacobian to hessian + p1 = con_linear.T.dot(np.diag(-1. / ((scaling + con_offset - con_linear.dot(gs)) ** 2.) + + 1. / ((con_offset - con_linear.dot(gs)) ** 2.))).dot(con_linear) + if useJacobian: + p2 = - jacobian_grad_hess(gs, C, active_dirs)[2] + else: + p2 = 0 + return p1 + p2 + + current = feasible_point + current_value = np.inf + + for itercount in range(nstep): + cur_grad = grad(current) + + # make sure proposal is feasible + + count = 0 + while True: + count += 1 + proposal = current - step * cur_grad + if np.all(con_offset - con_linear.dot(proposal) > 0): + break + step *= 0.5 + if count >= 40: + raise ValueError('not finding a feasible point') + # make sure proposal is a descent + + count = 0 + while True: + count += 1 + proposal = current - step * cur_grad + proposed_value = objective(proposal) + if proposed_value <= current_value: + break + step *= 0.5 + if count >= 20: + if not (np.isnan(proposed_value) or np.isnan(current_value)): + break + else: + raise ValueError('value is NaN: %f, %f' % (proposed_value, current_value)) + + # stop if relative decrease is small + + if np.fabs(current_value - proposed_value) < tol * np.fabs(current_value) and itercount >= min_its: + current = proposal + current_value = proposed_value + break + + current = proposal + current_value = proposed_value + + if itercount % 4 == 0: + step *= 2 + + hess = inv(precision + barrier_hessian(current)) + return current_value, current, hess + + +# Jacobian calculations +def calc_GammaMinus(gamma, active_dirs): + """Calculate Gamma^minus (as a function of gamma vector, active directions) + """ + to_diag = [[g] * (ug.size - 1) for (g, ug) in zip(gamma, active_dirs.values())] + return block_diag(*[i for gp in to_diag for i in gp]) + + +def jacobian_grad_hess(gamma, C, active_dirs): + """ Calculate the log-Jacobian (scalar), gradient (gamma.size vector) and hessian (gamma.size square matrix) + """ + if C.shape == (0, 0): # when all groups are size one, C will be an empty array + return 0, 0, 0 + else: + GammaMinus = calc_GammaMinus(gamma, active_dirs) + + # eigendecomposition + evalues, evectors = eig(GammaMinus + C) + + # log Jacobian + J = log(evalues).sum() + + # inverse + GpC_inv = evectors.dot(np.diag(1 / evalues).dot(evectors.T)) + + # summing matrix (gamma.size by C.shape[0]) + S = block_diag(*[np.ones((1, ug.size - 1)) for ug in active_dirs.values()]) + + # gradient + grad_J = S.dot(GpC_inv.diagonal()) + + # hessian + hess_J = -S.dot(np.multiply(GpC_inv, GpC_inv.T).dot(S.T)) + + return J, grad_J, hess_J + +def _check_groups(groups): + """Make sure that the user-specific groups are ok + There are a number of assumptions that group_lasso makes about + how groups are specified. Specifically, we assume that + `groups` is a 1-d array_like of integers that are sorted in + increasing order, start at 0, and have no gaps (e.g., if there + is a group 2 and a group 4, there must also be at least one + feature in group 3). + This function checks the user-specified group scheme and + raises an exception if it finds any problems. 
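+    For example, ``groups = [0, 0, 1, 1, 2]`` passes these checks, while
+    ``[0, 0, 2, 2]`` (group 1 is skipped) or ``[1, 1, 2, 2]`` (no group 0)
+    raises a ``ValueError``.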
+ Sorting feature groups is potentially tedious for the user and + in future we might do this for them. + """ + + # check array_like + agroups = np.array(groups) + + # check dimension + if len(agroups.shape) != 1: + raise ValueError("Groups are not a 1D array_like") + + # check sorted + if np.any(agroups[:-1] > agroups[1:]) < 0: + raise ValueError("Groups are not sorted") + + # check integers + if not np.issubdtype(agroups.dtype, np.integer): + raise TypeError("Groups are not integers") + + # check starts with 0 + if not np.amin(agroups) == 0: + raise ValueError("First group is not 0") + + # check for no skipped groups + if not np.all(np.diff(np.unique(agroups)) == 1): + raise ValueError("Some group is skipped") diff --git a/selectinf/randomized/tests/test_approx_reference_grouplasso.py b/selectinf/randomized/tests/test_approx_reference_grouplasso.py new file mode 100644 index 000000000..0f4d64539 --- /dev/null +++ b/selectinf/randomized/tests/test_approx_reference_grouplasso.py @@ -0,0 +1,88 @@ +import numpy as np + +from ...tests.instance import gaussian_instance, gaussian_group_instance +from ..approx_reference_grouplasso import group_lasso, approximate_grid_inference + +def test_approx_pivot(n=500, + p=200, + signal_fac=0.1, + sgroup=3, + groups=np.arange(50).repeat(4), + sigma=3., + rho=0.3, + randomizer_scale=1, + weight_frac=1.2): + + inst, const = gaussian_group_instance, group_lasso.gaussian + signal = np.sqrt(signal_fac * 2 * np.log(p)) + + X, Y, beta = inst(n=n, + p=p, + signal=signal, + sgroup=sgroup, + groups=groups, + equicorrelated=False, + rho=rho, + sigma=sigma, + random_signs=True)[:3] + + n, p = X.shape + + sigma_ = np.std(Y) + dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) + + penalty_weights = dict([(i, weight_frac * sigma_ * np.sqrt(2 * np.log(p))) for i in np.unique(groups)]) + + conv = const(X, + Y, + groups, + penalty_weights, + randomizer_scale=randomizer_scale * dispersion) + + signs, _ = conv.fit() + nonzero = signs != 0 + print("number of selected variables ", nonzero.sum()) + + if nonzero.sum()>0: + + conv._setup_implied_gaussian() + + beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) + + approximate_grid_inf = approximate_grid_inference(conv, + dispersion) + + pivot = approximate_grid_inf._approx_pivots(beta_target) + + return pivot + + +def main(nsim=300, CI = False): + + import matplotlib.pyplot as plt + from statsmodels.distributions.empirical_distribution import ECDF + if CI is False: + _pivot = [] + for i in range(nsim): + _pivot.extend(test_approx_pivot(n=500, + p=100, + signal_fac=0.3, + sgroup=3, + groups=np.arange(20).repeat(5), + sigma=1., + rho=0.20, + randomizer_scale=0.5, + weight_frac=1.)) + + print("iteration completed ", i) + + plt.clf() + ecdf_MLE = ECDF(np.asarray(_pivot)) + grid = np.linspace(0, 1, 101) + plt.plot(grid, ecdf_MLE(grid), c='blue', marker='^') + plt.plot(grid, grid, 'k--') + plt.show() + +if __name__ == "__main__": + + main(nsim=50, CI = False) diff --git a/selectinf/tests/instance.py b/selectinf/tests/instance.py index 15826a148..9a75a8ded 100644 --- a/selectinf/tests/instance.py +++ b/selectinf/tests/instance.py @@ -373,3 +373,97 @@ def HIV_NRTI(drug='3TC', Y -= Y.mean() X_NRTI -= X_NRTI.mean(0)[None, :]; X_NRTI /= X_NRTI.std(0)[None,:] return X_NRTI, Y, np.array(NRTI_muts) + + +def gaussian_group_instance(n=100, p=200, sgroup=7, sigma=5, rho=0., signal=7, + random_signs=False, df=np.inf, + scale=True, center=True, + groups=np.arange(20).repeat(10), + equicorrelated=True): + """A 
testing instance for the group LASSO. + If equicorrelated is True design is equi-correlated in the population, + normalized to have columns of norm 1. + If equicorrelated is False design is auto-regressive. + For the default settings, a $\\lambda$ of around 13.5 + corresponds to the theoretical $E(\\|X^T\\epsilon\\|_{\\infty})$ + with $\\epsilon \\sim N(0, \\sigma^2 I)$. + Parameters + ---------- + n : int + Sample size + p : int + Number of features + sgroup : int + True sparsity (number of active groups) + groups : array_like (1d, size == p) + Assignment of features to (non-overlapping) groups + sigma : float + Noise level + rho : float + Equicorrelation value (must be in interval [0,1]) + signal : float or (float, float) + Sizes for the coefficients. If a tuple -- then coefficients + are equally spaced between these values using np.linspace. + Note: the size of signal is for a "normalized" design, where np.diag(X.T.dot(X)) == np.ones(p). + If scale=False, this signal is divided by np.sqrt(n), otherwise it is unchanged. + random_signs : bool + If true, assign random signs to coefficients. + Else they are all positive. + df : int + Degrees of freedom for noise (from T distribution). + equicorrelated: bool + If true, design in equi-correlated, + Else design is AR. + Returns + ------- + X : np.float((n,p)) + Design matrix. + y : np.float(n) + Response vector. + beta : np.float(p) + True coefficients. + active : np.int(s) + Non-zero pattern. + sigma : float + Noise level. + sigmaX : np.ndarray((p,p)) + Row covariance. + """ + + X, sigmaX = _design(n, p, rho, equicorrelated)[:2] + + if center: + X -= X.mean(0)[None, :] + + beta = np.zeros(p) + signal = np.atleast_1d(signal) + + group_labels = np.unique(groups) + group_active = np.random.choice(group_labels, sgroup, replace=False) + + active = np.isin(groups, group_active) + + if signal.shape == (1,): + beta[active] = signal[0] + else: + beta[active] = np.linspace(signal[0], signal[1], active.sum()) + if random_signs: + beta[active] *= (2 * np.random.binomial(1, 0.5, size=(active.sum(),)) - 1.) 
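+    # `signal` refers to a normalized design: beta is first divided by
+    # sqrt(n); if `scale` is True the columns of X are rescaled to have
+    # (roughly) unit norm and beta is multiplied back by sqrt(n), so the
+    # effect sizes stay on the normalized scale either way.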
+ beta /= np.sqrt(n) + + if scale: + scaling = X.std(0) * np.sqrt(n) + X /= scaling[None, :] + beta *= np.sqrt(n) + sigmaX = sigmaX / np.multiply.outer(scaling, scaling) + + # noise model + def _noise(n, df=np.inf): + if df == np.inf: + return np.random.standard_normal(n) + else: + sd_t = np.std(tdist.rvs(df, size=50000)) + return tdist.rvs(df, size=n) / sd_t + + Y = (X.dot(beta) + _noise(n, df)) * sigma + return X, Y, beta * sigma, np.nonzero(active)[0], sigma, sigmaX \ No newline at end of file From bc2107eb9f3846c0cc2c9d42951b880a0eb42037 Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Thu, 25 Feb 2021 12:29:09 -0500 Subject: [PATCH 073/187] commit changes before switch --- selectinf/randomized/tests/test_approx_reference_grouplasso.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/selectinf/randomized/tests/test_approx_reference_grouplasso.py b/selectinf/randomized/tests/test_approx_reference_grouplasso.py index 0f4d64539..0b4f53474 100644 --- a/selectinf/randomized/tests/test_approx_reference_grouplasso.py +++ b/selectinf/randomized/tests/test_approx_reference_grouplasso.py @@ -1,6 +1,6 @@ import numpy as np -from ...tests.instance import gaussian_instance, gaussian_group_instance +from ...tests.instance import gaussian_group_instance from ..approx_reference_grouplasso import group_lasso, approximate_grid_inference def test_approx_pivot(n=500, From c04d7a6f54a0fcc87cadf87f55e163ae1c3603f9 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Tue, 2 Mar 2021 15:43:28 -0800 Subject: [PATCH 074/187] BF: misnamed columns --- selectinf/randomized/tests/test_selective_MLE_high.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/selectinf/randomized/tests/test_selective_MLE_high.py b/selectinf/randomized/tests/test_selective_MLE_high.py index 578ae66ec..7ea737021 100644 --- a/selectinf/randomized/tests/test_selective_MLE_high.py +++ b/selectinf/randomized/tests/test_selective_MLE_high.py @@ -74,7 +74,9 @@ def test_full_targets(n=200, cov_target_score)[0] pval = result['pvalue'] estimate = result['MLE'] - intervals = np.asarray(result[['lower_confidence', 'upper_confidence']]) + intervals = np.asarray(result[['lower_confidence', + 'upper_confidence']]) + print("estimate, intervals", estimate, intervals) coverage = (beta[nonzero] > intervals[:, 0]) * (beta[nonzero] < intervals[:, 1]) @@ -142,7 +144,8 @@ def test_selected_targets(n=2000, cov_target_score)[0] estimate = result['MLE'] pval = result['pvalue'] - intervals = np.asarray(result[['lower_confidence', 'upper_confidence']]) + intervals = np.asarray(result[['lower_confidence', + 'upper_confidence']]) beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) @@ -210,7 +213,8 @@ def test_instance(): cov_target_score)[0] estimate = result['MLE'] pval = result['pvalue'] - intervals = np.asarray(result[['lower_confidence', 'upper_confidence']]) + intervals = np.asarray(result[['lower_confidence', + 'upper_confidence']]) beta_target = np.linalg.pinv(X[:, M]).dot(X.dot(beta)) From fc9b0a31ebf91ddffa989b5b699e0bbf5f2c4ce7 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Tue, 2 Mar 2021 15:44:56 -0800 Subject: [PATCH 075/187] BF: renaming of module --- doc/learning_examples/lasso_CV/lasso_example_CV.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/learning_examples/lasso_CV/lasso_example_CV.py b/doc/learning_examples/lasso_CV/lasso_example_CV.py index ad08a05a9..f0b6fa0f5 100644 --- a/doc/learning_examples/lasso_CV/lasso_example_CV.py +++ 
b/doc/learning_examples/lasso_CV/lasso_example_CV.py @@ -5,11 +5,11 @@ import regreg.api as rr -from selection.tests.instance import gaussian_instance +from selectinf.tests.instance import gaussian_instance -from selection.learning.utils import full_model_inference, pivot_plot -from selection.learning.core import split_sampler, probit_fit -from selection.learning.Rutils import lasso_glmnet +from selectinf.learning.utils import full_model_inference, pivot_plot +from selectinf.learning.core import split_sampler, probit_fit +from selectinf.learning.Rutils import lasso_glmnet def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1): From f9a19787b837cb4883ed00cafba19781e72c2f0a Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Tue, 2 Mar 2021 15:50:59 -0800 Subject: [PATCH 076/187] fix some warnings about literal comparison --- selectinf/algorithms/tests/test_compareR.py | 2 +- selectinf/randomized/tests/test_lasso.py | 4 +--- selectinf/randomized/tests/test_slope.py | 4 ++-- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/selectinf/algorithms/tests/test_compareR.py b/selectinf/algorithms/tests/test_compareR.py index 58ac797cb..81b01d877 100644 --- a/selectinf/algorithms/tests/test_compareR.py +++ b/selectinf/algorithms/tests/test_compareR.py @@ -875,7 +875,7 @@ def test_rlasso_gaussian(): random_signs=True) sigma_ = np.std(y) - if target is not 'debiased': + if target != 'debiased': lam = np.ones(X.shape[1]) * np.sqrt(1.5 * np.log(p)) * sigma_ else: lam = np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ diff --git a/selectinf/randomized/tests/test_lasso.py b/selectinf/randomized/tests/test_lasso.py index 507a80d63..5d0f2bd63 100644 --- a/selectinf/randomized/tests/test_lasso.py +++ b/selectinf/randomized/tests/test_lasso.py @@ -41,7 +41,7 @@ def test_highdim_lasso(n=500, n, p = X.shape sigma_ = np.std(Y) - if target is not 'debiased': + if target != 'debiased': W = np.ones(X.shape[1]) * np.sqrt(1.5 * np.log(p)) * sigma_ else: W = np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ @@ -394,5 +394,3 @@ def main(nsim=500, n=500, p=200, sqrt=False, target='full', sigma=3, AR=True): plt.show() -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/selectinf/randomized/tests/test_slope.py b/selectinf/randomized/tests/test_slope.py index bc3a475a7..66a89ac19 100644 --- a/selectinf/randomized/tests/test_slope.py +++ b/selectinf/randomized/tests/test_slope.py @@ -55,9 +55,9 @@ def slope_R(X, Y, W = None, normalize = True, choice_weights = "gaussian", sigma if W is None: r_W = robjects.NA_Logical - if choice_weights is "gaussian": + if choice_weights == "gaussian": r_choice_weights = robjects.StrVector('gaussian') - elif choice_weights is "bh": + elif choice_weights == "bh": r_choice_weights = robjects.StrVector('bh') else: r_W = robjects.r.matrix(W, nrow=p, ncol=1) From 5df425bba0828219f7ace4ba449240ceee82c807 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Mon, 15 Mar 2021 17:53:31 -0700 Subject: [PATCH 077/187] standalone functions for lasso inference; added nongaussian split-lasso classes; a test for randomized lasso; --- selectinf/randomized/approx_reference.py | 32 +-- selectinf/randomized/lasso.py | 256 +++++++++++++++--- selectinf/randomized/query.py | 16 +- selectinf/randomized/tests/test_lasso.py | 71 ++++- .../tests/test_standalone_lasso_mle.py | 195 +++++++++++++ 5 files changed, 512 insertions(+), 58 deletions(-) create mode 100644 selectinf/randomized/tests/test_standalone_lasso_mle.py diff --git 
a/selectinf/randomized/approx_reference.py b/selectinf/randomized/approx_reference.py index af8b936c8..f253ed01a 100644 --- a/selectinf/randomized/approx_reference.py +++ b/selectinf/randomized/approx_reference.py @@ -9,10 +9,16 @@ class approximate_grid_inference(object): def __init__(self, - query, observed_target, target_cov, target_score_cov, + inverse_info, + init_soln, + cond_mean, + cond_cov, + logdens_linear, + linear_part, + offset, solve_args={'tol':1.e-12}): """ @@ -41,27 +47,19 @@ def __init__(self, """ self.solve_args = solve_args - - result, inverse_info = query.selective_MLE(observed_target, - target_cov, - target_score_cov, - solve_args=solve_args)[:2] - mle = result['MLE'] - self.linear_part = query.sampler.affine_con.linear_part - self.offset = query.sampler.affine_con.offset - - self.logdens_linear = query.sampler.logdens_transform[0] - self.cond_mean = query.cond_mean - self.prec_opt = np.linalg.inv(query.cond_cov) - self.cond_cov = query.cond_cov - + self.init_soln = init_soln + self.cond_mean = cond_mean + self.cond_cov = cond_cov + self.prec_opt = np.linalg.inv(self.cond_cov) + + self.logdens_linear = logdens_linear + self.linear_part = linear_part + self.offset = offset self.observed_target = observed_target self.target_score_cov = target_score_cov self.target_cov = target_cov - self.init_soln = query.observed_opt_state - self.ntarget = ntarget = target_cov.shape[0] _scale = 4 * np.sqrt(np.diag(inverse_info)) ngrid = 40 diff --git a/selectinf/randomized/lasso.py b/selectinf/randomized/lasso.py index f06e837eb..e24243386 100644 --- a/selectinf/randomized/lasso.py +++ b/selectinf/randomized/lasso.py @@ -162,20 +162,38 @@ def fit(self, # \bar{\beta}_{E \cup U} piece -- the unpenalized M estimator X, y = self.loglike.data - W = self._W = self.loglike.saturated_loss.hessian(X.dot(beta_bar)) - _hessian_active = np.dot(X.T, X[:, active] * W[:, None]) - _hessian_unpen = np.dot(X.T, X[:, unpenalized] * W[:, None]) + linpred = X.dot(beta_bar) + n = linpred.shape[0] + if hasattr(self.loglike.saturated_loss, "hessian"): # a GLM -- all we need is W + W = self._W = self.loglike.saturated_loss.hessian(linpred) + _hessian_active = np.dot(X.T, X[:, active] * W[:, None]) + _hessian_unpen = np.dot(X.T, X[:, unpenalized] * W[:, None]) + elif hasattr(self.loglike.saturated_loss, "hessian_mult"): + active_right = np.zeros((n, active.sum())) + for i, j in enumerate(np.nonzero(active)[0]): + active_right[:,i] = self.loglike.saturated_loss.hessian_mult(linpred, + X[:,j], + case_weights=self.loglike.saturated_loss.case_weights) + unpen_right = np.zeros((n, unpenalized.sum())) + for i, j in enumerate(np.nonzero(unpenalized)[0]): + unpen_right[:,i] = self.loglike.saturated_loss.hessian_mult(linpred, + X[:,j], + case_weights=self.loglike.saturated_loss.case_weights) + _hessian_active = X.T.dot(active_right) + _hessian_unpen = X.T.dot(unpen_right) + else: + raise ValueError('saturated_loss has no hessian or hessian_mult method') _score_linear_term = -np.hstack([_hessian_active, _hessian_unpen]) # set the observed score (data dependent) state # observed_score_state is - # \nabla \ell(\bar{\beta}_E) + Q(\bar{\beta}_E) \bar{\beta}_E + # \nabla \ell(\bar{\beta}_E) - Q(\bar{\beta}_E) \bar{\beta}_E # in linear regression this is _ALWAYS_ -X^TY # # should be asymptotically equivalent to - # \nabla \ell(\beta^*) + Q(\beta^*)\beta^* + # \nabla \ell(\beta^*) - Q(\beta^*)\beta^* self.observed_score_state = _score_linear_term.dot(_beta_unpenalized) self.observed_score_state[inactive] += 
self.loglike.smooth_objective(beta_bar, 'grad')[inactive] @@ -300,9 +318,6 @@ def gaussian(X, randomizer_scale : float Scale for IID components of randomizer. - randomizer : str - One of ['laplace', 'logistic', 'gaussian'] - Returns ------- @@ -381,9 +396,6 @@ def logistic(X, randomizer_scale : float Scale for IID components of randomizer. - randomizer : str - One of ['laplace', 'logistic', 'gaussian'] - Returns ------- @@ -463,16 +475,13 @@ def coxph(X, randomizer_scale : float Scale for IID components of randomizer. - randomizer : str - One of ['laplace', 'logistic', 'gaussian'] - Returns ------- L : `selection.randomized.lasso.lasso` """ - loglike = coxph_obj(X, times, status, quadratic=quadratic) + loglike = rr.glm.cox(X, times, status, quadratic=quadratic) # scale for randomization seems kind of meaningless here... @@ -536,9 +545,6 @@ def poisson(X, randomizer_scale : float Scale for IID components of randomizer. - randomizer : str - One of ['laplace', 'logistic', 'gaussian'] - Returns ------- @@ -620,9 +626,6 @@ def sqrt_lasso(X, randomizer_scale : float Scale for IID components of randomizer. - randomizer : str - One of ['laplace', 'logistic', 'gaussian'] - Returns ------- @@ -691,16 +694,21 @@ def selected_targets(loglike, features, sign_info={}, dispersion=None, - solve_args={'tol': 1.e-12, 'min_its': 50}): + solve_args={'tol': 1.e-12, 'min_its': 50}, + hessian=None): X, y = loglike.data n, p = X.shape Xfeat = X[:, features] - Qfeat = Xfeat.T.dot(W[:, None] * Xfeat) + if hessian is None: + Qfeat = Xfeat.T.dot(W[:, None] * Xfeat) + _score_linear = -Xfeat.T.dot(W[:, None] * X).T + else: + Qfeat = hessian[features][:,features] + _score_linear = -hessian[features].T observed_target = restricted_estimator(loglike, features, solve_args=solve_args) cov_target = np.linalg.inv(Qfeat) - _score_linear = -Xfeat.T.dot(W[:, None] * X).T crosscov_target_score = _score_linear.dot(cov_target) alternatives = ['twosided'] * features.sum() features_idx = np.arange(p)[features] @@ -823,7 +831,8 @@ def __init__(self, feature_weights, proportion_select, ridge_term=0, - perturb=None): + perturb=None, + estimate_dispersion=False): (self.loglike, self.feature_weights, @@ -836,11 +845,11 @@ def __init__(self, self.nfeature = p = self.loglike.shape[0] self.penalty = rr.weighted_l1norm(self.feature_weights, lagrange=1.) self._initial_omega = perturb + self.estimate_dispersion = estimate_dispersion def fit(self, solve_args={'tol': 1.e-12, 'min_its': 50}, - perturb=None, - estimate_dispersion=True): + perturb=None): signs = lasso.fit(self, solve_args=solve_args, @@ -851,7 +860,7 @@ def fit(self, # we then setup up the sampler again - if estimate_dispersion: + if self.estimate_dispersion: X, y = self.loglike.data n, p = X.shape @@ -864,7 +873,6 @@ def fit(self, # run setup again after # estimating dispersion - print(dispersion, 'dispersion') if df_fit > 0: self._setup_sampler(*self._setup_sampler_data, dispersion=dispersion) @@ -949,7 +957,7 @@ def gaussian(X, proportion, sigma=1., quadratic=None, - ridge_term=0): + estimate_dispersion=True): r""" Squared-error LASSO with feature weights. Objective function is (before randomization) @@ -977,6 +985,9 @@ def gaussian(X, `feature_weights` to 0. If `feature_weights` is a float, then all parameters are penalized equally. + proportion: float + What proportion of data to use for selection. + sigma : float (optional) Noise variance. Set to 1 if `covariance_estimator` is not None. This scales the loglikelihood by `sigma**(-2)`. 
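+        estimate_dispersion : bool
+            If True, the dispersion is estimated from the fitted
+            selected model and used when setting up the implied Gaussian.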
@@ -986,12 +997,6 @@ def gaussian(X, Can also be a linear term by setting quadratic coefficient to 0. - randomizer_scale : float - Scale for IID components of randomizer. - - randomizer : str - One of ['laplace', 'logistic', 'gaussian'] - Returns ------- @@ -1003,10 +1008,185 @@ def gaussian(X, Y, coef=1. / sigma ** 2, quadratic=quadratic) - n, p = X.shape return split_lasso(loglike, - np.asarray(feature_weights) / sigma ** 2, + np.asarray(feature_weights)/sigma**2, + proportion, + estimate_dispersion=estimate_dispersion) + + + @staticmethod + def logistic(X, + successes, + feature_weights, + proportion, + trials=None, + quadratic=None): + r""" + Logistic LASSO with feature weights (before randomization) + + .. math:: + + \beta \mapsto \ell(X\beta) + \sum_{i=1}^p \lambda_i |\beta_i| + + where $\ell$ is the negative of the logistic + log-likelihood (half the logistic deviance) + and $\lambda$ is `feature_weights`. + + Parameters + ---------- + + X : ndarray + Shape (n,p) -- the design matrix. + + successes : ndarray + Shape (n,) -- response vector. An integer number of successes. + For data that is proportions, multiply the proportions + by the number of trials first. + + feature_weights: [float, sequence] + Penalty weights. An intercept, or other unpenalized + features are handled by setting those entries of + `feature_weights` to 0. If `feature_weights` is + a float, then all parameters are penalized equally. + + proportion: float + What proportion of data to use for selection. + + trials : ndarray (optional) + Number of trials per response, defaults to + ones the same shape as Y. + + quadratic : `regreg.identity_quadratic.identity_quadratic` (optional) + An optional quadratic term to be added to the objective. + Can also be a linear term by setting quadratic + coefficient to 0. + + Returns + ------- + + L : `selection.randomized.lasso.lasso` + + """ + + loglike = rr.glm.logistic(X, + successes, + trials=trials, + quadratic=quadratic) + + return split_lasso(loglike, + np.asarray(feature_weights), proportion) + @staticmethod + def coxph(X, + times, + status, + feature_weights, + proportion, + quadratic=None): + r""" + Cox proportional hazards LASSO with feature weights. + Objective function is (before randomization) + + .. math:: + \beta \mapsto \ell^{\text{Cox}}(\beta) + + \sum_{i=1}^p \lambda_i |\beta_i| + + where $\ell^{\text{Cox}}$ is the + negative of the log of the Cox partial + likelihood and $\lambda$ is `feature_weights`. + Uses Efron's tie breaking method. + + Parameters + ---------- + + X : ndarray + Shape (n,p) -- the design matrix. + + times : ndarray + Shape (n,) -- the survival times. + + status : ndarray + Shape (n,) -- the censoring status. + + feature_weights: [float, sequence] + Penalty weights. An intercept, or other unpenalized + features are handled by setting those entries of + `feature_weights` to 0. If `feature_weights` is + a float, then all parameters are penalized equally. + + proportion: float + What proportion of data to use for selection. + + quadratic : `regreg.identity_quadratic.identity_quadratic` (optional) + An optional quadratic term to be added to the objective. + Can also be a linear term by setting quadratic + coefficient to 0. 
+ + Returns + ------- + + L : `selection.randomized.lasso.lasso` + + """ + loglike = rr.glm.cox(X, times, status, quadratic=quadratic) + + return split_lasso(loglike, + np.asarray(feature_weights), + proportion) + + @staticmethod + def poisson(X, + counts, + feature_weights, + proportion, + quadratic=None, + ridge_term=None): + r""" + Poisson log-linear LASSO with feature weights. + Objective function is (before randomization) + + .. math:: + + \beta \mapsto \ell^{\text{Poisson}}(\beta) + \sum_{i=1}^p \lambda_i |\beta_i| + + where $\ell^{\text{Poisson}}$ is the negative + of the log of the Poisson likelihood (half the deviance) + and $\lambda$ is `feature_weights`. + + Parameters + ---------- + + X : ndarray + Shape (n,p) -- the design matrix. + + counts : ndarray + Shape (n,) -- the response. + + feature_weights: [float, sequence] + Penalty weights. An intercept, or other unpenalized + features are handled by setting those entries of + `feature_weights` to 0. If `feature_weights` is + a float, then all parameters are penalized equally. + + proportion: float + What proportion of data to use for selection. + + quadratic : `regreg.identity_quadratic.identity_quadratic` (optional) + An optional quadratic term to be added to the objective. + Can also be a linear term by setting quadratic + coefficient to 0. + + Returns + ------- + + L : `selection.randomized.lasso.lasso` + + """ + loglike = rr.glm.poisson(X, counts, quadratic=quadratic) + + return split_lasso(loglike, + np.asarray(feature_weights), + proportion) diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index df890a2ef..6d1bbecd7 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -418,11 +418,23 @@ def approximate_grid_inference(self, """ - G = approximate_grid_inference(self, - observed_target, + inverse_info = self.selective_MLE(observed_target, + target_cov, + target_score_cov, + solve_args=solve_args)[1] + + G = approximate_grid_inference(observed_target, target_cov, target_score_cov, + inverse_info, + self.observed_opt_state, + self.cond_mean, + self.cond_cov, + self.sampler.logdens_transform[0], + self.sampler.affine_con.linear_part, + self.sampler.affine_con.offset, solve_args=solve_args) + return G.summary(alternatives=alternatives) class multiple_queries(object): diff --git a/selectinf/randomized/tests/test_lasso.py b/selectinf/randomized/tests/test_lasso.py index 5d0f2bd63..3a16411ec 100644 --- a/selectinf/randomized/tests/test_lasso.py +++ b/selectinf/randomized/tests/test_lasso.py @@ -6,7 +6,7 @@ import regreg.api as rr from ..lasso import lasso, selected_targets, full_targets, debiased_targets -from ...tests.instance import gaussian_instance +from ...tests.instance import gaussian_instance, logistic_instance from ...tests.flags import SET_SEED from ...tests.decorators import set_sampling_params_iftrue, set_seed_iftrue from ...algorithms.sqrt_lasso import choose_lambda, solve_sqrt_lasso @@ -355,6 +355,75 @@ def Rpval(X, Y, W, noise_scale=None): assert np.linalg.norm(conv.sampler.affine_con.covariance - cond_cov) / np.linalg.norm(cond_cov) < 1.e-3 assert np.linalg.norm(conv.sampler.affine_con.mean - cond_mean[:,0]) / np.linalg.norm(cond_mean[:,0]) < 1.e-3 +def test_logistic_lasso(n=500, + p=200, + signal_fac=1.5, + s=5, + full=True, + rho=0.4, + randomizer_scale=1., + ndraw=5000, + burnin=1000, + ridge_term=None, compare_to_lasso=True): + """ + Compare to R randomized lasso + """ + + inst, const = logistic_instance, lasso.logistic + signal = np.sqrt(signal_fac * 2 * 
np.log(p)) + X, Y, beta = inst(n=n, + p=p, + signal=signal, + s=s, + equicorrelated=False, + rho=rho, + random_signs=True)[:3] + + if ridge_term is None: + mean_diag = np.mean((X**2).sum(0)) + ridge_term = (np.sqrt(mean_diag) / np.sqrt(n)) * np.sqrt(n / (n - 1.)) + + W = np.ones(X.shape[1]) * choose_lambda(X) * 0.7 + + perturb = np.random.standard_normal(p) * randomizer_scale / np.sqrt(n) + + conv = const(X, + Y, + W, + randomizer_scale=randomizer_scale / np.sqrt(n), + ridge_term=ridge_term) + + signs = conv.fit() + nonzero = signs != 0 + + # sanity check + + if full: + (observed_target, + cov_target, + cov_target_score, + alternatives) = full_targets(conv.loglike, + conv._W, + nonzero) + else: + (observed_target, + cov_target, + cov_target_score, + alternatives) = selected_targets(conv.loglike, + conv._W, + nonzero) + + result = conv.summary(observed_target, + cov_target, + cov_target_score, + alternatives, + ndraw=ndraw, + burnin=burnin, + compute_intervals=False) + pval = result['pvalue'] + + return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0] + def main(nsim=500, n=500, p=200, sqrt=False, target='full', sigma=3, AR=True): diff --git a/selectinf/randomized/tests/test_standalone_lasso_mle.py b/selectinf/randomized/tests/test_standalone_lasso_mle.py new file mode 100644 index 000000000..c01879f4f --- /dev/null +++ b/selectinf/randomized/tests/test_standalone_lasso_mle.py @@ -0,0 +1,195 @@ +from __future__ import division, print_function + +import numpy as np +import nose.tools as nt + +import regreg.api as rr + +from selectinf.randomized.lasso import split_lasso, selected_targets +from selectinf.randomized.query import selective_MLE +from selectinf.randomized.approx_reference import approximate_grid_inference + +def test_standalone_inference(n=2000, + p=100, + signal_fac=1.5, + proportion=0.7, + approx=True, + MLE=True): + """ + Compare to R randomized lasso + """ + + signal = np.sqrt(signal_fac * np.log(p)) / np.sqrt(n) + X = np.random.standard_normal((n, p)) + T = np.random.exponential(1, size=(n,)) + S = np.random.choice([0,1], n, p=[0.2,0.8]) + + cox_lasso = split_lasso.coxph(X, + T, + S, + 2 * np.ones(p) * np.sqrt(n), + proportion) + + signs = cox_lasso.fit() + nonzero = signs != 0 + + cox_sel = rr.glm.cox(X[:,nonzero], T, S) + + cox_full = rr.glm.cox(X, T, S) + + refit_soln = cox_sel.solve(min_its=2000) + padded_soln = np.zeros(p) + padded_soln[nonzero] = refit_soln + cox_full.solve(min_its=2000) + + full_hess = cox_full.hessian(padded_soln) + selected_hess = full_hess[nonzero][:,nonzero] + + (observed_target, + cov_target, + cov_target_score, + alternatives) = selected_targets(cox_lasso.loglike, + None, + nonzero, + hessian=full_hess, + dispersion=1) + + if nonzero.sum(): + if approx: + approx_result = cox_lasso.approximate_grid_inference(observed_target, + cov_target, + cov_target_score) + approx_pval = approx_result['pvalue'] + + testval = approximate_normalizer_inference(proportion, + cox_lasso.initial_soln[nonzero], + refit_soln, + signs[nonzero], + selected_hess, + cox_lasso.feature_weights[nonzero]) + + assert np.allclose(testval['pvalue'], approx_pval) + + else: + approx_pval = np.empty(nonzero.sum())*np.nan + + if MLE: + MLE_result = cox_lasso.selective_MLE(observed_target, + cov_target, + cov_target_score)[0] + MLE_pval = MLE_result['pvalue'] + else: + MLE_pval = np.empty(nonzero.sum())*np.nan + + # working under null here + beta = np.zeros(p) + + testval = approximate_mle_inference(proportion, + cox_lasso.initial_soln[nonzero], + refit_soln, + signs[nonzero], + 
selected_hess, + cox_lasso.feature_weights[nonzero]) + + assert np.allclose(testval['pvalue'], MLE_pval) + return approx_pval[beta[nonzero] == 0], MLE_pval[beta[nonzero] == 0], testval + else: + return [], [] + +def approximate_mle_inference(training_proportion, + training_betahat, + selected_beta_refit, + selected_signs, + selected_hessian, + selected_feature_weights, + level=0.9): + + nselect = selected_hessian.shape[0] + pi_s = training_proportion + ratio = (1 - pi_s) / pi_s + + target_cov = np.linalg.inv(selected_hessian) + cond_precision = selected_hessian / ratio + cond_cov = target_cov * ratio + cond_cov = cond_cov * selected_signs[None, :] * selected_signs[:, None] + selected_signs[np.isnan(selected_signs)] = 1 # for unpenalized + + logdens_linear = target_cov * selected_signs[:,None] + cond_mean = selected_beta_refit * selected_signs - logdens_linear.dot( + selected_feature_weights * + selected_signs) + linear_part = -np.identity(nselect) + offset = np.zeros(nselect) + + target_score_cov = -np.identity(nselect) + observed_target = selected_beta_refit + + result = selective_MLE(observed_target, + target_cov, + target_score_cov, + training_betahat * selected_signs, + cond_mean, + cond_cov, + logdens_linear, + linear_part, + offset, + level=level, + useC=True)[0] + + return result + +def approximate_normalizer_inference(training_proportion, + training_betahat, + selected_beta_refit, + selected_signs, + selected_hessian, + selected_feature_weights, + alternatives=None, + level=0.9): + + nselect = selected_hessian.shape[0] + pi_s = training_proportion + ratio = (1 - pi_s) / pi_s + + target_cov = np.linalg.inv(selected_hessian) + cond_precision = selected_hessian / ratio + cond_cov = target_cov * ratio + cond_cov = cond_cov * selected_signs[None, :] * selected_signs[:, None] + selected_signs[np.isnan(selected_signs)] = 1 # for unpenalized + + logdens_linear = target_cov * selected_signs[:,None] + cond_mean = selected_beta_refit * selected_signs - logdens_linear.dot( + selected_feature_weights * + selected_signs) + linear_part = -np.identity(nselect) + offset = np.zeros(nselect) + + target_score_cov = -np.identity(nselect) + observed_target = selected_beta_refit + + inverse_info = selective_MLE(observed_target, + target_cov, + target_score_cov, + training_betahat * selected_signs, + cond_mean, + cond_cov, + logdens_linear, + linear_part, + offset, + level=level, + useC=True)[1] + + G = approximate_grid_inference(observed_target, + target_cov, + target_score_cov, + inverse_info, + training_betahat * selected_signs, + cond_mean, + cond_cov, + logdens_linear, + linear_part, + offset) + + return G.summary(alternatives=alternatives, + level=level) + From 8b595e18acd20e124f00ed4ae31e60b0ca7a8b04 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Mon, 15 Mar 2021 17:54:01 -0700 Subject: [PATCH 078/187] signature of logistic instance --- selectinf/tests/instance.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/selectinf/tests/instance.py b/selectinf/tests/instance.py index 15826a148..0c5662b21 100644 --- a/selectinf/tests/instance.py +++ b/selectinf/tests/instance.py @@ -140,7 +140,11 @@ def _noise(n, df=np.inf): return X, Y, beta * sigma, np.nonzero(active)[0], sigma, sigmaX -def logistic_instance(n=100, p=200, s=7, rho=0.3, signal=14, +def logistic_instance(n=100, + p=200, + s=7, + rho=0.3, + signal=14, random_signs=False, scale=True, center=True, From cffdb0f28da4b4241c64c45ff06f81996a0385fa Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Mon, 15 Mar 2021 
18:03:44 -0700 Subject: [PATCH 079/187] fixing approxiamte reference tests --- .../randomized/tests/test_approx_reference.py | 28 +++++++++++++++---- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/selectinf/randomized/tests/test_approx_reference.py b/selectinf/randomized/tests/test_approx_reference.py index fbf57dd13..f2572884b 100644 --- a/selectinf/randomized/tests/test_approx_reference.py +++ b/selectinf/randomized/tests/test_approx_reference.py @@ -50,6 +50,10 @@ def test_summary(n=500, nonzero, dispersion=dispersion) + inverse_info = conv.selective_MLE(observed_target, + cov_target, + cov_target_score)[1] + S = conv.approximate_grid_inference(observed_target, cov_target, cov_target_score, @@ -105,10 +109,16 @@ def test_approx_pivot(n=500, cov_target, cov_target_score)[1] - approximate_grid_inf = approximate_grid_inference(conv, - observed_target, + approximate_grid_inf = approximate_grid_inference(observed_target, cov_target, - cov_target_score) + cov_target_score, + inverse_info, + conv.observed_opt_state, + conv.sampler.affine_con.mean, + conv.sampler.affine_con.covariance, + conv.sampler.logdens_transform[0], + conv.sampler.affine_con.linear_part, + conv.sampler.affine_con.offset) pivot = approximate_grid_inf._approx_pivots(beta_target) @@ -170,10 +180,16 @@ def test_approx_ci(n=500, scale_ = np.max(_scale) ngrid = int(2 * scale_/0.1) - approximate_grid_inf = approximate_grid_inference(conv, - observed_target, + approximate_grid_inf = approximate_grid_inference(observed_target, cov_target, - cov_target_score) + cov_target_score, + inverse_info, + conv.observed_opt_state, + conv.sampler.affine_con.mean, + conv.sampler.affine_con.covariance, + conv.sampler.logdens_transform[0], + conv.sampler.affine_con.linear_part, + conv.sampler.affine_con.offset) lci, uci = approximate_grid_inf._approx_intervals(level) From 13b8d2be6d1bf5db2f96c969be6be03cf4b73efb Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Mon, 15 Mar 2021 18:06:40 -0700 Subject: [PATCH 080/187] changing docstring --- selectinf/randomized/tests/test_standalone_lasso_mle.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/selectinf/randomized/tests/test_standalone_lasso_mle.py b/selectinf/randomized/tests/test_standalone_lasso_mle.py index c01879f4f..0d9b13e1e 100644 --- a/selectinf/randomized/tests/test_standalone_lasso_mle.py +++ b/selectinf/randomized/tests/test_standalone_lasso_mle.py @@ -16,7 +16,8 @@ def test_standalone_inference(n=2000, approx=True, MLE=True): """ - Compare to R randomized lasso + Check that standalone functions reproduce same p-values + as methods of `selectinf.randomized.lasso` """ signal = np.sqrt(signal_fac * np.log(p)) / np.sqrt(n) From 66fda017500809e47e90afcf7dbc99c3c39203d1 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Mon, 15 Mar 2021 18:12:52 -0700 Subject: [PATCH 081/187] fix nan signs --- selectinf/randomized/tests/test_standalone_lasso_mle.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/selectinf/randomized/tests/test_standalone_lasso_mle.py b/selectinf/randomized/tests/test_standalone_lasso_mle.py index 0d9b13e1e..4151fa8a4 100644 --- a/selectinf/randomized/tests/test_standalone_lasso_mle.py +++ b/selectinf/randomized/tests/test_standalone_lasso_mle.py @@ -112,8 +112,8 @@ def approximate_mle_inference(training_proportion, target_cov = np.linalg.inv(selected_hessian) cond_precision = selected_hessian / ratio cond_cov = target_cov * ratio - cond_cov = cond_cov * selected_signs[None, :] * selected_signs[:, None] 
selected_signs[np.isnan(selected_signs)] = 1 # for unpenalized + cond_cov = cond_cov * selected_signs[None, :] * selected_signs[:, None] logdens_linear = target_cov * selected_signs[:,None] cond_mean = selected_beta_refit * selected_signs - logdens_linear.dot( @@ -155,8 +155,8 @@ def approximate_normalizer_inference(training_proportion, target_cov = np.linalg.inv(selected_hessian) cond_precision = selected_hessian / ratio cond_cov = target_cov * ratio - cond_cov = cond_cov * selected_signs[None, :] * selected_signs[:, None] selected_signs[np.isnan(selected_signs)] = 1 # for unpenalized + cond_cov = cond_cov * selected_signs[None, :] * selected_signs[:, None] logdens_linear = target_cov * selected_signs[:,None] cond_mean = selected_beta_refit * selected_signs - logdens_linear.dot( From 7926ad4b428cfa4d4f1098788324b7ea5ec7292a Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Tue, 16 Mar 2021 12:33:21 -0700 Subject: [PATCH 082/187] ensuring Gaussian mle is scale invariant; testing mle for other families; add Cox instance generator --- selectinf/randomized/lasso.py | 24 +- selectinf/randomized/query.py | 4 +- selectinf/randomized/selective_MLE_utils.pyx | 10 +- .../tests/test_selective_MLE_high.py | 537 +++++++++++++++++- selectinf/tests/instance.py | 201 ++++++- 5 files changed, 727 insertions(+), 49 deletions(-) diff --git a/selectinf/randomized/lasso.py b/selectinf/randomized/lasso.py index e24243386..35051b321 100644 --- a/selectinf/randomized/lasso.py +++ b/selectinf/randomized/lasso.py @@ -285,7 +285,7 @@ def gaussian(X, \beta \mapsto \frac{1}{2} \|Y-X\beta\|^2_2 + \sum_{i=1}^p \lambda_i |\beta_i| where $\lambda$ is `feature_weights`. The ridge term - is determined by the Hessian and `np.std(Y)` by default, + is determined by the Hessian by default, as is the randomizer scale. Parameters @@ -333,10 +333,10 @@ def gaussian(X, mean_diag = np.mean((X ** 2).sum(0)) if ridge_term is None: - ridge_term = np.std(Y) * np.sqrt(mean_diag) / np.sqrt(n - 1) + ridge_term = np.sqrt(mean_diag) / (np.sqrt(n - 1) * sigma**2) if randomizer_scale is None: - randomizer_scale = np.sqrt(mean_diag) * 0.5 * np.std(Y) * np.sqrt(n / (n - 1.)) + randomizer_scale = np.sqrt(mean_diag) * 0.5 * np.std(Y, ddof=1) randomizer = randomization.isotropic_gaussian((p,), randomizer_scale) @@ -409,7 +409,7 @@ def logistic(X, mean_diag = np.mean((X ** 2).sum(0)) if ridge_term is None: - ridge_term = np.std(Y) * np.sqrt(mean_diag) / np.sqrt(n - 1) + ridge_term = np.sqrt(mean_diag) / np.sqrt(n - 1) if randomizer_scale is None: randomizer_scale = np.sqrt(mean_diag) * 0.5 @@ -481,6 +481,7 @@ def coxph(X, L : `selection.randomized.lasso.lasso` """ + n, p = X.shape loglike = rr.glm.cox(X, times, status, quadratic=quadratic) # scale for randomization seems kind of meaningless here... 
@@ -488,7 +489,7 @@ def coxph(X, mean_diag = np.mean((X ** 2).sum(0)) if ridge_term is None: - ridge_term = np.std(times) * np.sqrt(mean_diag) / np.sqrt(n - 1) + ridge_term = np.sqrt(mean_diag) / np.sqrt(n - 1) if randomizer_scale is None: randomizer_scale = np.sqrt(mean_diag) * 0.5 * np.std(Y) * np.sqrt(n / (n - 1.)) @@ -559,7 +560,7 @@ def poisson(X, mean_diag = np.mean((X ** 2).sum(0)) if ridge_term is None: - ridge_term = np.std(counts) * np.sqrt(mean_diag) / np.sqrt(n - 1) + ridge_term = np.sqrt(mean_diag) / np.sqrt(n - 1) if randomizer_scale is None: randomizer_scale = np.sqrt(mean_diag) * 0.5 * np.std(counts) * np.sqrt(n / (n - 1.)) @@ -694,7 +695,7 @@ def selected_targets(loglike, features, sign_info={}, dispersion=None, - solve_args={'tol': 1.e-12, 'min_its': 50}, + solve_args={'tol': 1.e-12, 'min_its': 100}, hessian=None): X, y = loglike.data @@ -727,7 +728,8 @@ def full_targets(loglike, W, features, dispersion=None, - solve_args={'tol': 1.e-12, 'min_its': 50}): + solve_args={'tol': 1.e-12, 'min_its': 50}, + hessian=None): X, y = loglike.data n, p = X.shape @@ -738,6 +740,11 @@ def full_targets(loglike, # target is one-step estimator Qfull = X.T.dot(W[:, None] * X) + if hessian is None: + Qfull = X.T.dot(W[:, None] * X) + else: + Qfull = hessian + Qfull_inv = np.linalg.inv(Qfull) full_estimator = loglike.solve(**solve_args) cov_target = Qfull_inv[features][:, features] @@ -1131,6 +1138,7 @@ def coxph(X, L : `selection.randomized.lasso.lasso` """ + n, p = X.shape loglike = rr.glm.cox(X, times, status, quadratic=quadratic) return split_lasso(loglike, diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index 6d1bbecd7..c5ab43ff6 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -1486,7 +1486,7 @@ def _solve_barrier_affine_py(conjugate_arg, min_its=200, tol=1.e-10): - scaling = np.sqrt(np.diag(con_linear.dot(precision).dot(con_linear.T))) + scaling = 1 / np.sqrt(np.diag(con_linear.dot(precision).dot(con_linear.T))) if feasible_point is None: feasible_point = 1. / scaling @@ -1555,7 +1555,7 @@ def _solve_barrier_nonneg(conjugate_arg, nstep=1000, tol=1.e-8): - scaling = np.sqrt(np.diag(precision)) + scaling = 1 / np.sqrt(np.diag(precision)) if feasible_point is None: feasible_point = 1. 
/ scaling diff --git a/selectinf/randomized/selective_MLE_utils.pyx b/selectinf/randomized/selective_MLE_utils.pyx index 2aabbc365..363399a25 100644 --- a/selectinf/randomized/selective_MLE_utils.pyx +++ b/selectinf/randomized/selective_MLE_utils.pyx @@ -114,8 +114,8 @@ def solve_barrier_nonneg(conjugate_arg, gradient = np.zeros_like(conjugate_arg) opt_variable = np.asarray(feasible_point) opt_proposed = opt_variable.copy() - scaling = np.sqrt(np.diag(precision)) - + scaling = 1 / np.sqrt(np.diag(precision)) + return barrier_solve_(gradient, opt_variable, opt_proposed, @@ -143,7 +143,8 @@ def solve_barrier_affine(conjugate_arg, affine_term = np.zeros_like(offset) A = linear_term - scaling = np.sqrt(np.diag(A.dot(precision).dot(A.T))) + scaling = 1 / np.sqrt(np.diag(A.dot(precision).dot(A.T))) + linear_term_fortran = np.asfortranarray(linear_term) value, opt_variable, hess = barrier_solve_affine_(gradient, @@ -158,6 +159,7 @@ def solve_barrier_affine(conjugate_arg, step, max_iter=max_iter, min_iter=min_iter, - value_tol=tol) + value_tol=tol + ) return value, opt_variable, hess \ No newline at end of file diff --git a/selectinf/randomized/tests/test_selective_MLE_high.py b/selectinf/randomized/tests/test_selective_MLE_high.py index 7ea737021..30faa1767 100644 --- a/selectinf/randomized/tests/test_selective_MLE_high.py +++ b/selectinf/randomized/tests/test_selective_MLE_high.py @@ -1,8 +1,18 @@ import numpy as np import nose.tools as nt -from selectinf.randomized.lasso import lasso, full_targets, selected_targets, debiased_targets -from selectinf.tests.instance import gaussian_instance +import regreg.api as rr + + +from ..lasso import (lasso, + split_lasso, + full_targets, + selected_targets, + debiased_targets) +from ...tests.instance import (gaussian_instance, + logistic_instance, + poisson_instance, + cox_instance) def test_full_targets(n=200, p=1000, @@ -12,7 +22,7 @@ def test_full_targets(n=200, randomizer_scale=0.5, full_dispersion=False): """ - Compare to R randomized lasso + Run approx MLE with full targets on Gaussian data """ inst, const = gaussian_instance, lasso.gaussian @@ -82,7 +92,6 @@ def test_full_targets(n=200, coverage = (beta[nonzero] > intervals[:, 0]) * (beta[nonzero] < intervals[:, 1]) return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0], coverage, intervals - def test_selected_targets(n=2000, p=200, signal_fac=10., @@ -92,7 +101,7 @@ def test_selected_targets(n=2000, randomizer_scale=1, full_dispersion=True): """ - Compare to R randomized lasso + Run approx MLE with selected targets on Gaussian data """ inst, const = gaussian_instance, lasso.gaussian @@ -152,6 +161,513 @@ def test_selected_targets(n=2000, coverage = (beta_target > intervals[:, 0]) * (beta_target < intervals[:, 1]) return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0], coverage, intervals +def test_logistic(n=2000, + p=200, + signal_fac=10., + s=5, + rho=0.4, + randomizer_scale=1): + """ + Run approx MLE with selected targets on binomial data + """ + + inst, const = logistic_instance, lasso.logistic + signal = np.sqrt(signal_fac * 2 * np.log(p)) + + while True: + X, Y, beta = inst(n=n, + p=p, + signal=signal, + s=s, + equicorrelated=False, + rho=rho, + random_signs=True)[:3] + + n, p = X.shape + + sigma_ = np.std(Y) + W = np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ + + conv = const(X, + Y, + W, + randomizer_scale=randomizer_scale * sigma_) + + signs = conv.fit() + nonzero = signs != 0 + print("dimensions", n, p, nonzero.sum()) + + if nonzero.sum() > 0: + + (observed_target, + 
cov_target, + cov_target_score, + alternatives) = selected_targets(conv.loglike, + conv._W, + nonzero, + dispersion=1) + + result = conv.selective_MLE(observed_target, + cov_target, + cov_target_score)[0] + estimate = result['MLE'] + pval = result['pvalue'] + intervals = np.asarray(result[['lower_confidence', + 'upper_confidence']]) + + return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0], intervals + +def test_logistic_split(n=2000, + p=200, + signal_fac=10., + s=5, + rho=0.4, + randomizer_scale=1): + """ + Run approx MLE with selected targets on binomial data with data splitting + """ + + inst, const = logistic_instance, split_lasso.logistic + signal = np.sqrt(signal_fac * 2 * np.log(p)) + + while True: + X, Y, beta = inst(n=n, + p=p, + signal=signal, + s=s, + equicorrelated=False, + rho=rho, + random_signs=True)[:3] + + n, p = X.shape + + sigma_ = np.std(Y) + W = np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ + + conv = const(X, + Y, + W, + proportion=0.7) + + signs = conv.fit() + nonzero = signs != 0 + print("dimensions", n, p, nonzero.sum()) + + if nonzero.sum() > 0: + + (observed_target, + cov_target, + cov_target_score, + alternatives) = selected_targets(conv.loglike, + conv._W, + nonzero, + dispersion=1) + + result = conv.selective_MLE(observed_target, + cov_target, + cov_target_score)[0] + estimate = result['MLE'] + pval = result['pvalue'] + intervals = np.asarray(result[['lower_confidence', + 'upper_confidence']]) + + return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0], intervals + +def test_poisson(n=2000, + p=200, + signal_fac=10., + s=5, + rho=0.4, + randomizer_scale=1): + """ + Run approx MLE with selected targets on Poisson data + """ + + inst, const = poisson_instance, lasso.poisson + signal = np.sqrt(signal_fac * 2 * np.log(p)) + + while True: + X, Y, beta = inst(n=n, + p=p, + signal=signal, + s=s, + equicorrelated=False, + rho=rho, + random_signs=True)[:3] + + n, p = X.shape + + sigma_ = np.std(Y) + W = np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ + + conv = const(X, + Y, + W, + randomizer_scale=randomizer_scale * sigma_) + + signs = conv.fit() + nonzero = signs != 0 + print("dimensions", n, p, nonzero.sum()) + + if nonzero.sum() > 0: + + (observed_target, + cov_target, + cov_target_score, + alternatives) = selected_targets(conv.loglike, + conv._W, + nonzero, + dispersion=1) + + result = conv.selective_MLE(observed_target, + cov_target, + cov_target_score)[0] + estimate = result['MLE'] + pval = result['pvalue'] + intervals = np.asarray(result[['lower_confidence', + 'upper_confidence']]) + + return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0], intervals + +def test_poisson_split(n=2000, + p=200, + signal_fac=10., + s=5, + rho=0.4, + randomizer_scale=1): + """ + Run approx MLE with selected targets on Poisson data with data splitting + """ + + inst, const = poisson_instance, split_lasso.poisson + signal = np.sqrt(signal_fac * 2 * np.log(p)) + + while True: + X, Y, beta = inst(n=n, + p=p, + signal=signal, + s=s, + equicorrelated=False, + rho=rho, + random_signs=True)[:3] + + n, p = X.shape + + sigma_ = np.std(Y) + W = np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ + + conv = const(X, + Y, + W, + proportion=0.7) + + signs = conv.fit() + nonzero = signs != 0 + print("dimensions", n, p, nonzero.sum()) + + if nonzero.sum() > 0: + + (observed_target, + cov_target, + cov_target_score, + alternatives) = selected_targets(conv.loglike, + conv._W, + nonzero, + dispersion=1) + + result = conv.selective_MLE(observed_target, + cov_target, + 
cov_target_score)[0] + estimate = result['MLE'] + pval = result['pvalue'] + intervals = np.asarray(result[['lower_confidence', + 'upper_confidence']]) + + return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0], intervals + +def test_cox(n=2000, + p=200, + signal_fac=10., + s=5, + rho=0.4, + randomizer_scale=1): + """ + Run approx MLE with selected targets on survival data + """ + + inst, const = cox_instance, lasso.coxph + signal = np.sqrt(signal_fac * 2 * np.log(p)) + + while True: + X, T, S, beta = inst(n=n, + p=p, + signal=signal, + s=s, + equicorrelated=False, + rho=rho, + random_signs=True)[:4] + + n, p = X.shape + + W = np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) + + conv = const(X, + T, + S, + W, + randomizer_scale=randomizer_scale) + + signs = conv.fit() + nonzero = signs != 0 + print("dimensions", n, p, nonzero.sum()) + + if nonzero.sum() > 0: + + cox_full = rr.glm.cox(X, T, S) + full_hess = cox_full.hessian(conv.initial_soln) + + (observed_target, + cov_target, + cov_target_score, + alternatives) = selected_targets(conv.loglike, + None, + nonzero, + hessian=full_hess, + dispersion=1) + + result = conv.selective_MLE(observed_target, + cov_target, + cov_target_score)[0] + estimate = result['MLE'] + pval = result['pvalue'] + intervals = np.asarray(result[['lower_confidence', + 'upper_confidence']]) + + return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0], intervals + +def test_cox_split(n=2000, + p=200, + signal_fac=10., + s=5, + rho=0.4, + randomizer_scale=1): + """ + Run approx MLE with selected targets on survival data with data splitting + """ + + inst, const = cox_instance, split_lasso.coxph + signal = np.sqrt(signal_fac * 2 * np.log(p)) + + while True: + X, T, S, beta = inst(n=n, + p=p, + signal=signal, + s=s, + equicorrelated=False, + rho=rho, + random_signs=True)[:4] + + n, p = X.shape + + W = np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) + + conv = const(X, + T, + S, + W, + proportion=0.7) + + signs = conv.fit() + nonzero = signs != 0 + print("dimensions", n, p, nonzero.sum()) + + if nonzero.sum() > 0: + + cox_full = rr.glm.cox(X, T, S) + full_hess = cox_full.hessian(conv.initial_soln) + + (observed_target, + cov_target, + cov_target_score, + alternatives) = selected_targets(conv.loglike, + None, + nonzero, + hessian=full_hess, + dispersion=1) + + result = conv.selective_MLE(observed_target, + cov_target, + cov_target_score)[0] + estimate = result['MLE'] + pval = result['pvalue'] + intervals = np.asarray(result[['lower_confidence', + 'upper_confidence']]) + + return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0], intervals + +def test_scale_invariant_split(n=200, + p=20, + signal_fac=10., + s=5, + sigma=3, + rho=0.4, + randomizer_scale=1, + full_dispersion=True, + seed=2): + """ + Confirm Gaussian version is appropriately scale invariant with data splitting + """ + + inst, const = gaussian_instance, split_lasso.gaussian + signal = np.sqrt(signal_fac * 2 * np.log(p)) + + results = [] + + scales = [1, 5] + for scale in scales: + + np.random.seed(seed) + X, Y, beta = inst(n=n, + p=p, + signal=signal, + s=s, + equicorrelated=False, + rho=rho, + sigma=sigma, + random_signs=True)[:3] + + Y *= scale; beta *= scale + n, p = X.shape + + sigma_ = np.std(Y) + W = np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ + print('W', W[0]/scale) + conv = const(X, + Y, + W, + proportion=0.7) + + signs = conv.fit() + nonzero = signs != 0 + print('nonzero', np.where(nonzero)[0]) + print('feature_weights', conv.feature_weights[0] / scale) + dispersion = np.linalg.norm(Y - 
X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) + + (observed_target, + cov_target, + cov_target_score, + alternatives) = selected_targets(conv.loglike, + conv._W, + nonzero, + dispersion=dispersion) + + print('dispersion', dispersion/scale**2) + print('target', observed_target[0]/scale) + print('cov_target', cov_target[0,0]/scale**2) + print('cov_target_score', cov_target_score[0,0]/scale**2) + + result = conv.selective_MLE(observed_target, + cov_target, + cov_target_score)[0] + + print(result['MLE'] / scale) + results.append(result) + + assert np.allclose(results[0]['MLE'] / scales[0], + results[1]['MLE'] / scales[1]) + assert np.allclose(results[0]['SE'] / scales[0], + results[1]['SE'] / scales[1]) + assert np.allclose(results[0]['upper_confidence'] / scales[0], + results[1]['upper_confidence'] / scales[1]) + assert np.allclose(results[0]['lower_confidence'] / scales[0], + results[1]['lower_confidence'] / scales[1]) + assert np.allclose(results[0]['Zvalue'], + results[1]['Zvalue']) + assert np.allclose(results[0]['pvalue'], + results[1]['pvalue']) + +def test_scale_invariant(n=200, + p=20, + signal_fac=10., + s=5, + sigma=3, + rho=0.4, + randomizer_scale=1, + full_dispersion=True, + seed=2): + """ + Confirm Gaussian version is appropriately scale invariant + """ + + inst, const = gaussian_instance, lasso.gaussian + signal = np.sqrt(signal_fac * 2 * np.log(p)) + + results = [] + + scales = [1, 5] + for scale in scales: + + np.random.seed(seed) + X, Y, beta = inst(n=n, + p=p, + signal=signal, + s=s, + equicorrelated=False, + rho=rho, + sigma=sigma, + random_signs=True)[:3] + + Y *= scale; beta *= scale + n, p = X.shape + + sigma_ = np.std(Y) + W = np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ + print('W', W[0]/scale) + conv = const(X, + Y, + W, + randomizer_scale=randomizer_scale * sigma_) + + signs = conv.fit() + nonzero = signs != 0 + print('nonzero', np.where(nonzero)[0]) + print('feature_weights', conv.feature_weights[0] / scale) + print('perturb', conv._initial_omega[0] / scale) + dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) + + (observed_target, + cov_target, + cov_target_score, + alternatives) = selected_targets(conv.loglike, + conv._W, + nonzero, + dispersion=dispersion) + + print('dispersion', dispersion/scale**2) + print('target', observed_target[0]/scale) + print('cov_target', cov_target[0,0]/scale**2) + print('cov_target_score', cov_target_score[0,0]/scale**2) + + result = conv.selective_MLE(observed_target, + cov_target, + cov_target_score)[0] + + print(result['MLE'] / scale) + results.append(result) + + assert np.allclose(results[0]['MLE'] / scales[0], + results[1]['MLE'] / scales[1]) + assert np.allclose(results[0]['SE'] / scales[0], + results[1]['SE'] / scales[1]) + assert np.allclose(results[0]['upper_confidence'] / scales[0], + results[1]['upper_confidence'] / scales[1]) + assert np.allclose(results[0]['lower_confidence'] / scales[0], + results[1]['lower_confidence'] / scales[1]) + assert np.allclose(results[0]['Zvalue'], + results[1]['Zvalue']) + assert np.allclose(results[0]['pvalue'], + results[1]['pvalue']) + def main(nsim=500, full=False): P0, PA, cover, length_int = [], [], [], [] @@ -222,14 +738,3 @@ def test_instance(): return coverage -def main(nsim=500): - - cover = [] - for i in range(nsim): - - cover_ = test_instance() - cover.extend(cover_) - print(np.mean(cover), 'coverage so far ') - -if __name__ == "__main__": - main(nsim=500) diff --git a/selectinf/tests/instance.py b/selectinf/tests/instance.py index 
0c5662b21..5035518c6 100644 --- a/selectinf/tests/instance.py +++ b/selectinf/tests/instance.py @@ -31,9 +31,16 @@ def AR1(rho, p): X = np.random.standard_normal((n, p)).dot(cholX.T) return X, sigmaX, cholX -def gaussian_instance(n=100, p=200, s=7, sigma=5, rho=0., signal=7, - random_signs=False, df=np.inf, - scale=True, center=True, +def gaussian_instance(n=100, + p=200, + s=7, + sigma=5, + rho=0., + signal=7, + random_signs=False, + df=np.inf, + scale=True, + center=True, equicorrelated=True): @@ -61,14 +68,13 @@ def gaussian_instance(n=100, p=200, s=7, sigma=5, rho=0., signal=7, sigma : float Noise level - rho : float - Equicorrelation value (must be in interval [0,1]) + rho : float + Correlation parameter. Must be in interval [0,1] for + equicorrelated, [-1,1] for AR(1). signal : float or (float, float) Sizes for the coefficients. If a tuple -- then coefficients are equally spaced between these values using np.linspace. - Note: the size of signal is for a "normalized" design, where np.diag(X.T.dot(X)) == np.ones(p). - If scale=False, this signal is divided by np.sqrt(n), otherwise it is unchanged. random_signs : bool If true, assign random signs to coefficients. @@ -77,9 +83,15 @@ def gaussian_instance(n=100, p=200, s=7, sigma=5, rho=0., signal=7, df : int Degrees of freedom for noise (from T distribution). - equicorrelated: bool - If true, design in equi-correlated, - Else design is AR. + scale : bool + Scale columns of design matrix? + + center : bool + Center columns of design matrix? + + equicorrelated : bool + Should columns of design be equi-correlated + or AR(1)? Returns ------- @@ -101,6 +113,13 @@ def gaussian_instance(n=100, p=200, s=7, sigma=5, rho=0., signal=7, sigmaX : np.ndarray((p,p)) Row covariance. + + Notes + ----- + + The size of signal is for a "normalized" design, where np.diag(X.T.dot(X)) == np.ones(p). + If scale=False, this signal is divided by np.sqrt(n), otherwise it is unchanged. + """ X, sigmaX = _design(n, p, rho, equicorrelated)[:2] @@ -166,19 +185,28 @@ def logistic_instance(n=100, s : int True sparsity - rho : float - Equicorrelation value (must be in interval [0,1]) + rho : float + Correlation parameter. Must be in interval [0,1] for + equicorrelated, [-1,1] for AR(1). signal : float or (float, float) Sizes for the coefficients. If a tuple -- then coefficients are equally spaced between these values using np.linspace. - Note: the size of signal is for a "normalized" design, where np.diag(X.T.dot(X)) == np.ones(p). - If scale=False, this signal is divided by np.sqrt(n), otherwise it is unchanged. random_signs : bool If true, assign random signs to coefficients. Else they are all positive. + scale : bool + Scale columns of design matrix? + + center : bool + Center columns of design matrix? + + equicorrelated : bool + Should columns of design be equi-correlated + or AR(1)? + Returns ------- @@ -197,6 +225,11 @@ def logistic_instance(n=100, sigmaX : np.ndarray((p,p)) Row covariance. + Notes + ----- + + The size of signal is for a "normalized" design, where np.diag(X.T.dot(X)) == np.ones(p). + If scale=False, this signal is divided by np.sqrt(n), otherwise it is unchanged. 
""" X, sigmaX = _design(n, p, rho, equicorrelated)[:2] @@ -230,7 +263,11 @@ def logistic_instance(n=100, Y = np.random.binomial(1, pi) return X, Y, beta, np.nonzero(active)[0], sigmaX -def poisson_instance(n=100, p=200, s=7, rho=0.3, signal=4, +def poisson_instance(n=100, + p=200, + s=7, + rho=0.3, + signal=4, random_signs=False, scale=True, center=True, @@ -252,19 +289,28 @@ def poisson_instance(n=100, p=200, s=7, rho=0.3, signal=4, s : int True sparsity - rho : float - Equicorrelation value (must be in interval [0,1]) + rho : float + Correlation parameter. Must be in interval [0,1] for + equicorrelated, [-1,1] for AR(1). signal : float or (float, float) Sizes for the coefficients. If a tuple -- then coefficients are equally spaced between these values using np.linspace. - Note: the size of signal is for a "normalized" design, where np.diag(X.T.dot(X)) == np.ones(p). - If scale=False, this signal is divided by np.sqrt(n), otherwise it is unchanged. random_signs : bool If true, assign random signs to coefficients. Else they are all positive. + scale : bool + Scale columns of design matrix? + + center : bool + Center columns of design matrix? + + equicorrelated : bool + Should columns of design be equi-correlated + or AR(1)? + Returns ------- @@ -283,6 +329,11 @@ def poisson_instance(n=100, p=200, s=7, rho=0.3, signal=4, sigmaX : np.ndarray((p,p)) Row covariance. + Notes + ----- + + The size of signal is for a "normalized" design, where np.diag(X.T.dot(X)) == np.ones(p). + If scale=False, this signal is divided by np.sqrt(n), otherwise it is unchanged. """ X, sigmaX = _design(n, p, rho, equicorrelated)[:2] @@ -316,6 +367,118 @@ def poisson_instance(n=100, p=200, s=7, rho=0.3, signal=4, Y = np.random.poisson(mu) return X, Y, beta, np.nonzero(active)[0], sigmaX +def cox_instance(n=100, + p=200, + s=7, + rho=0.3, + signal=4, + random_signs=False, + scale=True, + center=True, + p_censor=0.1, + equicorrelated=True): + """A testing instance for the LASSO. + Design is equi-correlated in the population, + normalized to have columns of norm 1. + + Parameters + ---------- + + n : int + Sample size + + p : int + Number of features + + s : int + True sparsity + + rho : float + Correlation parameter. Must be in interval [0,1] for + equicorrelated, [-1,1] for AR(1). + + signal : float or (float, float) + Sizes for the coefficients. If a tuple -- then coefficients + are equally spaced between these values using np.linspace. + + random_signs : bool + If true, assign random signs to coefficients. + Else they are all positive. + + scale : bool + Scale columns of design matrix? + + center : bool + Center columns of design matrix? + + equicorrelated : bool + Should columns of design be equi-correlated + or AR(1)? + + p_censor : float + Probability of right-censorship. + + Returns + ------- + + X : np.float((n,p)) + Design matrix. + + T : np.float(n) + Response vector of times. + + S : np.bool(n) + Right-censoring status. + + beta : np.float(p) + True coefficients. + + active : np.int(s) + Non-zero pattern. + + sigmaX : np.ndarray((p,p)) + Row covariance. + + Notes + ----- + + The size of signal is for a "normalized" design, where np.diag(X.T.dot(X)) == np.ones(p). + If scale=False, this signal is divided by np.sqrt(n), otherwise it is unchanged. 
+ + """ + + X, sigmaX = _design(n, p, rho, equicorrelated)[:2] + + if center: + X -= X.mean(0)[None,:] + + beta = np.zeros(p) + signal = np.atleast_1d(signal) + if signal.shape == (1,): + beta[:s] = signal[0] + else: + beta[:s] = np.linspace(signal[0], signal[1], s) + if random_signs: + beta[:s] *= (2 * np.random.binomial(1, 0.5, size=(s,)) - 1.) + np.random.shuffle(beta) + beta /= np.sqrt(n) + + if scale: + scaling = X.std(0) * np.sqrt(n) + X /= scaling[None, :] + beta *= np.sqrt(n) + sigmaX = sigmaX / np.multiply.outer(scaling, scaling) + + active = np.zeros(p, np.bool) + active[beta != 0] = True + + eta = linpred = np.dot(X, beta) + mu = np.exp(eta) + + T = np.random.exponential(mu) + S = np.random.choice([0,1], n, p=[p_censor,1-p_censor]) + return X, T, S, beta, np.nonzero(active)[0], sigmaX + def HIV_NRTI(drug='3TC', standardize=True, datafile=None, From 812d4e1ea77eb6e0fe24a8cbf2ae3be4bfce828a Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Mon, 19 Apr 2021 13:55:29 -0400 Subject: [PATCH 083/187] new pivots based on exact reference --- selectinf/randomized/exact_reference.py | 272 ++++++++++++++++++ .../randomized/tests/test_exact_reference.py | 92 ++++++ 2 files changed, 364 insertions(+) create mode 100644 selectinf/randomized/exact_reference.py create mode 100644 selectinf/randomized/tests/test_exact_reference.py diff --git a/selectinf/randomized/exact_reference.py b/selectinf/randomized/exact_reference.py new file mode 100644 index 000000000..7626ea182 --- /dev/null +++ b/selectinf/randomized/exact_reference.py @@ -0,0 +1,272 @@ +from __future__ import division, print_function + +import numpy as np, pandas as pd +from scipy.interpolate import interp1d +from scipy.stats import norm as ndist + +from ..distributions.discrete_family import discrete_family + +class exact_grid_inference(object): + + def __init__(self, + query, + observed_target, + target_cov, + target_score_cov, + solve_args={'tol': 1.e-12}): + + """ + Produce p-values and confidence intervals for targets + of model including selected features + + Parameters + ---------- + + query : `gaussian_query` + A Gaussian query which has information + to describe implied Gaussian. + + observed_target : ndarray + Observed estimate of target. + + target_cov : ndarray + Estimated covaraince of target. + + target_score_cov : ndarray + Estimated covariance of target and score of randomized query. + + solve_args : dict, optional + Arguments passed to solver. + + """ + + self.solve_args = solve_args + + result, inverse_info = query.selective_MLE(observed_target, + target_cov, + target_score_cov, + solve_args=solve_args)[:2] + mle = result['MLE'] + + self.linear_part = query.sampler.affine_con.linear_part + self.offset = query.sampler.affine_con.offset + + self.logdens_linear = query.sampler.logdens_transform[0] + self.cond_mean = query.cond_mean + self.prec_opt = np.linalg.inv(query.cond_cov) + self.cond_cov = query.cond_cov + + self.observed_target = observed_target + self.target_score_cov = target_score_cov + self.target_cov = target_cov + + self.init_soln = query.observed_opt_state + + self.ntarget = ntarget = target_cov.shape[0] + _scale = 4. * np.sqrt(np.diag(inverse_info)) + ngrid = 60 + + self.stat_grid = np.zeros((ntarget, ngrid)) + for j in range(ntarget): + self.stat_grid[j, :] = np.linspace(observed_target[j] - 1. * _scale[j], + observed_target[j] + 1. 
* _scale[j], + num=ngrid) + + def summary(self, + alternatives=None, + parameter=None, + level=0.9): + """ + Produce p-values and confidence intervals for targets + of model including selected features + + Parameters + ---------- + + alternatives : [str], optional + Sequence of strings describing the alternatives, + should be values of ['twosided', 'less', 'greater'] + + parameter : np.array + Hypothesized value for parameter -- defaults to 0. + + level : float + Confidence level. + + """ + + if parameter is not None: + pivots = self.approx_pivots(parameter, + alternatives=alternatives) + else: + pivots = None + + pvalues = self._approx_pivots(np.zeros_like(self.observed_target), + alternatives=alternatives) + lower, upper = self._approx_intervals(level=level) + + result = pd.DataFrame({'target': self.observed_target, + 'pvalue': pvalues, + 'lower_confidence': lower, + 'upper_confidence': upper}) + + if not np.all(parameter == 0): + result.insert(4, 'pivot', pivots) + result.insert(5, 'parameter', parameter) + + return result + + def log_reference(self, + observed_target, + target_cov, + target_score_cov, + grid): + + if np.asarray(observed_target).shape in [(), (0,)]: + raise ValueError('no target specified') + + prec_target = np.linalg.inv(target_cov) + target_lin = - self.logdens_linear.dot(target_score_cov.T.dot(prec_target)) + + ref_hat = [] + + for k in range(grid.shape[0]): + # in the usual D = N + Gamma theta.hat, + # target_lin is "something" times Gamma, + # where "something" comes from implied Gaussian + # cond_mean is "something" times D + # Gamma is target_score_cov.T.dot(prec_target) + + num_opt = self.prec_opt.shape[0] + num_con = self.linear_part.shape[0] + + cond_mean_grid = (target_lin.dot(np.atleast_1d(grid[k] - observed_target)) + + self.cond_mean) + + #direction for decomposing o + + eta = -self.prec_opt.dot(self.logdens_linear.dot(target_score_cov.T)) + + implied_mean = np.asscalar(eta.T.dot(cond_mean_grid)) + implied_cov = np.asscalar(eta.T.dot(self.cond_cov).dot(eta)) + implied_prec = 1./implied_cov + + _A = self.cond_cov.dot(eta) * implied_prec + A = self.linear_part.dot(_A).reshape((-1,)) + b = self.linear_part.dot((-np.identity(num_opt) + _A.dot(eta.T)).dot(self.init_soln)) + + neg_indx = np.asarray([j for j in range(num_con) if A[j] < 0.]) + pos_indx = np.asarray([j for j in range(num_con) if A[j] > 0.]) + + trunc_ = (self.offset + b) / A + + if pos_indx.shape[0]>0 and neg_indx.shape[0]>0: + + trunc_lower = np.max(trunc_[neg_indx]) + trunc_upper = np.min(trunc_[pos_indx]) + + lower_limit = (trunc_lower - implied_mean) * implied_prec + upper_limit = (trunc_upper - implied_mean) * implied_prec + + ref_hat.append(np.log(ndist.cdf(upper_limit) - ndist.cdf(lower_limit))) + + elif pos_indx.shape[0] == num_con: + + trunc_upper = np.min(trunc_[pos_indx]) + + upper_limit = (trunc_upper - implied_mean) * implied_prec + + ref_hat.append(np.log(ndist.cdf(upper_limit))) + + else: + + trunc_lower = np.max(trunc_[neg_indx]) + + lower_limit = (trunc_lower - implied_mean) * implied_prec + + ref_hat.append(np.log(1. 
- ndist.cdf(lower_limit))) + + return np.asarray(ref_hat) + + def _construct_families(self): + + self._families = [] + for m in range(self.ntarget): + p = self.target_score_cov.shape[1] + observed_target_uni = (self.observed_target[m]).reshape((1,)) + target_cov_uni = (np.diag(self.target_cov)[m]).reshape((1, 1)) + var_target = target_cov_uni[0, 0] + target_score_cov_uni = self.target_score_cov[m, :].reshape((1, p)) + + log_ref = self.log_reference(observed_target_uni, + target_cov_uni, + target_score_cov_uni, + self.stat_grid[m]) + + grid_approx_fn = interp1d(self.stat_grid[m], + log_ref, + kind='quadratic', + bounds_error=False, + fill_value='extrapolate') + + grid = np.linspace(self.stat_grid[m].min(), self.stat_grid[m].max(), 1000) + logW = (grid_approx_fn(grid) - + 0.5 * (grid - self.observed_target[m]) ** 2 / var_target) + logW -= logW.max() + + # construction of families follows `selectinf.learning.core` + + self._families.append(discrete_family(grid, + np.exp(logW))) + + def _pivots(self, + mean_parameter, + alternatives=None): + + if not hasattr(self, "_families"): + self._construct_families() + + if alternatives is None: + alternatives = ['twosided'] * self.ntarget + else: + alternatives = [alternatives] *self.ntarget + pivot = [] + + for m in range(self.ntarget): + family = self._families[m] + observed_target = self.observed_target[m] + var_target = self.target_cov[m, m] + + # construction of pivot from families follows `selectinf.learning.core` + + _cdf = family.cdf((mean_parameter[m] - observed_target) / var_target, + x=observed_target) + if alternatives[m] == 'twosided': + pivot.append(2 * min(_cdf, 1 - _cdf)) + elif alternatives[m] == 'greater': + pivot.append(1 - _cdf) + elif alternatives[m] == 'less': + pivot.append(_cdf) + else: + raise ValueError('alternative should be in ["twosided", "less", "greater"]') + return pivot + + def _intervals(self, + level=0.9): + + if not hasattr(self, "_families"): + self._construct_families() + + lower, upper = [], [] + + for m in range(self.ntarget): + # construction of intervals from families follows `selectinf.learning.core` + family = self._families[m] + observed_target = self.observed_target[m] + l, u = family.equal_tailed_interval(observed_target, + alpha=1 - level) + var_target = self.target_cov[m, m] + lower.append(l * var_target + observed_target) + upper.append(u * var_target + observed_target) + + return np.asarray(lower), np.asarray(upper) \ No newline at end of file diff --git a/selectinf/randomized/tests/test_exact_reference.py b/selectinf/randomized/tests/test_exact_reference.py new file mode 100644 index 000000000..b8835ecb2 --- /dev/null +++ b/selectinf/randomized/tests/test_exact_reference.py @@ -0,0 +1,92 @@ +import numpy as np + +from ...tests.instance import gaussian_instance +from ..lasso import lasso, selected_targets +from ..exact_reference import exact_grid_inference + +def test_approx_pivot(n=500, + p=100, + signal_fac=1., + s=5, + sigma=2., + rho=0.4, + randomizer_scale=1.): + + inst, const = gaussian_instance, lasso.gaussian + signal = np.sqrt(signal_fac * 2 * np.log(p)) + + X, Y, beta = inst(n=n, + p=p, + signal=signal, + s=s, + equicorrelated=False, + rho=rho, + sigma=sigma, + random_signs=True)[:3] + + n, p = X.shape + + sigma_ = np.std(Y) + dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) + + W = 1 * np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * np.sqrt(dispersion) + + conv = const(X, + Y, + W, + randomizer_scale=randomizer_scale * dispersion) + + signs = conv.fit() + nonzero 
= signs != 0 + print("size of selected set ", nonzero.sum()) + + if nonzero.sum()>0: + beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) + + (observed_target, + cov_target, + cov_target_score, + alternatives) = selected_targets(conv.loglike, + conv._W, + nonzero, + dispersion=dispersion) + + exact_grid_inf = exact_grid_inference(conv, + observed_target, + cov_target, + cov_target_score) + + pivot = exact_grid_inf._pivots(beta_target) + + return pivot + + + +def main(nsim=300): + + import matplotlib as mpl + mpl.use('tkagg') + import matplotlib.pyplot as plt + from statsmodels.distributions.empirical_distribution import ECDF + + _pivot = [] + for i in range(nsim): + _pivot.extend(test_approx_pivot(n=500, + p=100, + signal_fac=0.5, + s=5, + sigma=2., + rho=0.50, + randomizer_scale=1.)) + + print("iteration completed ", i) + + plt.clf() + ecdf_pivot = ECDF(np.asarray(_pivot)) + grid = np.linspace(0, 1, 101) + plt.plot(grid, ecdf_pivot(grid), c='blue', marker='^') + plt.plot(grid, grid, 'k--') + plt.show() + +if __name__ == "__main__": + main(nsim=10) \ No newline at end of file From 71bf5f1fc23e7ab8e1941dac0d0496164342dce5 Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Mon, 19 Apr 2021 15:48:35 -0400 Subject: [PATCH 084/187] added test for ci --- selectinf/randomized/exact_reference.py | 2 +- .../randomized/tests/test_exact_reference.py | 123 +++++++++++++++--- 2 files changed, 104 insertions(+), 21 deletions(-) diff --git a/selectinf/randomized/exact_reference.py b/selectinf/randomized/exact_reference.py index 7626ea182..5e5c43db8 100644 --- a/selectinf/randomized/exact_reference.py +++ b/selectinf/randomized/exact_reference.py @@ -64,7 +64,7 @@ def __init__(self, self.ntarget = ntarget = target_cov.shape[0] _scale = 4. * np.sqrt(np.diag(inverse_info)) - ngrid = 60 + ngrid = 40 self.stat_grid = np.zeros((ntarget, ngrid)) for j in range(ntarget): diff --git a/selectinf/randomized/tests/test_exact_reference.py b/selectinf/randomized/tests/test_exact_reference.py index b8835ecb2..23a091b70 100644 --- a/selectinf/randomized/tests/test_exact_reference.py +++ b/selectinf/randomized/tests/test_exact_reference.py @@ -60,33 +60,116 @@ def test_approx_pivot(n=500, return pivot +def test_approx_ci(n=500, + p=100, + signal_fac=1., + s=5, + sigma=2., + rho=0.4, + randomizer_scale=1., + level=0.9): + inst, const = gaussian_instance, lasso.gaussian + signal = np.sqrt(signal_fac * 2 * np.log(p)) + + X, Y, beta = inst(n=n, + p=p, + signal=signal, + s=s, + equicorrelated=False, + rho=rho, + sigma=sigma, + random_signs=True)[:3] + + n, p = X.shape + + sigma_ = np.std(Y) + dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) + + W = 1 * np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * np.sqrt(dispersion) + + conv = const(X, + Y, + W, + randomizer_scale=randomizer_scale * dispersion) -def main(nsim=300): + signs = conv.fit() + nonzero = signs != 0 + + if nonzero.sum()>0: + + (observed_target, + cov_target, + cov_target_score, + alternatives) = selected_targets(conv.loglike, + conv._W, + nonzero, + dispersion=dispersion) + + result, inverse_info = conv.selective_MLE(observed_target, + cov_target, + cov_target_score)[:2] + + exact_grid_inf = exact_grid_inference(conv, + observed_target, + cov_target, + cov_target_score) + + lci, uci = exact_grid_inf._intervals(level) + + beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) + coverage = (lci < beta_target) * (uci > beta_target) + length = uci - lci + + return np.mean(coverage), np.mean(length), np.mean(length-(3.3 
* np.sqrt(np.diag(inverse_info)))) + +def main(nsim=300, CI=False): import matplotlib as mpl mpl.use('tkagg') import matplotlib.pyplot as plt from statsmodels.distributions.empirical_distribution import ECDF - _pivot = [] - for i in range(nsim): - _pivot.extend(test_approx_pivot(n=500, - p=100, - signal_fac=0.5, - s=5, - sigma=2., - rho=0.50, - randomizer_scale=1.)) - - print("iteration completed ", i) - - plt.clf() - ecdf_pivot = ECDF(np.asarray(_pivot)) - grid = np.linspace(0, 1, 101) - plt.plot(grid, ecdf_pivot(grid), c='blue', marker='^') - plt.plot(grid, grid, 'k--') - plt.show() + if CI is False: + _pivot = [] + for i in range(nsim): + _pivot.extend(test_approx_pivot(n=500, + p=100, + signal_fac=0.5, + s=5, + sigma=3., + rho=0.50, + randomizer_scale=0.7)) + + print("iteration completed ", i) + + plt.clf() + ecdf_pivot = ECDF(np.asarray(_pivot)) + grid = np.linspace(0, 1, 101) + plt.plot(grid, ecdf_pivot(grid), c='blue', marker='^') + plt.plot(grid, grid, 'k--') + plt.show() + + if CI is True: + coverage_ = 0. + length_ = 0. + length_diff_ = 0. + for n in range(nsim): + cov, len, len_diff = test_approx_ci(n=500, + p=100, + signal_fac=1., + s=5, + sigma=3., + rho=0.50, + randomizer_scale=1.) + + coverage_ += cov + length_ += len + length_diff_ += len_diff + print("coverage so far ", coverage_ / (n + 1.)) + print("lengths so far ", length_ / (n + 1.), length_diff_/(n+1.)) + print("iteration completed ", n + 1) + if __name__ == "__main__": - main(nsim=10) \ No newline at end of file + main(nsim=50, CI=True) \ No newline at end of file From c8d31900d0f11192b26b25b0a2bfed11137ca467 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 28 Apr 2021 09:59:51 -0400 Subject: [PATCH 085/187] adding level argument for approximate reference --- selectinf/randomized/query.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index c5ab43ff6..b45f59978 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -393,6 +393,7 @@ def approximate_grid_inference(self, target_cov, target_score_cov, alternatives=None, + level=0.9, solve_args={'tol': 1.e-12}): """ @@ -413,6 +414,9 @@ def approximate_grid_inference(self, Sequence of strings describing the alternatives, should be values of ['twosided', 'less', 'greater'] + level : float, optional + Confidence level. + solve_args : dict, optional Arguments passed to solver. 
@@ -435,7 +439,8 @@ def approximate_grid_inference(self, self.sampler.affine_con.offset, solve_args=solve_args) - return G.summary(alternatives=alternatives) + return G.summary(alternatives=alternatives, + level=level) class multiple_queries(object): From 80c0cd840b307f2e67b3bb066e9a56bc62fd492b Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Thu, 29 Apr 2021 09:22:44 -0400 Subject: [PATCH 086/187] tests for new equicorrelated instance --- .../randomized/tests/test_exact_reference.py | 31 +++---- .../tests/test_selective_MLE_high.py | 81 +++++++++++++++++-- 2 files changed, 93 insertions(+), 19 deletions(-) diff --git a/selectinf/randomized/tests/test_exact_reference.py b/selectinf/randomized/tests/test_exact_reference.py index 23a091b70..c023b0d65 100644 --- a/selectinf/randomized/tests/test_exact_reference.py +++ b/selectinf/randomized/tests/test_exact_reference.py @@ -17,23 +17,26 @@ def test_approx_pivot(n=500, X, Y, beta = inst(n=n, p=p, - signal=signal, + signal=0, s=s, - equicorrelated=False, + equicorrelated=True, rho=rho, sigma=sigma, - random_signs=True)[:3] + random_signs=False)[:3] n, p = X.shape sigma_ = np.std(Y) - dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) + #dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) + dispersion = sigma_ ** 2 - W = 1 * np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * np.sqrt(dispersion) + #W = 1 * np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * np.sqrt(dispersion) + eps = np.random.standard_normal((n, 2000)) * Y.std() + lam_theory = 0.7 * np.median(np.abs(X.T.dot(eps)).max(1)) conv = const(X, Y, - W, + lam_theory * np.ones(p), randomizer_scale=randomizer_scale * dispersion) signs = conv.fit() @@ -49,7 +52,7 @@ def test_approx_pivot(n=500, alternatives) = selected_targets(conv.loglike, conv._W, nonzero, - dispersion=dispersion) + dispersion=None) exact_grid_inf = exact_grid_inference(conv, observed_target, @@ -133,12 +136,12 @@ def main(nsim=300, CI=False): if CI is False: _pivot = [] for i in range(nsim): - _pivot.extend(test_approx_pivot(n=500, - p=100, - signal_fac=0.5, - s=5, - sigma=3., - rho=0.50, + _pivot.extend(test_approx_pivot(n=100, + p=400, + signal_fac=1., + s=0, + sigma=1., + rho=0.30, randomizer_scale=0.7)) print("iteration completed ", i) @@ -172,4 +175,4 @@ def main(nsim=300, CI=False): if __name__ == "__main__": - main(nsim=50, CI=True) \ No newline at end of file + main(nsim=50, CI=False) \ No newline at end of file diff --git a/selectinf/randomized/tests/test_selective_MLE_high.py b/selectinf/randomized/tests/test_selective_MLE_high.py index e38bde4fa..17ecb0423 100644 --- a/selectinf/randomized/tests/test_selective_MLE_high.py +++ b/selectinf/randomized/tests/test_selective_MLE_high.py @@ -204,6 +204,77 @@ def test_instance(): # cover.extend(cover_) # print(np.mean(cover), 'coverage so far ') + +def test_selected_targets_disperse(n=500, + p=100, + signal_fac=1., + s=5, + sigma=1., + rho=0.4, + randomizer_scale=1, + full_dispersion=True): + """ + Compare to R randomized lasso + """ + + inst, const = gaussian_instance, lasso.gaussian + signal = 1. + + while True: + X, Y, beta = inst(n=n, + p=p, + signal=signal, + s=s, + equicorrelated=False, + rho=rho, + sigma=sigma, + random_signs=True)[:3] + + idx = np.arange(p) + sigmaX = rho ** np.abs(np.subtract.outer(idx, idx)) + print("snr", beta.T.dot(sigmaX).dot(beta) / ((sigma ** 2.) 
* n)) + + n, p = X.shape + + sigma_ = np.std(Y) + W = np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ + + conv = const(X, + Y, + W, + randomizer_scale=randomizer_scale * sigma_) + + signs = conv.fit() + nonzero = signs != 0 + print("dimensions", n, p, nonzero.sum()) + + if nonzero.sum() > 0: + dispersion = None + if full_dispersion: + dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) + + (observed_target, + cov_target, + cov_target_score, + alternatives) = selected_targets(conv.loglike, + conv._W, + nonzero, + dispersion=dispersion) + + result = conv.selective_MLE(observed_target, + cov_target, + cov_target_score)[0] + + pval = result['pvalue'] + intervals = np.asarray(result[['lower_confidence', 'upper_confidence']]) + + beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) + + coverage = (beta_target > intervals[:, 0]) * (beta_target < intervals[:, 1]) + + return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0], coverage, intervals + + def main(nsim=500, full=False): P0, PA, cover, length_int = [], [], [], [] from statsmodels.distributions import ECDF @@ -220,17 +291,17 @@ def main(nsim=500, full=False): avg_length = intervals[:, 1] - intervals[:, 0] else: full_dispersion = True - p0, pA, cover_, intervals = test_selected_targets(n=n, p=p, s=s, + p0, pA, cover_, intervals = test_selected_targets_disperse(n=n, p=p, s=int(p/2), full_dispersion=full_dispersion) avg_length = intervals[:, 1] - intervals[:, 0] cover.extend(cover_) P0.extend(p0) PA.extend(pA) - print( - np.array(PA) < 0.1, np.mean(P0), np.std(P0), np.mean(np.array(P0) < 0.1), np.mean(np.array(PA) < 0.1), np.mean(cover), - np.mean(avg_length), 'null pvalue + power + length') - + # print( + # np.array(PA) < 0.1, np.mean(P0), np.std(P0), np.mean(np.array(P0) < 0.1), np.mean(np.array(PA) < 0.1), np.mean(cover), + # np.mean(avg_length), 'null pvalue + power + length') + print("coverage and lengths ", np.mean(cover), np.mean(avg_length)) if __name__ == "__main__": main(nsim=100) From 8cd202bff7dffac9b829abdc5496c87056d0bd2a Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Sat, 15 May 2021 22:41:57 -0400 Subject: [PATCH 087/187] commit changes --- selectinf/randomized/lasso.py | 2 +- .../randomized/tests/test_approx_reference.py | 57 +++++++++++++------ 2 files changed, 40 insertions(+), 19 deletions(-) diff --git a/selectinf/randomized/lasso.py b/selectinf/randomized/lasso.py index f06e837eb..12d153ddb 100644 --- a/selectinf/randomized/lasso.py +++ b/selectinf/randomized/lasso.py @@ -321,7 +321,7 @@ def gaussian(X, ridge_term = np.std(Y) * np.sqrt(mean_diag) / np.sqrt(n - 1) if randomizer_scale is None: - randomizer_scale = np.sqrt(mean_diag) * 0.5 * np.std(Y) * np.sqrt(n / (n - 1.)) + randomizer_scale = np.sqrt(mean_diag) * 0.7 * np.std(Y) * np.sqrt(n / (n - 1.)) randomizer = randomization.isotropic_gaussian((p,), randomizer_scale) diff --git a/selectinf/randomized/tests/test_approx_reference.py b/selectinf/randomized/tests/test_approx_reference.py index 1832b7cbe..4a9276741 100644 --- a/selectinf/randomized/tests/test_approx_reference.py +++ b/selectinf/randomized/tests/test_approx_reference.py @@ -70,36 +70,54 @@ def test_approx_pivot(n=500, p=p, signal=signal, s=s, - equicorrelated=False, + equicorrelated=True, rho=rho, sigma=sigma, - random_signs=True)[:3] + random_signs=False)[:3] n, p = X.shape sigma_ = np.std(Y) - dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) + if n>p: + dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / 
(n - p) + else: + dispersion = sigma_ ** 2 - W = 1 * np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ + #W = 1 * np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ + eps = np.random.standard_normal((n, 2000)) * Y.std() + lam_theory = 0.7 * np.median(np.abs(X.T.dot(eps)).max(1)) + W = lam_theory * np.ones(p) conv = const(X, Y, W, - randomizer_scale=randomizer_scale * dispersion) + ridge_term = 0.) + #randomizer_scale=randomizer_scale * dispersion) signs = conv.fit() nonzero = signs != 0 + print("number of selected ", nonzero.sum()) if nonzero.sum()>0: - beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) - (observed_target, - cov_target, - cov_target_score, - alternatives) = selected_targets(conv.loglike, - conv._W, - nonzero, - dispersion=dispersion) + beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) + if n>p: + (observed_target, + cov_target, + cov_target_score, + alternatives) = selected_targets(conv.loglike, + conv._W, + nonzero, + dispersion=dispersion) + + else: + (observed_target, + cov_target, + cov_target_score, + alternatives) = selected_targets(conv.loglike, + conv._W, + nonzero, + dispersion=sigma ** 2) inverse_info = conv.selective_MLE(observed_target, cov_target, @@ -191,17 +209,20 @@ def test_approx_ci(n=500, def main(nsim=300, CI = False): + import matplotlib as mpl + mpl.use('tkagg') import matplotlib.pyplot as plt from statsmodels.distributions.empirical_distribution import ECDF + if CI is False: _pivot = [] for i in range(nsim): - _pivot.extend(test_approx_pivot(n=200, - p=100, + _pivot.extend(test_approx_pivot(n=100, + p=400, signal_fac=1., - s=5, - sigma=3., - rho=0.20, + s=10, + sigma=5., + rho=0.30, randomizer_scale=1.)) print("iteration completed ", i) From 18033ae85a5679791115b433b3a04711f9a6589e Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Tue, 18 May 2021 10:35:54 -0400 Subject: [PATCH 088/187] test to compare unbiased estimates --- selectinf/randomized/lasso.py | 2 +- .../tests/test_unbiased_estimates.py | 150 ++++++++++++++++++ 2 files changed, 151 insertions(+), 1 deletion(-) create mode 100644 selectinf/randomized/tests/test_unbiased_estimates.py diff --git a/selectinf/randomized/lasso.py b/selectinf/randomized/lasso.py index 12d153ddb..ca7d133db 100644 --- a/selectinf/randomized/lasso.py +++ b/selectinf/randomized/lasso.py @@ -210,7 +210,7 @@ def signed_basis_vector(p, j, s): unpenalized_directions) opt_offset = self.initial_subgrad - + self.opt_linear = opt_linear # now make the constraints and implied gaussian self._setup = True diff --git a/selectinf/randomized/tests/test_unbiased_estimates.py b/selectinf/randomized/tests/test_unbiased_estimates.py new file mode 100644 index 000000000..1a9918e72 --- /dev/null +++ b/selectinf/randomized/tests/test_unbiased_estimates.py @@ -0,0 +1,150 @@ +import numpy as np + +from ..lasso import lasso, selected_targets +from ...tests.instance import gaussian_instance + +def UMVU(query, + X, + Y, + nonzero, + feat, + dispersion): + + n, p = X.shape + + nopt = nonzero.sum() + + _, randomizer_prec = query.randomizer.cov_prec + + implied_precision = np.zeros((n + nopt, n + nopt)) + + implied_precision[:n][:, :n] = (1. 
/ dispersion) * (np.identity(n)) + (X.dot(X.T) * randomizer_prec) + + implied_precision[n:][:, :n] = -query.opt_linear.T.dot(X.T) * randomizer_prec + + implied_precision[:n][:, n:] = implied_precision[n:][:, :n].T + + implied_precision[n:][:, n:] = query.opt_linear.T.dot(query.opt_linear) * randomizer_prec + + implied_cov = np.linalg.inv(implied_precision) + + _prec = np.linalg.inv(implied_cov[:n][:, :n]) + + linear_coef = (np.linalg.pinv(X[:, feat]).dot(_prec)) + offset = -np.linalg.pinv(X[:, feat]).dot(X.dot(query.initial_subgrad) + - _prec.dot(implied_cov[:n][:, n:]).dot(query.opt_linear.T.dot(query.initial_subgrad))) * (randomizer_prec) + + linear_coef *= dispersion + offset *= dispersion + UMVU = linear_coef.dot(Y) + offset + + return UMVU + +def EST(query, + X, + Y, + nonzero, + feat, + dispersion): + + (observed_target, + cov_target, + cov_target_score, + alternatives) = selected_targets(query.loglike, + query._W, + feat, + dispersion=dispersion) + + _, randomizer_prec = query.randomizer.cov_prec + cond_cov = query.cond_cov + logdens_linear = query.sampler.logdens_transform[0] + cond_mean = query.cond_mean + + prec_target = np.linalg.inv(cov_target) + prec_opt = np.linalg.inv(cond_cov) + + target_linear = cov_target_score.T.dot(prec_target) + target_offset = (-X.T.dot(Y) + query.initial_subgrad) - target_linear.dot(observed_target) + + target_lin = - logdens_linear.dot(target_linear) + target_off = cond_mean - target_lin.dot(observed_target) + + _prec = prec_target + (target_linear.T.dot(target_linear) * randomizer_prec) - target_lin.T.dot( + prec_opt).dot(target_lin) + _P = target_linear.T.dot(target_offset) * randomizer_prec + + linear_coef = cov_target.dot(_prec) + offset = cov_target.dot(_P - target_lin.T.dot(prec_opt).dot(target_off)) + est = linear_coef.dot(observed_target) + offset + + return est + +def test_UMVU(n=500, + p=100, + signal_fac=1., + s=5, + sigma=3., + rho=0.7, + randomizer_scale=np.sqrt(0.5)): + + + inst, const = gaussian_instance, lasso.gaussian + signal = np.sqrt(signal_fac * 2 * np.log(p)) + + while True: + X, Y, beta = inst(n=n, + p=p, + signal=signal, + s=s, + equicorrelated=True, + rho=rho, + sigma=sigma, + random_signs=True)[:3] + + sigma_ = np.std(Y) + W = 0.8 * np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ + + conv = const(X, + Y, + W, + #ridge_term=0., + randomizer_scale=randomizer_scale * sigma) + + signs = conv.fit() + nonzero = signs != 0 + + if nonzero.sum() > 0: + #dispersion = sigma ** 2 + if p > n/2: + dispersion = np.std(Y) ** 2 + else: + dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) + + feat = nonzero.copy() + feat[-5:] = 1 + dispersion = np.linalg.norm(Y - X[:, feat].dot(np.linalg.pinv(X[:, feat]).dot(Y))) ** 2 / (n - feat.sum()) + + umvu = UMVU(conv, + X, + Y, + nonzero, + feat, + dispersion) + + est = EST(conv, + X, + Y, + nonzero, + feat, + dispersion) + + print("check ", np.allclose(est, umvu, atol=1e-04), umvu, est) + + return umvu, est + +def main(): + + test_UMVU(n=400, p=100, s=5) + +if __name__ == "__main__": + main() From b133174867104102d681aa5671dc97ff89abd8fc Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Mon, 24 May 2021 09:22:42 -0400 Subject: [PATCH 089/187] updates to test for unbiased est --- selectinf/randomized/tests/test_unbiased_estimates.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/selectinf/randomized/tests/test_unbiased_estimates.py b/selectinf/randomized/tests/test_unbiased_estimates.py index 1a9918e72..eb8beac0d 100644 --- 
a/selectinf/randomized/tests/test_unbiased_estimates.py +++ b/selectinf/randomized/tests/test_unbiased_estimates.py @@ -138,13 +138,13 @@ def test_UMVU(n=500, feat, dispersion) - print("check ", np.allclose(est, umvu, atol=1e-04), umvu, est) + print("check ", np.allclose(est-umvu, np.zeros(est.shape[0]), atol=1e-03), est-umvu) return umvu, est def main(): - test_UMVU(n=400, p=100, s=5) + test_UMVU(n=100, p=400, s=5) if __name__ == "__main__": main() From 93d1a67d07f1f9903fe7b9cbfb0140654cc22024 Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Mon, 24 May 2021 09:37:46 -0400 Subject: [PATCH 090/187] commit changes before switch --- selectinf/randomized/approx_reference.py | 10 +- selectinf/randomized/lasso.py | 2 +- .../randomized/tests/test_approx_reference.py | 139 +++++++++--------- 3 files changed, 76 insertions(+), 75 deletions(-) diff --git a/selectinf/randomized/approx_reference.py b/selectinf/randomized/approx_reference.py index af8b936c8..0041cccb7 100644 --- a/selectinf/randomized/approx_reference.py +++ b/selectinf/randomized/approx_reference.py @@ -64,9 +64,7 @@ def __init__(self, self.ntarget = ntarget = target_cov.shape[0] _scale = 4 * np.sqrt(np.diag(inverse_info)) - ngrid = 40 - - scale_ = 4 * np.max(np.sqrt(np.diag(inverse_info))) + ngrid = 60 self.stat_grid = np.zeros((ntarget, ngrid)) for j in range(ntarget): @@ -181,13 +179,14 @@ def _construct_families(self): grid = np.linspace(self.stat_grid[m].min(), self.stat_grid[m].max(), 1000) logW = (approx_fn(grid) - - 0.5 * (grid - self.observed_target[m])**2 / var_target) + 0.5 * (grid - self.observed_target[m])**2 / var_target) logW -= logW.max() + weights = np.exp(logW) # construction of families follows `selectinf.learning.core` self._families.append(discrete_family(grid, - np.exp(logW))) + weights)) # logG = - 0.5 * grid**2 / var_target # logG -= logG.max() @@ -223,6 +222,7 @@ def _approx_pivots(self, _cdf = family.cdf((mean_parameter[m] - observed_target) / var_target, x=observed_target) + #_cdf = family.cdf(mean_parameter[m]/var_target, x=observed_target) if alternatives[m] == 'twosided': pivot.append(2 * min(_cdf, 1 - _cdf)) elif alternatives[m] == 'greater': diff --git a/selectinf/randomized/lasso.py b/selectinf/randomized/lasso.py index ca7d133db..477b5b75c 100644 --- a/selectinf/randomized/lasso.py +++ b/selectinf/randomized/lasso.py @@ -321,7 +321,7 @@ def gaussian(X, ridge_term = np.std(Y) * np.sqrt(mean_diag) / np.sqrt(n - 1) if randomizer_scale is None: - randomizer_scale = np.sqrt(mean_diag) * 0.7 * np.std(Y) * np.sqrt(n / (n - 1.)) + randomizer_scale = np.sqrt(mean_diag) * 0.5 * np.std(Y) * np.sqrt(n / (n - 1.)) randomizer = randomization.isotropic_gaussian((p,), randomizer_scale) diff --git a/selectinf/randomized/tests/test_approx_reference.py b/selectinf/randomized/tests/test_approx_reference.py index 4a9276741..aaf2544c4 100644 --- a/selectinf/randomized/tests/test_approx_reference.py +++ b/selectinf/randomized/tests/test_approx_reference.py @@ -63,75 +63,76 @@ def test_approx_pivot(n=500, rho=0.4, randomizer_scale=1.): - inst, const = gaussian_instance, lasso.gaussian - signal = np.sqrt(signal_fac * 2 * np.log(p)) - - X, Y, beta = inst(n=n, - p=p, - signal=signal, - s=s, - equicorrelated=True, - rho=rho, - sigma=sigma, - random_signs=False)[:3] - - n, p = X.shape - - sigma_ = np.std(Y) - if n>p: - dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) - else: - dispersion = sigma_ ** 2 - - #W = 1 * np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ - eps = 
np.random.standard_normal((n, 2000)) * Y.std() - lam_theory = 0.7 * np.median(np.abs(X.T.dot(eps)).max(1)) - W = lam_theory * np.ones(p) - - conv = const(X, - Y, - W, - ridge_term = 0.) - #randomizer_scale=randomizer_scale * dispersion) + while True: - signs = conv.fit() - nonzero = signs != 0 - print("number of selected ", nonzero.sum()) + inst, const = gaussian_instance, lasso.gaussian + signal = np.sqrt(signal_fac * 2 * np.log(p)) - if nonzero.sum()>0: + X, Y, beta = inst(n=n, + p=p, + signal=signal, + s=s, + equicorrelated=True, + rho=rho, + sigma=sigma, + random_signs=True)[:3] - beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) - if n>p: - (observed_target, - cov_target, - cov_target_score, - alternatives) = selected_targets(conv.loglike, - conv._W, - nonzero, - dispersion=dispersion) + n, p = X.shape + sigma_ = np.std(Y) + if n > p: + dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) else: - (observed_target, - cov_target, - cov_target_score, - alternatives) = selected_targets(conv.loglike, - conv._W, - nonzero, - dispersion=sigma ** 2) - - inverse_info = conv.selective_MLE(observed_target, - cov_target, - cov_target_score)[1] - - approximate_grid_inf = approximate_grid_inference(conv, - observed_target, - cov_target, - cov_target_score) - - pivot = approximate_grid_inf._approx_pivots(beta_target) - - return pivot - + dispersion = sigma_ ** 2 + + # W = 1 * np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ + eps = np.random.standard_normal((n, 2000)) * Y.std() + lam_theory = 0.7 * np.median(np.abs(X.T.dot(eps)).max(1)) + W = lam_theory * np.ones(p) + + conv = const(X, + Y, + W, + ridge_term=0.) + # randomizer_scale=randomizer_scale * dispersion) + + signs = conv.fit() + nonzero = signs != 0 + print("number of selected ", nonzero.sum()) + + if nonzero.sum() > 0: + + beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) + if n > p: + (observed_target, + cov_target, + cov_target_score, + alternatives) = selected_targets(conv.loglike, + conv._W, + nonzero, + dispersion=dispersion) + + else: + (observed_target, + cov_target, + cov_target_score, + alternatives) = selected_targets(conv.loglike, + conv._W, + nonzero, + dispersion=sigma ** 2) + + inverse_info = conv.selective_MLE(observed_target, + cov_target, + cov_target_score)[1] + + approximate_grid_inf = approximate_grid_inference(conv, + observed_target, + cov_target, + cov_target_score) + + pivot = approximate_grid_inf._approx_pivots(beta_target) + + return pivot def test_approx_ci(n=500, p=100, @@ -219,9 +220,9 @@ def main(nsim=300, CI = False): for i in range(nsim): _pivot.extend(test_approx_pivot(n=100, p=400, - signal_fac=1., - s=10, - sigma=5., + signal_fac=0.5, + s=0, + sigma=1., rho=0.30, randomizer_scale=1.)) @@ -243,7 +244,7 @@ def main(nsim=300, CI = False): signal_fac=1., s=5, sigma=3., - rho=0.4, + rho=0.3, randomizer_scale=1.) 
coverage_ += cov @@ -253,4 +254,4 @@ def main(nsim=300, CI = False): print("iteration completed ", n + 1) if __name__ == "__main__": - main(nsim=40, CI = False) + main(nsim=50, CI = False) From a4c47ba9791b49473a28b7a955eaefdef58aedf5 Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Mon, 7 Jun 2021 12:18:31 -0400 Subject: [PATCH 091/187] MLE code updated --- selectinf/randomized/query.py | 807 ++++++------------ .../tests/test_selective_MLE_high.py | 56 +- 2 files changed, 305 insertions(+), 558 deletions(-) diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index aa1cbd8a6..05afbcd8e 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -16,42 +16,30 @@ from .selective_MLE_utils import solve_barrier_affine as solve_barrier_affine_C from .approx_reference import approximate_grid_inference -class query(object): +class query(object): r""" This class is the base of randomized selective inference based on convex programs. - The main mechanism is to take an initial penalized program - .. math:: - \text{minimize}_B \ell(B) + {\cal P}(B) - and add a randomization and small ridge term yielding - .. math:: - - \text{minimize}_B \ell(B) + {\cal P}(B) - + \text{minimize}_B \ell(B) + {\cal P}(B) - \langle \omega, B \rangle + \frac{\epsilon}{2} \|B\|^2_2 - """ def __init__(self, randomization, perturb=None): """ - Parameters ---------- - randomization : `selection.randomized.randomization.randomization` - Instance of a randomization scheme. + Instance of a randomization scheme. Describes the law of $\omega$. - perturb : ndarray, optional Value of randomization vector, an instance of $\omega$. - - """ self.randomization = randomization self.perturb = perturb @@ -64,21 +52,17 @@ def __init__(self, randomization, perturb=None): def randomize(self, perturb=None): """ - The actual randomization step. - Parameters ---------- - perturb : ndarray, optional Value of randomization vector, an instance of $\omega$. 
- """ if not self._randomized: - (self.randomized_loss, - self._initial_omega) = self.randomization.randomize(self.loss, - self.epsilon, + (self.randomized_loss, + self._initial_omega) = self.randomization.randomize(self.loss, + self.epsilon, perturb=perturb) self._randomized = True @@ -97,8 +81,8 @@ def solve(self): raise NotImplementedError('abstract method') -class gaussian_query(query): +class gaussian_query(query): useC = True """ @@ -118,7 +102,7 @@ def fit(self, perturb=None): # Private methods - def _setup_sampler(self, + def _setup_sampler(self, linear_part, offset, opt_linear, @@ -131,10 +115,10 @@ def _setup_sampler(self, if not np.all(A.dot(self.observed_opt_state) - b <= 0): raise ValueError('constraints not satisfied') - (cond_mean, - cond_cov, - cond_precision, - logdens_linear) = self._setup_implied_gaussian(opt_linear, + (cond_mean, + cond_cov, + cond_precision, + logdens_linear) = self._setup_implied_gaussian(opt_linear, opt_offset, dispersion) @@ -146,12 +130,13 @@ def log_density(logdens_linear, offset, cond_prec, opt, score): arg = opt + mean_term return - 0.5 * np.sum(arg * cond_prec.dot(arg.T).T, 1) - log_density = functools.partial(log_density, - logdens_linear, - opt_offset, + log_density = functools.partial(log_density, + logdens_linear, + opt_offset, cond_precision) - self.cond_mean, self.cond_cov = cond_mean, cond_cov + _, randomizer_prec = self.randomizer.cov_prec + self.cond_mean, self.cond_cov, self.randomizer_prec = cond_mean, cond_cov, randomizer_prec affine_con = constraints(A, b, @@ -163,17 +148,18 @@ def log_density(logdens_linear, offset, cond_prec, opt, score): self.observed_score_state, log_density, (logdens_linear, opt_offset), + self.randomizer_prec, selection_info=self.selection_variable, useC=self.useC) - def _setup_implied_gaussian(self, - opt_linear, + def _setup_implied_gaussian(self, + opt_linear, opt_offset, # optional dispersion parameter # for covariance of randomization dispersion=1): - _, prec = self.randomizer.cov_prec + _, prec = self.randomizer.cov_prec prec = prec / dispersion if np.asarray(prec).shape in [(), (0,)]: @@ -190,9 +176,9 @@ def _setup_implied_gaussian(self, return cond_mean, cond_cov, cond_precision, logdens_linear def summary(self, - observed_target, - target_cov, - target_score_cov, + observed_target, + target_cov, + target_score_cov, alternatives, opt_sample=None, target_sample=None, @@ -204,38 +190,27 @@ def summary(self, """ Produce p-values and confidence intervals for targets of model including selected features - Parameters ---------- - observed_target : ndarray Observed estimate of target. - target_cov : ndarray Estimated covaraince of target. - target_score_cov : ndarray Estimated covariance of target and score of randomized query. - alternatives : [str], optional Sequence of strings describing the alternatives, should be values of ['twosided', 'less', 'greater'] - parameter : np.array Hypothesized value for parameter -- defaults to 0. - level : float Confidence level. - ndraw : int (optional) Defaults to 1000. - burnin : int (optional) Defaults to 1000. - compute_intervals : bool Compute confidence intervals? - dispersion : float (optional) Use a known value for dispersion, or Pearson's X^2? 
""" @@ -246,7 +221,7 @@ def summary(self, if opt_sample is None: opt_sample, logW = self.sampler.sample(ndraw, burnin) else: - if len(opt_sample) == 1: # only a sample, so weights are 1s + if len(opt_sample) == 1: # only a sample, so weights are 1s opt_sample = opt_sample[0] logW = np.zeros(ndraw) else: @@ -272,17 +247,16 @@ def summary(self, else: pvalues = pivots - result = pd.DataFrame({'target':observed_target, - 'pvalue':pvalues}) + result = pd.DataFrame({'target': observed_target, + 'pvalue': pvalues}) if compute_intervals: - MLE = self.selective_MLE(observed_target, target_cov, target_score_cov)[0] MLE_intervals = np.asarray(MLE[['lower_confidence', 'upper_confidence']]) - intervals = self.sampler.confidence_intervals( + intervals = self.sampler.confidence_intervals( observed_target, target_cov, target_score_cov, @@ -291,8 +265,8 @@ def summary(self, initial_guess=MLE_intervals, level=level) - result.insert(2, 'lower_confidence', intervals[:,0]) - result.insert(3, 'upper_confidence', intervals[:,1]) + result.insert(2, 'lower_confidence', intervals[:, 0]) + result.insert(3, 'upper_confidence', intervals[:, 1]) if not np.all(parameter == 0): result.insert(4, 'pivot', pivots) @@ -301,33 +275,26 @@ def summary(self, return result def selective_MLE(self, - observed_target, - target_cov, - target_score_cov, + observed_target, + target_cov, + target_score_cov, level=0.9, - solve_args={'tol':1.e-12}): + solve_args={'tol': 1.e-12}): """ - Parameters ---------- - observed_target : ndarray Observed estimate of target. - target_cov : ndarray Estimated covaraince of target. - target_score_cov : ndarray Estimated covariance of target and score of randomized query. - level : float, optional Confidence level. - solve_args : dict, optional Arguments passed to solver. - """ - + return self.sampler.selective_MLE(observed_target, target_cov, target_score_cov, @@ -336,50 +303,43 @@ def selective_MLE(self, solve_args=solve_args) def posterior(self, - observed_target, - target_cov, - target_score_cov, + observed_target, + target_cov, + target_score_cov, prior=None, dispersion=None, - solve_args={'tol':1.e-12}): + solve_args={'tol': 1.e-12}): """ - Parameters ---------- - observed_target : ndarray Observed estimate of target. - target_cov : ndarray Estimated covaraince of target. - target_score_cov : ndarray Estimated covariance of target and score of randomized query. - prior : callable A callable object that takes a single argument `parameter` of the same shape as `observed_target` and returns (value of log prior, gradient of log prior) - dispersion : float, optional Dispersion parameter for log-likelihood. - solve_args : dict, optional Arguments passed to solver. - """ - + if dispersion is None: dispersion = 1 print('Using dispersion parameter 1...') - + if prior is None: Di = 1. / (200 * np.diag(target_cov)) + def prior(target_parameter): grad_prior = -target_parameter * Di - log_prior = -0.5 * np.sum(target_parameter**2 * Di) + log_prior = -0.5 * np.sum(target_parameter ** 2 * Di) return log_prior, grad_prior - + return posterior(self, observed_target, target_cov, @@ -396,26 +356,19 @@ def approximate_grid_inference(self, solve_args={'tol': 1.e-12}): """ - Parameters ---------- - observed_target : ndarray Observed estimate of target. - target_cov : ndarray Estimated covaraince of target. - target_score_cov : ndarray Estimated covariance of target and score of randomized query. 
- alternatives : [str], optional Sequence of strings describing the alternatives, should be values of ['twosided', 'less', 'greater'] - solve_args : dict, optional Arguments passed to solver. - """ G = approximate_grid_inference(self, @@ -425,8 +378,8 @@ def approximate_grid_inference(self, solve_args=solve_args) return G.summary(alternatives=alternatives) -class multiple_queries(object): +class multiple_queries(object): ''' Combine several queries of a given data through randomized algorithms. @@ -434,16 +387,12 @@ class multiple_queries(object): def __init__(self, objectives): ''' - Parameters ---------- - objectives : sequence A sequences of randomized objective functions. - Notes ----- - Each element of `objectives` must have a `setup_sampler` method that returns a description of the distribution of the @@ -454,10 +403,8 @@ def __init__(self, objectives): `form_covariances` to linearly decompose each score in terms of a target and an asymptotically independent piece. - Returns ------- - None ''' @@ -470,9 +417,9 @@ def fit(self): def summary(self, observed_target, - opt_sampling_info, # a sequence of (target_cov, score_cov) - # objects in theory all target_cov - # should be about the same... + opt_sampling_info, # a sequence of (target_cov, score_cov) + # objects in theory all target_cov + # should be about the same... alternatives=None, parameter=None, level=0.9, @@ -483,32 +430,23 @@ def summary(self, """ Produce p-values and confidence intervals for targets of model including selected features - Parameters ---------- - observed_target : ndarray Observed estimate of target. - alternatives : [str], optional Sequence of strings describing the alternatives, should be values of ['twosided', 'less', 'greater'] - parameter : np.array Hypothesized value for parameter -- defaults to 0. - level : float Confidence level. - ndraw : int (optional) Defaults to 1000. - burnin : int (optional) Defaults to 1000. - compute_intervals : bool Compute confidence intervals? - """ if parameter is None: @@ -525,10 +463,10 @@ def summary(self, if opt_sampling_info[i][0] is None or opt_sampling_info[i][1] is None: raise ValueError("did not input target and score covariance info") opt_sample, opt_logW = self.objectives[i].sampler.sample(ndraw, burnin) - self.opt_sampling_info.append((self.objectives[i].sampler, - opt_sample, + self.opt_sampling_info.append((self.objectives[i].sampler, + opt_sample, opt_logW, - opt_sampling_info[i][0], + opt_sampling_info[i][0], opt_sampling_info[i][1])) pivots = self.coefficient_pvalues(observed_target, @@ -547,10 +485,10 @@ def summary(self, intervals = self.confidence_intervals(observed_target, level) - result = pd.DataFrame({'target':observed_target, - 'pvalue':pvalues, - 'lower_confidence':intervals[:,0], - 'upper_confidence':intervals[:,1]}) + result = pd.DataFrame({'target': observed_target, + 'pvalue': pvalues, + 'lower_confidence': intervals[:, 0], + 'upper_confidence': intervals[:, 1]}) if not np.all(parameter == 0): result.insert(4, 'pivot', pivots) @@ -567,30 +505,23 @@ def coefficient_pvalues(self, ''' Construct selective p-values for each parameter of the target. - Parameters ---------- - observed_target : ndarray Observed estimate of target. - parameter : ndarray (optional) A vector of parameters with shape `self.shape` at which to evaluate p-values. Defaults to `np.zeros(self.shape)`. - sample_args : sequence Arguments to `self.sample` if sample is not found for a given objective. 
- alternatives : [str], optional Sequence of strings describing the alternatives, should be values of ['twosided', 'less', 'greater'] - Returns ------- pvalues : ndarray - ''' for i in range(len(self.objectives)): @@ -599,10 +530,10 @@ def coefficient_pvalues(self, self.opt_sampling_info[i][1] = _sample self.opt_sampling_info[i][2] = _logW - ndraw = self.opt_sampling_info[0][1].shape[0] # nsample for normal samples taken from the 1st objective + ndraw = self.opt_sampling_info[0][1].shape[0] # nsample for normal samples taken from the 1st objective - _intervals = optimization_intervals(self.opt_sampling_info, - observed_target, + _intervals = optimization_intervals(self.opt_sampling_info, + observed_target, ndraw) pvals = [] @@ -614,7 +545,6 @@ def coefficient_pvalues(self, return np.array(pvals) - def confidence_intervals(self, observed_target, sample_args=(), @@ -623,25 +553,19 @@ def confidence_intervals(self, ''' Construct selective confidence intervals for each parameter of the target. - Parameters ---------- - observed_target : ndarray Observed estimate of target. - sample_args : sequence Arguments to `self.sample` if sample is not found for a given objective. - level : float Confidence level. - Returns ------- limits : ndarray Confidence intervals for each target. - ''' for i in range(len(self.objectives)): @@ -650,10 +574,10 @@ def confidence_intervals(self, self.opt_sampling_info[i][1] = _sample self.opt_sampling_info[i][2] = _logW - ndraw = self.opt_sampling_info[0][1].shape[0] # nsample for normal samples taken from the 1st objective + ndraw = self.opt_sampling_info[0][1].shape[0] # nsample for normal samples taken from the 1st objective - _intervals = optimization_intervals(self.opt_sampling_info, - observed_target, + _intervals = optimization_intervals(self.opt_sampling_info, + observed_target, ndraw) limits = [] @@ -663,7 +587,7 @@ def confidence_intervals(self, keep[i] = 1. limits.append(_intervals.confidence_interval(keep, level=level)) - return np.array(limits) + return np.array(limits) class optimization_sampler(object): @@ -698,36 +622,27 @@ def hypothesis_test(self, using sampler with gradient map `self.gradient` and projection map `self.projection`. - Parameters ---------- - test_stat : callable Test statistic to evaluate on sample from selective distribution. - observed_value : float Observed value of test statistic. Used in p-value calculation. - sample_args : sequence Arguments to `self.sample` if sample is None. - sample : np.array (optional) If not None, assumed to be a sample of shape (-1,) + `self.shape` representing a sample of the target from parameters. Allows reuse of the same sample for construction of confidence intervals, hypothesis tests, etc. If not None, `ndraw, burnin, stepsize` are ignored. - parameter : np.float (optional) - alternative : ['greater', 'less', 'twosided'] What alternative to use. - Returns ------- - pvalue : float ''' @@ -767,59 +682,50 @@ def confidence_intervals(self, level=0.9, initial_guess=None): ''' - Parameters ---------- - + observed : np.float A vector of parameters with shape `self.shape`, representing coordinates of the target. - sample_args : sequence Arguments to `self.sample` if sample is None. - sample : np.array (optional) If not None, assumed to be a sample of shape (-1,) + `self.shape` representing a sample of the target from parameters `self.reference`. Allows reuse of the same sample for construction of confidence intervals, hypothesis tests, etc. - level : float (optional) Specify the confidence level. 
- initial_guess : np.float Initial guesses at upper and lower limits, optional. - Notes ----- - Construct selective confidence intervals for each parameter of the target. - Returns ------- - intervals : [(float, float)] List of confidence intervals. ''' if sample is None: sample, logW = self.sample(*sample_args) - sample = np.vstack([sample]*5) # why times 5? - logW = np.hstack([logW]*5) + sample = np.vstack([sample] * 5) # why times 5? + logW = np.hstack([logW] * 5) else: sample, logW = sample ndraw = sample.shape[0] - _intervals = optimization_intervals([(self, - sample, + _intervals = optimization_intervals([(self, + sample, logW, - target_cov, + target_cov, score_cov)], - observed_target, - ndraw, + observed_target, + ndraw, normal_sample=normal_sample) limits = [] @@ -848,35 +754,27 @@ def coefficient_pvalues(self, ''' Construct selective p-values for each parameter of the target. - Parameters ---------- - observed : np.float A vector of parameters with shape `self.shape`, representing coordinates of the target. - parameter : np.float (optional) A vector of parameters with shape `self.shape` at which to evaluate p-values. Defaults to `np.zeros(self.shape)`. - sample_args : sequence Arguments to `self.sample` if sample is None. - sample : np.array (optional) If not None, assumed to be a sample of shape (-1,) + `self.shape` representing a sample of the target from parameters `self.reference`. Allows reuse of the same sample for construction of confidence intervals, hypothesis tests, etc. - alternatives : list of ['greater', 'less', 'twosided'] What alternative to use. - Returns ------- pvalues : np.float - ''' if alternatives is None: @@ -891,21 +789,21 @@ def coefficient_pvalues(self, if parameter is None: parameter = np.zeros(observed_target.shape[0]) - _intervals = optimization_intervals([(self, - sample, + _intervals = optimization_intervals([(self, + sample, logW, - target_cov, + target_cov, score_cov)], - observed_target, - ndraw, + observed_target, + ndraw, normal_sample=normal_sample) pvals = [] for i in range(observed_target.shape[0]): keep = np.zeros_like(observed_target) keep[i] = 1. - pvals.append(_intervals.pivot(keep, - candidate=parameter[i], + pvals.append(_intervals.pivot(keep, + candidate=parameter[i], alternative=alternatives[i])) return np.array(pvals) @@ -916,14 +814,14 @@ def _reconstruct_score_from_target(self, if transform is not None: direction, nuisance = transform score_sample = (np.multiply.outer(target_sample, - direction) + + direction) + nuisance[None, :]) else: score_sample = target_sample return score_sample -class affine_gaussian_sampler(optimization_sampler): +class affine_gaussian_sampler(optimization_sampler): ''' Sample from an affine truncated Gaussian ''' @@ -933,42 +831,36 @@ def __init__(self, initial_point, observed_score_state, log_cond_density, - logdens_transform, # described how score enters log_density. + logdens_transform, # described how score enters log_density. + randomizer_prec, selection_info=None, useC=False): ''' Parameters ---------- - affine_con : `selection.constraints.affine.constraints` Affine constraints - initial_point : ndarray Feasible point for affine constraints. - observed_score_state : ndarray Observed score of convex loss (slightly modified). - Essentially (asymptotically) equivalent - to $\nabla \ell(\beta^*) + + Essentially (asymptotically) equivalent + to $\nabla \ell(\beta^*) + Q(\beta^*)\beta^*$ where $\beta^*$ is population minimizer. For linear regression, it is always $-X^Ty$. 
- log_cond_density : callable Density of optimization variables given score - logdens_transform : tuple Description of how conditional mean of optimization variables depends on score. - selection_info : optional Function of optimization variables that will be conditioned on. - useC : bool, optional Use python or C solver. - + ''' self.affine_con = affine_con @@ -982,6 +874,7 @@ def __init__(self, self._log_cond_density = log_cond_density self.logdens_transform = logdens_transform self.useC = useC + self.randomizer_prec = randomizer_prec def log_cond_density(self, opt_sample, @@ -990,9 +883,9 @@ def log_cond_density(self, if transform is not None: direction, nuisance = transform - return self._log_density_ray(0, # candidate - # has been added to - # target + return self._log_density_ray(0, # candidate + # has been added to + # target direction, nuisance, target_sample, @@ -1012,16 +905,12 @@ def sample(self, ndraw, burnin): using projected Langevin sampler with gradient map `self.gradient` and projection map `self.projection`. - Parameters ---------- - ndraw : int How long a chain to return? - burnin : int How many samples to discard? - ''' _sample = sample_from_constraints(self.affine_con, @@ -1030,63 +919,58 @@ def sample(self, ndraw, burnin): burnin=burnin) return _sample, np.zeros(_sample.shape[0]) - def selective_MLE(self, - observed_target, - target_cov, - target_score_cov, - # initial (observed) value of optimization variables -- + def selective_MLE(self, + observed_target, + target_cov, + target_score_cov, + # initial (observed) value of optimization variables -- # used as a feasible point. - # precise value used only for independent estimator - init_soln, - solve_args={'tol':1.e-12}, + # precise value used only for independent estimator + init_soln, + solve_args={'tol': 1.e-12}, level=0.9): """ Selective MLE based on approximation of CGF. - Parameters ---------- - observed_target : ndarray Observed estimate of target. - target_cov : ndarray Estimated covaraince of target. - target_score_cov : ndarray Estimated covariance of target and score of randomized query. - init_soln : ndarray Feasible point for optimization problem. - level : float, optional Confidence level. - solve_args : dict, optional Arguments passed to solver. - """ + score_offset = self.observed_score_state + self.logdens_transform[1] - return selective_MLE(observed_target, - target_cov, - target_score_cov, - init_soln, + return selective_MLE(observed_target, + target_cov, + target_score_cov, + init_soln, self.mean, self.covariance, self.logdens_transform[0], self.affine_con.linear_part, self.affine_con.offset, + self.randomizer_prec, + score_offset, solve_args=solve_args, level=level, useC=self.useC) - def reparam_map(self, - parameter_target, - observed_target, - target_cov, - target_score_cov, - init_soln, - solve_args={'tol':1.e-12}, + def reparam_map(self, + parameter_target, + observed_target, + target_cov, + target_score_cov, + init_soln, + solve_args={'tol': 1.e-12}, useC=True): prec_target = np.linalg.inv(target_cov) @@ -1107,19 +991,23 @@ def reparam_map(self, solver = _solve_barrier_affine_py val, soln, hess = solver(conjugate_arg, - prec_opt, # JT: I think this quadratic is wrong should involve target_cov and target_lin too? + prec_opt, + # JT: I think this quadratic is wrong should involve target_cov and target_lin too? 
init_soln, self.affine_con.linear_part, self.affine_con.offset, **solve_args) - + inter_map = target_cov.dot(target_lin.T.dot(prec_opt)) param_map = parameter_target + inter_map.dot(mean_param - soln) - log_normalizer_map = ((parameter_target.T.dot(prec_target + target_lin.T.dot(prec_opt).dot(target_lin)).dot(parameter_target))/2. - - parameter_target.T.dot(target_lin.T).dot(prec_opt.dot(soln)) - target_offset.T.dot(prec_opt).dot(target_offset)/2. - + val - (param_map.T.dot(prec_target).dot(param_map))/2.) + log_normalizer_map = ((parameter_target.T.dot(prec_target + target_lin.T.dot(prec_opt).dot(target_lin)).dot( + parameter_target)) / 2. + - parameter_target.T.dot(target_lin.T).dot(prec_opt.dot(soln)) - target_offset.T.dot( + prec_opt).dot(target_offset) / 2. + + val - (param_map.T.dot(prec_target).dot(param_map)) / 2.) - jacobian_map = (np.identity(ndim) + inter_map.dot(target_lin)) - inter_map.dot(hess).dot(prec_opt.dot(target_lin)) + jacobian_map = (np.identity(ndim) + inter_map.dot(target_lin)) - inter_map.dot(hess).dot( + prec_opt.dot(target_lin)) return param_map, log_normalizer_map, jacobian_map @@ -1132,24 +1020,24 @@ def _log_density_ray(self, # implicitly caching (opt_sample, gaussian_sample) ? - if (not hasattr(self, "_direction") or not - np.all(self._direction == direction)): + if (not hasattr(self, "_direction") or not + np.all(self._direction == direction)): logdens_lin, logdens_offset = self.logdens_transform if opt_sample.shape[1] == 1: prec = 1. / self.covariance[0, 0] - quadratic_term = logdens_lin.dot(direction)**2 * prec - arg = (logdens_lin.dot(nuisance + logdens_offset) + + quadratic_term = logdens_lin.dot(direction) ** 2 * prec + arg = (logdens_lin.dot(nuisance + logdens_offset) + logdens_lin.dot(direction) * gaussian_sample + - opt_sample[:,0]) + opt_sample[:, 0]) linear_term = logdens_lin.dot(direction) * prec * arg - constant_term = arg**2 * prec + constant_term = arg ** 2 * prec - self._cache = {'linear_term':linear_term, - 'quadratic_term':quadratic_term, - 'constant_term':constant_term} + self._cache = {'linear_term': linear_term, + 'quadratic_term': quadratic_term, + 'constant_term': constant_term} else: self._direction = direction.copy() @@ -1169,82 +1057,83 @@ def _log_density_ray(self, logdens_lin, logdens_offset = self.logdens_transform cov = self.covariance prec = np.linalg.inv(cov) - linear_part = logdens_lin.dot(direction) # A gamma + linear_part = logdens_lin.dot(direction) # A gamma if 1 in opt_sample.shape: - pass # stop3 what's this for? + pass # stop3 what's this for? 
cov = self.covariance quadratic_term = linear_part.T.dot(prec).dot(linear_part) arg1 = opt_sample.T - arg2 = logdens_lin.dot(np.multiply.outer(direction, gaussian_sample) + - (nuisance + logdens_offset)[:,None]) + arg2 = logdens_lin.dot(np.multiply.outer(direction, gaussian_sample) + + (nuisance + logdens_offset)[:, None]) arg = arg1 + arg2 linear_term = linear_part.T.dot(prec).dot(arg) constant_term = np.sum(prec.dot(arg) * arg, 0) - self._cache = {'linear_term':linear_term, - 'quadratic_term':quadratic_term, - 'constant_term':constant_term} - (linear_term, + self._cache = {'linear_term': linear_term, + 'quadratic_term': quadratic_term, + 'constant_term': constant_term} + (linear_term, quadratic_term, - constant_term) = (self._cache['linear_term'], + constant_term) = (self._cache['linear_term'], self._cache['quadratic_term'], self._cache['constant_term']) - return (-0.5 * candidate**2 * quadratic_term - - candidate * linear_term - 0.5 * constant_term) + return (-0.5 * candidate ** 2 * quadratic_term - + candidate * linear_term - 0.5 * constant_term) + class optimization_intervals(object): def __init__(self, - opt_sampling_info, # a sequence of - # (opt_sampler, - # opt_sample, - # opt_logweights, - # target_cov, - # score_cov) objects - # in theory all target_cov - # should be about the same... + opt_sampling_info, # a sequence of + # (opt_sampler, + # opt_sample, + # opt_logweights, + # target_cov, + # score_cov) objects + # in theory all target_cov + # should be about the same... observed, - nsample, # how large a normal sample + nsample, # how large a normal sample target_cov=None, normal_sample=None): - # not all opt_samples will be of the same size as nsample + # not all opt_samples will be of the same size as nsample # let's repeat them as necessary - + tiled_sampling_info = [] - for (opt_sampler, - opt_sample, + for (opt_sampler, + opt_sample, opt_logW, - t_cov, - t_score_cov) in opt_sampling_info: + t_cov, + t_score_cov) in opt_sampling_info: if opt_sample is not None: if opt_sample.shape[0] < nsample: if opt_sample.ndim == 1: - tiled_opt_sample = np.tile(opt_sample, - int(np.ceil(nsample / - opt_sample.shape[0])))[:nsample] + tiled_opt_sample = np.tile(opt_sample, + int(np.ceil(nsample / + opt_sample.shape[0])))[:nsample] tiled_opt_logW = np.tile(opt_logW, - int(np.ceil(nsample / + int(np.ceil(nsample / opt_logW.shape[0])))[:nsample] else: - tiled_opt_sample = np.tile(opt_sample, - (int(np.ceil(nsample / - opt_sample.shape[0])), 1))[:nsample] + tiled_opt_sample = np.tile(opt_sample, + (int(np.ceil(nsample / + opt_sample.shape[0])), 1))[:nsample] tiled_opt_logW = np.tile(opt_logW, - (int(np.ceil(nsample / + (int(np.ceil(nsample / opt_logW.shape[0])), 1))[:nsample] else: tiled_opt_sample = opt_sample[:nsample] tiled_opt_logW = opt_logW[:nsample] else: tiled_sample = None - tiled_sampling_info.append((opt_sampler, - tiled_opt_sample, + tiled_sampling_info.append((opt_sampler, + tiled_opt_sample, tiled_opt_logW, - t_cov, + t_cov, t_score_cov)) self.opt_sampling_info = tiled_sampling_info @@ -1252,14 +1141,14 @@ def __init__(self, for opt_sampler, opt_sample, opt_logW, _, _ in opt_sampling_info: self._logden += opt_sampler.log_cond_density( - opt_sample, - opt_sampler.observed_score_state, - transform=None) + opt_sample, + opt_sampler.observed_score_state, + transform=None) self._logden -= opt_logW if opt_sample.shape[0] < nsample: - self._logden = np.tile(self._logden, - int(np.ceil(nsample / - opt_sample.shape[0])))[:nsample] + self._logden = np.tile(self._logden, + 
int(np.ceil(nsample / + opt_sample.shape[0])))[:nsample] # this is our observed unpenalized estimator self.observed = observed.copy() @@ -1274,9 +1163,9 @@ def __init__(self, if normal_sample is None: self._normal_sample = np.random.multivariate_normal( - mean=np.zeros(self.target_cov.shape[0]), - cov=self.target_cov, - size=(nsample,)) + mean=np.zeros(self.target_cov.shape[0]), + cov=self.target_cov, + size=(nsample,)) else: self._normal_sample = normal_sample @@ -1303,12 +1192,11 @@ def pivot(self, nuisance = [] translate_dirs = [] - for (opt_sampler, - opt_sample, - _, - _, + for (opt_sampler, + opt_sample, + _, + _, target_score_cov) in self.opt_sampling_info: - cur_score_cov = linear_func.dot(target_score_cov) # cur_nuisance is in the view's score coordinates @@ -1316,10 +1204,10 @@ def pivot(self, nuisance.append(cur_nuisance) translate_dirs.append(cur_score_cov / target_cov) - weights = self._weights(sample_stat, # normal sample - candidate, # candidate value - nuisance, # nuisance sufficient stats for each view - translate_dirs) # points will be moved like sample * target_score_cov + weights = self._weights(sample_stat, # normal sample + candidate, # candidate value + nuisance, # nuisance sufficient stats for each view + translate_dirs) # points will be moved like sample * target_score_cov pivot = np.mean((sample_stat + candidate <= observed_stat) * weights) / np.mean(weights) @@ -1330,19 +1218,20 @@ def pivot(self, else: return 1 - pivot - def confidence_interval(self, - linear_func, - level=0.90, + def confidence_interval(self, + linear_func, + level=0.90, how_many_sd=20, guess=None): sample_stat = self._normal_sample.dot(linear_func) observed_stat = self.observed.dot(linear_func) - + def _rootU(gamma): return self.pivot(linear_func, observed_stat + gamma, alternative='less') - (1 - level) / 2. + def _rootL(gamma): return self.pivot(linear_func, observed_stat + gamma, @@ -1352,10 +1241,10 @@ def _rootL(gamma): grid_min, grid_max = -how_many_sd * np.std(sample_stat), how_many_sd * np.std(sample_stat) upper = bisect(_rootU, grid_min, grid_max) lower = bisect(_rootL, grid_min, grid_max) - + else: delta = 0.5 * (guess[1] - guess[0]) - + # find interval bracketing upper solution count = 0 while True: @@ -1383,14 +1272,14 @@ def _rootL(gamma): # Private methods - def _weights(self, + def _weights(self, stat_sample, candidate, nuisance, translate_dirs): # Here we should loop through the views - # and move the score of each view + # and move the score of each view # for each projected (through linear_func) normal sample # using the linear decomposition @@ -1421,21 +1310,18 @@ def _weights(self, return np.exp(_logratio) + def naive_confidence_intervals(diag_cov, observed, level=0.9): """ Compute naive Gaussian based confidence intervals for target. Parameters ---------- - diag_cov : diagonal of a covariance matrix - observed : np.float A vector of observed data of shape `target.shape` - alpha : float (optional) 1 - confidence level. 
- Returns ------- intervals : np.float @@ -1444,223 +1330,72 @@ def naive_confidence_intervals(diag_cov, observed, level=0.9): alpha = 1 - level diag_cov = np.asarray(diag_cov) p = diag_cov.shape[0] - quantile = - ndist.ppf(alpha/2) + quantile = - ndist.ppf(alpha / 2) LU = np.zeros((2, p)) for j in range(p): sigma = np.sqrt(diag_cov[j]) - LU[0,j] = observed[j] - sigma * quantile - LU[1,j] = observed[j] + sigma * quantile + LU[0, j] = observed[j] - sigma * quantile + LU[1, j] = observed[j] + sigma * quantile return LU.T + def naive_pvalues(diag_cov, observed, parameter): diag_cov = np.asarray(diag_cov) p = diag_cov.shape[0] pvalues = np.zeros(p) for j in range(p): sigma = np.sqrt(diag_cov[j]) - pval = ndist.cdf((observed[j] - parameter[j])/sigma) - pvalues[j] = 2 * min(pval, 1-pval) + pval = ndist.cdf((observed[j] - parameter[j]) / sigma) + pvalues[j] = 2 * min(pval, 1 - pval) return pvalues -# private function - -def _solve_barrier_affine_py(conjugate_arg, - precision, - feasible_point, - con_linear, - con_offset, - step=1, - nstep=1000, - min_its=200, - tol=1.e-10): - - scaling = np.sqrt(np.diag(con_linear.dot(precision).dot(con_linear.T))) - - if feasible_point is None: - feasible_point = 1. / scaling - - objective = lambda u: -u.T.dot(conjugate_arg) + u.T.dot(precision).dot(u)/2. \ - + np.log(1.+ 1./((con_offset - con_linear.dot(u))/ scaling)).sum() - grad = lambda u: -conjugate_arg + precision.dot(u) - con_linear.T.dot(1./(scaling + con_offset - con_linear.dot(u)) - - 1./(con_offset - con_linear.dot(u))) - barrier_hessian = lambda u: con_linear.T.dot(np.diag(-1./((scaling + con_offset-con_linear.dot(u))**2.) - + 1./((con_offset-con_linear.dot(u))**2.))).dot(con_linear) - - current = feasible_point - current_value = np.inf - - for itercount in range(nstep): - cur_grad = grad(current) - - # make sure proposal is feasible - - count = 0 - while True: - count += 1 - proposal = current - step * cur_grad - if np.all(con_offset-con_linear.dot(proposal) > 0): - break - step *= 0.5 - if count >= 40: - raise ValueError('not finding a feasible point') - - # make sure proposal is a descent - - count = 0 - while True: - count += 1 - proposal = current - step * cur_grad - proposed_value = objective(proposal) - if proposed_value <= current_value: - break - step *= 0.5 - if count >= 20: - if not (np.isnan(proposed_value) or np.isnan(current_value)): - break - else: - raise ValueError('value is NaN: %f, %f' % (proposed_value, current_value)) - - # stop if relative decrease is small - - if np.fabs(current_value - proposed_value) < tol * np.fabs(current_value) and itercount >= min_its: - current = proposal - current_value = proposed_value - break - - current = proposal - current_value = proposed_value - - if itercount % 4 == 0: - step *= 2 - - hess = np.linalg.inv(precision + barrier_hessian(current)) - return current_value, current, hess - -def _solve_barrier_nonneg(conjugate_arg, - precision, - feasible_point=None, - step=1, - nstep=1000, - tol=1.e-8): - - scaling = np.sqrt(np.diag(precision)) - - if feasible_point is None: - feasible_point = 1. / scaling - - objective = lambda u: -u.T.dot(conjugate_arg) + u.T.dot(precision).dot(u)/2. + np.log(1.+ 1./(u / scaling)).sum() - grad = lambda u: -conjugate_arg + precision.dot(u) + (1./(scaling + u) - 1./u) - barrier_hessian = lambda u: (-1./((scaling + u)**2.) 
+ 1./(u**2.)) - - current = feasible_point - current_value = np.inf - - for itercount in range(nstep): - cur_grad = grad(current) - - # make sure proposal is feasible - - count = 0 - while True: - count += 1 - proposal = current - step * cur_grad - if np.all(proposal > 0): - break - step *= 0.5 - if count >= 40: - raise ValueError('not finding a feasible point') - - # make sure proposal is a descent - - count = 0 - while True: - proposal = current - step * cur_grad - proposed_value = objective(proposal) - if proposed_value <= current_value: - break - step *= 0.5 - if count >= 20: - if not (np.isnan(proposed_value) or np.isnan(current_value)): - break - else: - raise ValueError('value is NaN: %f, %f' % (proposed_value, current_value)) - - # stop if relative decrease is small - - if np.fabs(current_value - proposed_value) < tol * np.fabs(current_value): - current = proposal - current_value = proposed_value - break - - current = proposal - current_value = proposed_value - - if itercount % 4 == 0: - step *= 2 - - hess = np.linalg.inv(precision + np.diag(barrier_hessian(current))) - return current_value, current, hess - - -def selective_MLE(observed_target, - target_cov, - target_score_cov, - init_soln, # initial (observed) value of - # optimization variables -- used as a - # feasible point. precise value used - # only for independent estimator +def selective_MLE(observed_target, + target_cov, + target_score_cov, + init_soln, # initial (observed) value of + # optimization variables -- used as a + # feasible point. precise value used + # only for independent estimator cond_mean, cond_cov, logdens_linear, linear_part, offset, - solve_args={'tol':1.e-12}, + randomizer_prec, + score_offset, + solve_args={'tol': 1.e-12}, level=0.9, useC=False): """ Selective MLE based on approximation of CGF. - Parameters ---------- - observed_target : ndarray Observed estimate of target. - target_cov : ndarray Estimated covaraince of target. - target_score_cov : ndarray Estimated covariance of target and score of randomized query. - init_soln : ndarray Feasible point for optimization problem. - cond_mean : ndarray Conditional mean of optimization variables given target. - cond_cov : ndarray Conditional covariance of optimization variables given target. - logdens_linear : ndarray Describes how conditional mean of optimization variables varies with target. - linear_part : ndarray Linear part of affine constraints: $\{o:Ao \leq b\}$ - offset : ndarray Offset part of affine constraints: $\{o:Ao \leq b\}$ - solve_args : dict, optional Arguments passed to solver. - level : float, optional Confidence level. - useC : bool, optional Use python or C solver. 
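    A sketch of the call and return signature as modified in this patch (the inputs
    would normally be assembled by `affine_gaussian_sampler.selective_MLE` above;
    the variable names here are placeholders):

    ```python
    result, observed_info_mean, log_ref = selective_MLE(observed_target,
                                                        target_cov,
                                                        target_score_cov,
                                                        init_soln,
                                                        cond_mean,
                                                        cond_cov,
                                                        logdens_linear,
                                                        linear_part,
                                                        offset,
                                                        randomizer_prec,
                                                        score_offset)
    # `result` is a pd.DataFrame with columns 'MLE', 'SE', 'Zvalue', 'pvalue',
    # 'lower_confidence', 'upper_confidence' and 'unbiased'.
    mle, se = result['MLE'], result['SE']
    ```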
- """ if np.asarray(observed_target).shape in [(), (0,)]: @@ -1669,25 +1404,37 @@ def selective_MLE(observed_target, observed_target = np.atleast_1d(observed_target) prec_target = np.linalg.inv(target_cov) + prec_opt = np.linalg.inv(cond_cov) + # target_lin determines how the conditional mean of optimization variables # vary with target # logdens_linear determines how the argument of the optimization density # depends on the score, not how the mean depends on score, hence the minus sign - target_lin = - logdens_linear.dot(target_score_cov.T.dot(prec_target)) - target_offset = cond_mean - target_lin.dot(observed_target) + target_linear = target_score_cov.T.dot(prec_target) + target_offset = score_offset - target_linear.dot(observed_target) - prec_opt = np.linalg.inv(cond_cov) + target_lin = - logdens_linear.dot(target_linear) + target_off = cond_mean - target_lin.dot(observed_target) + + if np.asarray(randomizer_prec).shape in [(), (0,)]: + _P = target_linear.T.dot(target_offset) * randomizer_prec + _prec = prec_target + (target_linear.T.dot(target_linear) * randomizer_prec) - target_lin.T.dot(prec_opt).dot( + target_lin) + else: + _P = target_linear.T.dot(randomizer_prec).dot(target_offset) + _prec = prec_target + (target_linear.T.dot(randomizer_prec).dot(target_linear)) - target_lin.T.dot( + prec_opt).dot(target_lin) + + C = target_cov.dot(_P - target_lin.T.dot(prec_opt).dot(target_off)) conjugate_arg = prec_opt.dot(cond_mean) - useC= False - print("useC", useC) if useC: solver = solve_barrier_affine_C else: solver = _solve_barrier_affine_py - + val, soln, hess = solver(conjugate_arg, prec_opt, init_soln, @@ -1695,35 +1442,43 @@ def selective_MLE(observed_target, offset, **solve_args) - final_estimator = observed_target + target_cov.dot(target_lin.T.dot(prec_opt.dot(cond_mean - soln))) - ind_unbiased_estimator = observed_target + target_cov.dot(target_lin.T.dot(prec_opt.dot(cond_mean - - init_soln))) + final_estimator = target_cov.dot(_prec).dot(observed_target) \ + + target_cov.dot(target_lin.T.dot(prec_opt.dot(cond_mean - soln))) + C + + unbiased_estimator = target_cov.dot(_prec).dot(observed_target) + target_cov.dot( + _P - target_lin.T.dot(prec_opt).dot(target_off)) L = target_lin.T.dot(prec_opt) - observed_info_natural = prec_target + L.dot(target_lin) - L.dot(hess.dot(L.T)) + observed_info_natural = _prec + L.dot(target_lin) - L.dot(hess.dot(L.T)) + observed_info_mean = target_cov.dot(observed_info_natural.dot(target_cov)) Z_scores = final_estimator / np.sqrt(np.diag(observed_info_mean)) + pvalues = ndist.cdf(Z_scores) + pvalues = 2 * np.minimum(pvalues, 1 - pvalues) alpha = 1 - level quantile = ndist.ppf(1 - alpha / 2.) - intervals = np.vstack([final_estimator - + + intervals = np.vstack([final_estimator - quantile * np.sqrt(np.diag(observed_info_mean)), - final_estimator + + final_estimator + quantile * np.sqrt(np.diag(observed_info_mean))]).T - log_ref = val + conjugate_arg.T.dot(cond_cov).dot(conjugate_arg)/2. - result = pd.DataFrame({'MLE':final_estimator, - 'SE':np.sqrt(np.diag(observed_info_mean)), - 'Zvalue':Z_scores, - 'pvalue':pvalues, - 'lower_confidence':intervals[:,0], - 'upper_confidence':intervals[:,1], - 'unbiased':ind_unbiased_estimator}) + log_ref = val + conjugate_arg.T.dot(cond_cov).dot(conjugate_arg) / 2. 
+ + result = pd.DataFrame({'MLE': final_estimator, + 'SE': np.sqrt(np.diag(observed_info_mean)), + 'Zvalue': Z_scores, + 'pvalue': pvalues, + 'lower_confidence': intervals[:, 0], + 'upper_confidence': intervals[:, 1], + 'unbiased': unbiased_estimator}) return result, observed_info_mean, log_ref + def normalizing_constant(target_parameter, observed_target, target_cov, @@ -1735,49 +1490,38 @@ def normalizing_constant(target_parameter, linear_part, offset, useC=False): - """ - Approximation of normalizing constant in affine constrained Gaussian. - Parameters ---------- - observed_target : ndarray Observed estimate of target. - target_cov : ndarray Estimated covaraince of target. - + target_score_cov : ndarray Estimated covariance of target and score of randomized query. - + init_soln : ndarray Feasible point for optimization problem. - cond_mean : ndarray Conditional mean of optimization variables given target. - cond_cov : ndarray Conditional covariance of optimization variables given target. - + logdens_linear : ndarray Describes how conditional mean of optimization variables varies with target. - + linear_part : ndarray Linear part of affine constraints: $\{o:Ao \leq b\}$ - offset : ndarray Offset part of affine constraints: $\{o:Ao \leq b\}$ - solve_args : dict, optional Arguments passed to solver. - level : float, optional Confidence level. - useC : bool, optional Use python or C solver. """ @@ -1799,25 +1543,25 @@ def normalizing_constant(target_parameter, nopt = cond_cov.shape[0] full_Q = np.zeros((ntarget + nopt, ntarget + nopt)) - full_Q[:ntarget][:,:ntarget] = (prec_target + target_linear.T.dot(cond_precision.dot(target_linear))) - full_Q[:ntarget][:,ntarget:] = -target_linear.dot(cond_precision) - full_Q[ntarget:][:,:ntarget] = (-target_linear.dot(cond_precision)).T - full_Q[ntarget:][:,ntarget:] = cond_precision + full_Q[:ntarget][:, :ntarget] = (prec_target + target_linear.T.dot(cond_precision.dot(target_linear))) + full_Q[:ntarget][:, ntarget:] = -target_linear.dot(cond_precision) + full_Q[ntarget:][:, :ntarget] = (-target_linear.dot(cond_precision)).T + full_Q[ntarget:][:, ntarget:] = cond_precision - linear_term = np.hstack([-prec_target.dot(target_parameter) + - corrected_mean.dot(cond_precision).dot(target_linear), - -cond_precision.dot(corrected_mean)]) + linear_term = np.hstack([-prec_target.dot(target_parameter) + + corrected_mean.dot(cond_precision).dot(target_linear), + -cond_precision.dot(corrected_mean)]) constant_term = 0.5 * (np.sum(target_parameter * prec_target.dot(target_parameter)) + np.sum(corrected_mean * cond_precision.dot(corrected_mean))) full_con_linear = np.zeros((linear_part.shape[0], ntarget + nopt)) - full_con_linear[:,ntarget:] = linear_part + full_con_linear[:, ntarget:] = linear_part full_feasible = np.zeros(ntarget + nopt) full_feasible[ntarget:] = feasible_point - solve_args={'tol':1.e-12} + solve_args = {'tol': 1.e-12} if useC: solver = solve_barrier_affine_C @@ -1825,34 +1569,33 @@ def normalizing_constant(target_parameter, solver = _solve_barrier_affine_py value, soln, hess = solver(-linear_term, - full_Q, - full_feasible, - full_con_linear, - offset, - **solve_args) - return (-value + 0.5 * np.sum(target_parameter * prec_target.dot(target_parameter)), - soln[:ntarget], - hess[:ntarget][:,:ntarget]) + full_Q, + full_feasible, + full_con_linear, + offset, + **solve_args) + return (-value + 0.5 * np.sum(target_parameter * prec_target.dot(target_parameter)), + soln[:ntarget], + hess[:ntarget][:, :ntarget]) def _bisect(f, lb, ub, min_iter=20, 
max_iter=100, tol=1.e-3): - while True: sign_l = np.sign(f(lb)) sign_u = np.sign(f(ub)) mid = 0.5 * (lb + ub) f_mid = f(mid) if sign_l == 1: - if f_mid > 0: # we should move closer to upper + if f_mid > 0: # we should move closer to upper lb = mid else: ub = mid else: - if f_mid > 0: # we should move closer to lower + if f_mid > 0: # we should move closer to lower ub = mid else: lb = mid - + if np.fabs(f_mid) < tol: break - return mid + return mid \ No newline at end of file diff --git a/selectinf/randomized/tests/test_selective_MLE_high.py b/selectinf/randomized/tests/test_selective_MLE_high.py index 17ecb0423..e846f60e3 100644 --- a/selectinf/randomized/tests/test_selective_MLE_high.py +++ b/selectinf/randomized/tests/test_selective_MLE_high.py @@ -4,12 +4,14 @@ from selectinf.randomized.lasso import lasso, full_targets, selected_targets, debiased_targets from selectinf.tests.instance import gaussian_instance -def test_full_targets(n=200, - p=1000, - signal_fac=0.5, - s=5, sigma=3, - rho=0.4, - randomizer_scale=0.5, + +def test_full_targets(n=200, + p=1000, + signal_fac=0.5, + s=5, + sigma=3, + rho=0.4, + randomizer_scale=0.7, full_dispersion=False): """ Compare to R randomized lasso @@ -22,7 +24,7 @@ def test_full_targets(n=200, p=p, signal=signal, s=s, - equicorrelated=False, + equicorrelated=True, rho=rho, sigma=sigma, random_signs=True)[:3] @@ -51,7 +53,7 @@ def test_full_targets(n=200, else: dispersion = None - if n>p: + if n > p: (observed_target, cov_target, cov_target_score, @@ -81,13 +83,13 @@ def test_full_targets(n=200, return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0], coverage, intervals -def test_selected_targets(n=2000, - p=200, - signal_fac=1., - s=5, - sigma=3, - rho=0.4, - randomizer_scale=1, +def test_selected_targets(n=2000, + p=200, + signal_fac=1.2, + s=5, + sigma=2, + rho=0.7, + randomizer_scale=1., full_dispersion=True): """ Compare to R randomized lasso @@ -101,7 +103,7 @@ def test_selected_targets(n=2000, p=p, signal=signal, s=s, - equicorrelated=False, + equicorrelated=True, rho=rho, sigma=sigma, random_signs=True)[:3] @@ -113,11 +115,12 @@ def test_selected_targets(n=2000, n, p = X.shape sigma_ = np.std(Y) - W = np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ + W = 0.8 * np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ conv = const(X, Y, W, + ridge_term=0., randomizer_scale=randomizer_scale * sigma_) signs = conv.fit() @@ -134,7 +137,7 @@ def test_selected_targets(n=2000, cov_target_score, alternatives) = selected_targets(conv.loglike, conv._W, - nonzero, + nonzero, dispersion=dispersion) result = conv.selective_MLE(observed_target, @@ -143,18 +146,17 @@ def test_selected_targets(n=2000, estimate = result['MLE'] pval = result['pvalue'] intervals = np.asarray(result[['lower_confidence', 'upper_confidence']]) - + beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) coverage = (beta_target > intervals[:, 0]) * (beta_target < intervals[:, 1]) - print("observed_opt_state ", conv.observed_opt_state) # print("check ", np.asarray(result['MLE']), np.asarray(result['unbiased'])) return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0], coverage, intervals -def test_instance(): +def test_instance(): n, p, s = 500, 100, 5 X = np.random.standard_normal((n, p)) beta = np.zeros(p) @@ -169,6 +171,7 @@ def test_instance(): M = E.copy() M[-3:] = 1 + print("check ", M) dispersion = np.linalg.norm(Y - X[:, M].dot(np.linalg.pinv(X[:, M]).dot(Y))) ** 2 / (n - M.sum()) (observed_target, cov_target, @@ -191,10 +194,11 @@ def test_instance(): coverage = 
(beta_target > intervals[:, 0]) * (beta_target < intervals[:, 1]) print("observed_opt_state ", L.observed_opt_state) - #print("check ", np.asarray(result['MLE']), np.asarray(result['unbiased'])) + # print("check ", np.asarray(result['MLE']), np.asarray(result['unbiased'])) return coverage + # def main(nsim=500): # # cover = [] @@ -279,7 +283,7 @@ def main(nsim=500, full=False): P0, PA, cover, length_int = [], [], [], [] from statsmodels.distributions import ECDF - n, p, s = 500, 100, 5 + n, p, s = 500, 100, 0 for i in range(nsim): if full: @@ -291,8 +295,7 @@ def main(nsim=500, full=False): avg_length = intervals[:, 1] - intervals[:, 0] else: full_dispersion = True - p0, pA, cover_, intervals = test_selected_targets_disperse(n=n, p=p, s=int(p/2), - full_dispersion=full_dispersion) + p0, pA, cover_, intervals = test_selected_targets(n=n, p=p, s=s, full_dispersion=full_dispersion) avg_length = intervals[:, 1] - intervals[:, 0] cover.extend(cover_) @@ -303,5 +306,6 @@ def main(nsim=500, full=False): # np.mean(avg_length), 'null pvalue + power + length') print("coverage and lengths ", np.mean(cover), np.mean(avg_length)) + if __name__ == "__main__": - main(nsim=100) + main(nsim=50) \ No newline at end of file From 24ab71c1fbc1213a8f85212b138d364faa58c0f3 Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Mon, 7 Jun 2021 12:23:17 -0400 Subject: [PATCH 092/187] updated posterior inference --- selectinf/randomized/posterior_inference.py | 114 ++++++++++--------- selectinf/randomized/tests/test_posterior.py | 78 ++++++++----- 2 files changed, 109 insertions(+), 83 deletions(-) diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index 403a5a1f0..ef2d184a5 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -4,31 +4,25 @@ from scipy.stats import norm as ndist, invgamma from scipy.linalg import fractional_matrix_power -from .selective_MLE_utils import solve_barrier_affine as solve_barrier_affine_C +from ..algorithms.barrier_affine import solve_barrier_affine_py -class posterior(object): +class posterior(object): """ Parameters ---------- - observed_target : ndarray Observed estimate of target. - cov_target : ndarray Estimated covariance of target. - cov_target_score : ndarray Estimated covariance of target and score of randomized query. - prior : callable A callable object that takes a single argument `parameter` of the same shape as `observed_target` and returns (value of log prior, gradient of log prior) - dispersion : float, optional - A dispersion parameter for likelihood. - + A dispersion parameter for likelihood. solve_args : dict Arguments passed to solver of affine barrier problem. """ @@ -40,29 +34,34 @@ def __init__(self, cov_target_score, prior, dispersion=1, - solve_args={'tol':1.e-12}): + solve_args={'tol': 1.e-12}): self.solve_args = solve_args - + linear_part = query.sampler.affine_con.linear_part offset = query.sampler.affine_con.offset logdens_linear = query.sampler.logdens_transform[0] + _, randomizer_prec = query.randomizer.cov_prec + score_offset = query.observed_score_state + query.sampler.logdens_transform[1] result, self.inverse_info, log_ref = query.selective_MLE(observed_target, cov_target, cov_target_score) - + ### Note for an informative prior we might want to change this... 
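        # For instance, an informative Gaussian prior in the form expected by `prior`
        # (a sketch along the lines of test_flexible_prior2 below; the prior variance
        # here is arbitrary) would be:
        #
        #     prior_var = 0.05 ** 2
        #     def prior(target_parameter):
        #         grad_prior = -target_parameter / prior_var
        #         log_prior = -np.linalg.norm(target_parameter) ** 2 / (2. * prior_var)
        #         return log_prior, grad_prior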
- + self.ntarget = cov_target.shape[0] self.nopt = query.cond_cov.shape[0] self.cond_precision = np.linalg.inv(query.cond_cov) + self.cov_target = cov_target self.prec_target = np.linalg.inv(cov_target) self.observed_target = observed_target self.cov_target_score = cov_target_score self.logdens_linear = logdens_linear + self.randomizer_prec = randomizer_prec + self.score_offset = score_offset self.feasible_point = query.observed_opt_state self.cond_mean = query.cond_mean @@ -82,29 +81,24 @@ def log_posterior(self, sigma=1): """ - Parameters ---------- - target_parameter : ndarray Value of parameter at which to evaluate posterior and its gradient. - sigma : ndarray Noise standard deviation. - """ - sigmasq = sigma**2 - mean_marginal = self.linear_coef.dot(target_parameter) + self.offset_coef + sigmasq = sigma ** 2 + + target = self.S.dot(target_parameter) + self.r + + mean_marginal = self.linear_coef.dot(target) + self.offset_coef prec_marginal = self.prec_marginal conjugate_marginal = prec_marginal.dot(mean_marginal) - useC = True - if useC: - solver = solve_barrier_affine_C - else: - solver = _solve_barrier_affine_py + solver = solve_barrier_affine_py val, soln, hess = solver(conjugate_marginal, prec_marginal, @@ -113,19 +107,18 @@ def log_posterior(self, self.offset, **self.solve_args) - log_normalizer = -val - mean_marginal.T.dot(prec_marginal).dot(mean_marginal)/2. + log_normalizer = -val - mean_marginal.T.dot(prec_marginal).dot(mean_marginal) / 2. - log_lik = -(((self.observed_target - target_parameter).T.dot(self.prec_target).dot(self.observed_target - target_parameter)) / 2. - - log_normalizer) + log_lik = -(((self.observed_target - target).T.dot(self._prec).dot( + self.observed_target - target)) / 2. - log_normalizer) - grad_lik = (self.prec_target.dot(self.observed_target) - - self.prec_target.dot(target_parameter) \ - - self.linear_coef.T.dot(prec_marginal.dot(soln)- conjugate_marginal)) + grad_lik = self.S.T.dot(self._prec.dot(self.observed_target) - self._prec.dot(target) - self.linear_coef.T.dot( + prec_marginal.dot(soln) - conjugate_marginal)) log_prior, grad_prior = self.prior(target_parameter) return (self.dispersion * (log_lik - self.log_ref) / sigmasq + log_prior, - self.dispersion * grad_lik/sigmasq + grad_prior) + self.dispersion * grad_lik / sigmasq + grad_prior) ### Private method @@ -136,25 +129,37 @@ def _set_marginal_parameters(self): of randomization as well how to compute implied mean as a function of the true parameters. 
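        Concretely, in the notation used by `log_posterior`: the target is
        reparametrized as `S.dot(target_parameter) + r`, the implied mean of the
        optimization variables at that value is
        `linear_coef.dot(S.dot(target_parameter) + r) + offset_coef`, and
        `prec_marginal` is the corresponding marginal precision used for the
        constrained (truncated) Gaussian.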
""" - target_linear = -self.logdens_linear.dot(self.cov_target_score.T.dot(self.prec_target)) - implied_precision = np.zeros((self.ntarget + self.nopt, self.ntarget + self.nopt)) - implied_precision[:self.ntarget][:,:self.ntarget] = (self.prec_target + - target_linear.T.dot(self.cond_precision.dot(target_linear))) - implied_precision[:self.ntarget][:,self.ntarget:] = -target_linear.T.dot(self.cond_precision) - implied_precision[self.ntarget:][:,:self.ntarget] = (-target_linear.T.dot(self.cond_precision)).T - implied_precision[self.ntarget:][:,self.ntarget:] = self.cond_precision + target_linear = self.cov_target_score.T.dot(self.prec_target) + target_offset = self.score_offset - target_linear.dot(self.observed_target) + + target_lin = -self.logdens_linear.dot(target_linear) + target_off = self.cond_mean - target_lin.dot(self.observed_target) + + self.linear_coef = target_lin + self.offset_coef = self.cond_mean - target_lin.dot(self.observed_target) + + if np.asarray(self.randomizer_prec).shape in [(), (0,)]: + _prec = self.prec_target + (target_linear.T.dot(target_linear) * self.randomizer_prec) \ + - target_lin.T.dot(self.cond_precision).dot(target_lin) + _P = target_linear.T.dot(target_offset) * self.randomizer_prec + else: + _prec = self.prec_target + (target_linear.T.dot(self.randomizer_prec).dot(target_linear)) \ + - target_lin.T.dot(self.cond_precision).dot(target_lin) + _P = target_linear.T.dot(self.randomizer_prec).dot(target_offset) + + _Q = np.linalg.inv(_prec + target_lin.T.dot(self.cond_precision).dot(target_lin)) + self.prec_marginal = self.cond_precision - self.cond_precision.dot(target_lin).dot(_Q).dot(target_lin.T).dot( + self.cond_precision) - implied_cov = np.linalg.inv(implied_precision) - self.linear_coef = implied_cov[self.ntarget:][:,:self.ntarget].dot(self.prec_target) + r = np.linalg.inv(_prec).dot(target_lin.T.dot(self.cond_precision).dot(target_off) - _P) + S = np.linalg.inv(_prec).dot(self.prec_target) - target_offset = self.cond_mean - target_linear.dot(self.observed_target) - M = implied_cov[self.ntarget:][:,self.ntarget:].dot(self.cond_precision.dot(target_offset)) - N = -target_linear.T.dot(self.cond_precision).dot(target_offset) - self.offset_coef = implied_cov[self.ntarget:][:,:self.ntarget].dot(N) + M + self.r = r + self.S = S + # print("check parameters for selected+lasso ", np.allclose(np.diag(S), np.ones(S.shape[0])), np.allclose(r, np.zeros(r.shape[0]))) + self._prec = _prec - self.cov_marginal = implied_cov[self.ntarget:][:,self.ntarget:] - self.prec_marginal = np.linalg.inv(self.cov_marginal) ### sampling methods @@ -163,7 +168,6 @@ def langevin_sampler(selective_posterior, nburnin=100, proposal_scale=None, step=1.): - state = selective_posterior.initial_estimate stepsize = 1. / (step * selective_posterior.ntarget) @@ -180,20 +184,20 @@ def langevin_sampler(selective_posterior, for i, sample in enumerate(sampler): sampler.scaling = np.sqrt(selective_posterior.dispersion) - samples[i,:] = sample.copy() + samples[i, :] = sample.copy() if i == nsample - 1: break return samples[nburnin:, :] + def gibbs_sampler(selective_posterior, nsample=2000, nburnin=100, proposal_scale=None, step=1.): - state = selective_posterior.initial_estimate - stepsize = 1./(step*selective_posterior.ntarget) + stepsize = 1. 
/ (step * selective_posterior.ntarget) if proposal_scale is None: proposal_scale = selective_posterior.inverse_info @@ -207,20 +211,20 @@ def gibbs_sampler(selective_posterior, scale_samples = np.zeros(nsample) scale_update = np.sqrt(selective_posterior.dispersion) for i in range(nsample): - sample = sampler.__next__() samples[i, :] = sample scale_update_sq = invgamma.rvs(a=(0.1 + - selective_posterior.ntarget + - selective_posterior.ntarget/2), - scale=0.1-((scale_update**2)*sampler.posterior_[0]), + selective_posterior.ntarget + + selective_posterior.ntarget / 2), + scale=0.1 - ((scale_update ** 2) * sampler.posterior_[0]), size=1) scale_samples[i] = np.sqrt(scale_update_sq) sampler.scaling = np.sqrt(scale_update_sq) return samples[nburnin:, :], scale_samples[nburnin:] + class langevin(object): def __init__(self, @@ -254,7 +258,7 @@ def __next__(self): while True: self.posterior_ = self.gradient_map(self.state, self.scaling) candidate = (self.state + self.stepsize * self.proposal_scale.dot(self.posterior_[1]) - + np.sqrt(2.)* (self.proposal_sqrt.dot(self._noise.rvs(self._shape))) * self._sqrt_step) + + np.sqrt(2.) * (self.proposal_sqrt.dot(self._noise.rvs(self._shape))) * self._sqrt_step) if not np.all(np.isfinite(self.gradient_map(candidate, self.scaling)[1])): self.stepsize *= 0.5 diff --git a/selectinf/randomized/tests/test_posterior.py b/selectinf/randomized/tests/test_posterior.py index c9e3fc118..1b369c351 100644 --- a/selectinf/randomized/tests/test_posterior.py +++ b/selectinf/randomized/tests/test_posterior.py @@ -7,6 +7,7 @@ from ..posterior_inference import (langevin_sampler, gibbs_sampler) + def test_Langevin(n=500, p=100, signal_fac=1., @@ -16,7 +17,6 @@ def test_Langevin(n=500, randomizer_scale=1., nsample=1500, nburnin=100): - inst, const = gaussian_instance, lasso.gaussian signal = np.sqrt(signal_fac * 2 * np.log(p)) @@ -39,6 +39,7 @@ def test_Langevin(n=500, conv = const(X, Y, W, + ridge_term=0., randomizer_scale=randomizer_scale * dispersion) signs = conv.fit() @@ -74,12 +75,32 @@ def test_Langevin(n=500, return np.mean(coverage), np.mean(length) -def test_instance(nsample=100, nburnin=50): +def test_coverage(nsim=100): + cov, len = 0., 0. + + for i in range(nsim): + cov_, len_ = test_Langevin(n=500, + p=100, + signal_fac=1., + s=5, + sigma=3., + rho=0.2, + randomizer_scale=1., + nsample=1500, + nburnin=100) + + cov += cov_ + len += len_ + + print("coverage and lengths ", i, cov / (i + 1.), len / (i + 1.)) + + +def test_instance(nsample=100, nburnin=50): n, p, s = 500, 100, 5 X = np.random.standard_normal((n, p)) beta = np.zeros(p) - #beta[:s] = np.sqrt(2 * np.log(p) / n) + # beta[:s] = np.sqrt(2 * np.log(p) / n) Y = X.dot(beta) + np.random.standard_normal(n) scale_ = np.std(Y) @@ -115,7 +136,6 @@ def test_instance(nsample=100, nburnin=50): lci = np.percentile(samples, 5, axis=0) uci = np.percentile(samples, 95, axis=0) - beta_target = np.linalg.pinv(X[:, M]).dot(X.dot(beta)) coverage = (lci < beta_target) * (uci > beta_target) length = uci - lci @@ -124,12 +144,11 @@ def test_instance(nsample=100, nburnin=50): def test_flexible_prior1(nsample=100, nburnin=50): - np.random.seed(0) n, p, s = 500, 100, 5 X = np.random.standard_normal((n, p)) beta = np.zeros(p) - #beta[:s] = np.sqrt(2 * np.log(p) / n) + # beta[:s] = np.sqrt(2 * np.log(p) / n) Y = X.dot(beta) + np.random.standard_normal(n) scale_ = np.std(Y) @@ -150,9 +169,10 @@ def test_flexible_prior1(nsample=100, nburnin=50): dispersion=dispersion) Di = 1. 
/ (200 * np.diag(cov_target)) + def prior(target_parameter): grad_prior = -target_parameter * Di - log_prior = -np.sum(target_parameter**2 * Di) + log_prior = -np.sum(target_parameter ** 2 * Di) return log_prior, grad_prior seed_state = np.random.get_state() @@ -181,14 +201,13 @@ def prior(target_parameter): np.testing.assert_equal(Z1, Z2) np.testing.assert_equal(W1, W2) np.testing.assert_allclose(samples1, samples2, rtol=1.e-3) - -def test_flexible_prior2(nsample=1000, nburnin=50): +def test_flexible_prior2(nsample=1000, nburnin=50): n, p, s = 500, 100, 5 X = np.random.standard_normal((n, p)) beta = np.zeros(p) - #beta[:s] = np.sqrt(2 * np.log(p) / n) + # beta[:s] = np.sqrt(2 * np.log(p) / n) Y = X.dot(beta) + np.random.standard_normal(n) scale_ = np.std(Y) @@ -208,10 +227,11 @@ def test_flexible_prior2(nsample=1000, nburnin=50): M, dispersion=dispersion) - prior_var = 0.05**2 + prior_var = 0.05 ** 2 + def prior(target_parameter): grad_prior = -target_parameter / prior_var - log_prior = -np.linalg.norm(target_parameter)**2 /(2. * prior_var) + log_prior = -np.linalg.norm(target_parameter) ** 2 / (2. * prior_var) return log_prior, grad_prior posterior_inf = L.posterior(observed_target, @@ -220,19 +240,19 @@ def prior(target_parameter): dispersion=dispersion, prior=prior) adaptive_proposal = np.linalg.inv(np.linalg.inv(posterior_inf.inverse_info) + - np.identity(posterior_inf.inverse_info.shape[0]) / 0.05**2) + np.identity(posterior_inf.inverse_info.shape[0]) / 0.05 ** 2) samples = langevin_sampler(posterior_inf, nsample=nsample, proposal_scale=adaptive_proposal, nburnin=nburnin) return samples - + + def test_hiv_data(nsample=10000, nburnin=500, level=0.90, split_proportion=0.50, - seedn = 1): - + seedn=1): np.random.seed(seedn) alpha = (1 - level) / 2 @@ -242,7 +262,7 @@ def test_hiv_data(nsample=10000, Y *= 15 n, p = X.shape X /= np.sqrt(n) - + ols_fit = np.linalg.pinv(X).dot(Y) _sigma = np.linalg.norm(Y - X.dot(ols_fit)) / np.sqrt(n - p - 1) @@ -272,7 +292,7 @@ def test_hiv_data(nsample=10000, cov_target, cov_target_score, level=level, - solve_args={'tol':1.e-12})[:2] + solve_args={'tol': 1.e-12})[:2] approx_inf = conv.approximate_grid_inference(observed_target, cov_target, @@ -288,15 +308,15 @@ def test_hiv_data(nsample=10000, nburnin=nburnin, step=1.) 
- lower_langevin = np.percentile(samples_langevin, int(alpha*100), axis=0) - upper_langevin = np.percentile(samples_langevin, int((1-alpha)*100), axis=0) + lower_langevin = np.percentile(samples_langevin, int(alpha * 100), axis=0) + upper_langevin = np.percentile(samples_langevin, int((1 - alpha) * 100), axis=0) samples_gibbs, scale_gibbs = gibbs_sampler(posterior_inf, nsample=nsample, nburnin=nburnin) - lower_gibbs = np.percentile(samples_gibbs, int(alpha* 100), axis=0) - upper_gibbs = np.percentile(samples_gibbs, int((1-alpha)*100), axis=0) + lower_gibbs = np.percentile(samples_gibbs, int(alpha * 100), axis=0) + upper_gibbs = np.percentile(samples_gibbs, int((1 - alpha) * 100), axis=0) naive_est = np.linalg.pinv(X[:, nonzero]).dot(Y) naive_cov = dispersion * np.linalg.inv(X[:, nonzero].T.dot(X[:, nonzero])) @@ -313,16 +333,16 @@ def test_hiv_data(nsample=10000, print("lengths: adjusted intervals Langevin, Gibbs, MLE1, MLE2, approx ", np.mean(upper_langevin - lower_langevin), np.mean(upper_gibbs - lower_gibbs), - np.mean((2*Z_quantile)*np.sqrt(np.diag(posterior_inf.inverse_info))), + np.mean((2 * Z_quantile) * np.sqrt(np.diag(posterior_inf.inverse_info))), np.mean(mle['upper_confidence'] - mle['lower_confidence']), np.mean(approx_inf['upper_confidence'] - approx_inf['lower_confidence']) - ) + ) - print("lengths: naive intervals ", np.mean(naive_intervals[:,1]-naive_intervals[:,0])) + print("lengths: naive intervals ", np.mean(naive_intervals[:, 1] - naive_intervals[:, 0])) print("lengths: split intervals ", np.mean(split_intervals[:, 1] - split_intervals[:, 0])) - scale_interval = np.percentile(scale_gibbs, [alpha*100, (1-alpha)*100]) + scale_interval = np.percentile(scale_gibbs, [alpha * 100, (1 - alpha) * 100]) output = pd.DataFrame({'Langevin_lower_credible': lower_langevin, 'Langevin_upper_credible': upper_langevin, 'Gibbs_lower_credible': lower_gibbs, @@ -331,7 +351,7 @@ def test_hiv_data(nsample=10000, 'MLE_upper_confidence': mle['upper_confidence'], 'approx_lower_confidence': approx_inf['lower_confidence'], 'approx_upper_confidence': approx_inf['upper_confidence'], - 'Split_lower_confidence': split_intervals[:,0], + 'Split_lower_confidence': split_intervals[:, 0], 'Split_upper_confidence': split_intervals[:, 1], 'Naive_lower_confidence': naive_intervals[:, 0], 'Naive_upper_confidence': naive_intervals[:, 1] @@ -339,7 +359,9 @@ def test_hiv_data(nsample=10000, return output, scale_interval, _sigma + if __name__ == "__main__": - test_hiv_data(split_proportion=0.50) + # test_hiv_data(split_proportion=0.50) + test_coverage(nsim=100) From 7062cd31d76f911ce5fb61a08ce0bcb0e1b1768e Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Mon, 7 Jun 2021 12:27:02 -0400 Subject: [PATCH 093/187] updates to approx_reference --- selectinf/randomized/approx_reference.py | 135 +++++++++++------- .../randomized/tests/test_approx_reference.py | 57 +++----- 2 files changed, 107 insertions(+), 85 deletions(-) diff --git a/selectinf/randomized/approx_reference.py b/selectinf/randomized/approx_reference.py index 0041cccb7..a706b6789 100644 --- a/selectinf/randomized/approx_reference.py +++ b/selectinf/randomized/approx_reference.py @@ -3,8 +3,9 @@ import numpy as np, pandas as pd from scipy.interpolate import interp1d -from .selective_MLE_utils import solve_barrier_affine as solve_barrier_affine_C from ..distributions.discrete_family import discrete_family +from ..algorithms.barrier_affine import solve_barrier_affine_py + class approximate_grid_inference(object): @@ -13,31 +14,24 @@ def __init__(self, 
observed_target, target_cov, target_score_cov, - solve_args={'tol':1.e-12}): + solve_args={'tol': 1.e-12}): """ Produce p-values and confidence intervals for targets of model including selected features - Parameters ---------- - query : `gaussian_query` A Gaussian query which has information to describe implied Gaussian. - observed_target : ndarray Observed estimate of target. - target_cov : ndarray Estimated covaraince of target. - target_score_cov : ndarray Estimated covariance of target and score of randomized query. - solve_args : dict, optional Arguments passed to solver. - """ self.solve_args = solve_args @@ -46,8 +40,7 @@ def __init__(self, target_cov, target_score_cov, solve_args=solve_args)[:2] - mle = result['MLE'] - + self.linear_part = query.sampler.affine_con.linear_part self.offset = query.sampler.affine_con.offset @@ -62,15 +55,20 @@ def __init__(self, self.init_soln = query.observed_opt_state + self.randomizer_prec = query.sampler.randomizer_prec + self.score_offset = query.observed_score_state + query.sampler.logdens_transform[1] + self.ntarget = ntarget = target_cov.shape[0] _scale = 4 * np.sqrt(np.diag(inverse_info)) ngrid = 60 self.stat_grid = np.zeros((ntarget, ngrid)) for j in range(ntarget): - self.stat_grid[j,:] = np.linspace(observed_target[j] - 1.5*_scale[j], - observed_target[j] + 1.5*_scale[j], - num=ngrid) + self.stat_grid[j, :] = np.linspace(observed_target[j] - 1.5 * _scale[j], + observed_target[j] + 1.5 * _scale[j], + num=ngrid) + + self.opt_linear = query.opt_linear def summary(self, alternatives=None, @@ -79,20 +77,15 @@ def summary(self, """ Produce p-values and confidence intervals for targets of model including selected features - Parameters ---------- - alternatives : [str], optional Sequence of strings describing the alternatives, should be values of ['twosided', 'less', 'greater'] - parameter : np.array Hypothesized value for parameter -- defaults to 0. - level : float Confidence level. - """ if parameter is not None: @@ -102,13 +95,13 @@ def summary(self, pivots = None pvalues = self._approx_pivots(np.zeros_like(self.observed_target), - alternatives=alternatives) + alternatives=alternatives) lower, upper = self._approx_intervals(level=level) - result = pd.DataFrame({'target':self.observed_target, - 'pvalue':pvalues, - 'lower_confidence':lower, - 'upper_confidence':upper}) + result = pd.DataFrame({'target': self.observed_target, + 'pvalue': pvalues, + 'lower_confidence': lower, + 'upper_confidence': upper}) if not np.all(parameter == 0): result.insert(4, 'pivot', pivots) @@ -117,31 +110,30 @@ def summary(self, return result def _approx_log_reference(self, - observed_target, - target_cov, - target_score_cov, - grid): + observed_target, + target_cov, + target_score_cov, + grid): """ Approximate the log of the reference density on a grid. 
- """ if np.asarray(observed_target).shape in [(), (0,)]: - raise ValueError('no target specified') + raise ValueError('no target specified') prec_target = np.linalg.inv(target_cov) target_lin = - self.logdens_linear.dot(target_score_cov.T.dot(prec_target)) ref_hat = [] - solver = solve_barrier_affine_C + solver = solve_barrier_affine_py for k in range(grid.shape[0]): # in the usual D = N + Gamma theta.hat, # target_lin is "something" times Gamma, # where "something" comes from implied Gaussian # cond_mean is "something" times D # Gamma is target_score_cov.T.dot(prec_target) - - cond_mean_grid = (target_lin.dot(np.atleast_1d(grid[k] - observed_target)) + + + cond_mean_grid = (target_lin.dot(np.atleast_1d(grid[k] - observed_target)) + self.cond_mean) conjugate_arg = self.prec_opt.dot(cond_mean_grid) @@ -158,14 +150,19 @@ def _approx_log_reference(self, def _construct_families(self): + self._construct_density() + self._families = [] + for m in range(self.ntarget): p = self.target_score_cov.shape[1] observed_target_uni = (self.observed_target[m]).reshape((1,)) + target_cov_uni = (np.diag(self.target_cov)[m]).reshape((1, 1)) - var_target = target_cov_uni[0, 0] target_score_cov_uni = self.target_score_cov[m, :].reshape((1, p)) + var_target = 1. / ((self.precs[m])[0, 0]) + approx_log_ref = self._approx_log_reference(observed_target_uni, target_cov_uni, target_score_cov_uni, @@ -179,15 +176,14 @@ def _construct_families(self): grid = np.linspace(self.stat_grid[m].min(), self.stat_grid[m].max(), 1000) logW = (approx_fn(grid) - - 0.5 * (grid - self.observed_target[m])**2 / var_target) + 0.5 * (grid - self.observed_target[m]) ** 2 / var_target) logW -= logW.max() - weights = np.exp(logW) # construction of families follows `selectinf.learning.core` - + self._families.append(discrete_family(grid, - weights)) - + np.exp(logW))) + # logG = - 0.5 * grid**2 / var_target # logG -= logG.max() # import matplotlib.pyplot as plt @@ -207,22 +203,24 @@ def _approx_pivots(self, if not hasattr(self, "_families"): self._construct_families() - + if alternatives is None: alternatives = ['twosided'] * self.ntarget pivot = [] + p = self.target_score_cov.shape[1] for m in range(self.ntarget): + family = self._families[m] - observed_target = self.observed_target[m] - var_target = self.target_cov[m, m] + var_target = 1. / ((self.precs[m])[0, 0]) + mean = self.S[m].dot(mean_parameter[m].reshape((1,))) + self.r[m] + #print("mean ", np.allclose(mean[0], mean_parameter[m]), self.r[m], self.S[m]) # construction of pivot from families follows `selectinf.learning.core` - _cdf = family.cdf((mean_parameter[m] - observed_target) / var_target, - x=observed_target) - #_cdf = family.cdf(mean_parameter[m]/var_target, x=observed_target) + _cdf = family.cdf((mean[0] - self.observed_target[m]) / var_target, x=self.observed_target[m]) + if alternatives[m] == 'twosided': pivot.append(2 * min(_cdf, 1 - _cdf)) elif alternatives[m] == 'greater': @@ -238,18 +236,57 @@ def _approx_intervals(self, if not hasattr(self, "_families"): self._construct_families() - + lower, upper = [], [] for m in range(self.ntarget): # construction of intervals from families follows `selectinf.learning.core` family = self._families[m] observed_target = self.observed_target[m] + l, u = family.equal_tailed_interval(observed_target, - alpha=1-level) - var_target = self.target_cov[m, m] - lower.append(l * var_target + observed_target) + alpha=1 - level) + + var_target = 1. 
/ ((self.precs[m])[0, 0]) + + lower.append(l * var_target + observed_target) upper.append(u * var_target + observed_target) return np.asarray(lower), np.asarray(upper) + ### Private method + def _construct_density(self): + + precs = {} + S = {} + r = {} + + p = self.target_score_cov.shape[1] + + for m in range(self.ntarget): + observed_target_uni = (self.observed_target[m]).reshape((1,)) + target_cov_uni = (np.diag(self.target_cov)[m]).reshape((1, 1)) + prec_target = 1. / target_cov_uni + target_score_cov_uni = self.target_score_cov[m, :].reshape((1, p)) + + target_linear = target_score_cov_uni.T.dot(prec_target) + target_offset = (self.score_offset - target_linear.dot(observed_target_uni)).reshape( + (target_linear.shape[0],)) + + target_lin = -self.logdens_linear.dot(target_linear) + target_off = (self.cond_mean - target_lin.dot(observed_target_uni)).reshape((target_lin.shape[0],)) + + _prec = prec_target + (target_linear.T.dot(target_linear) * self.randomizer_prec) - target_lin.T.dot( + self.prec_opt).dot(target_lin) + + _P = target_linear.T.dot(target_offset) * self.randomizer_prec + _r = (1. / _prec).dot(target_lin.T.dot(self.prec_opt).dot(target_off) - _P) + _S = np.linalg.inv(_prec).dot(prec_target) + + S[m] = _S + r[m] = _r + precs[m] = _prec + + self.precs = precs + self.S = S + self.r = r \ No newline at end of file diff --git a/selectinf/randomized/tests/test_approx_reference.py b/selectinf/randomized/tests/test_approx_reference.py index aaf2544c4..62c83f7fa 100644 --- a/selectinf/randomized/tests/test_approx_reference.py +++ b/selectinf/randomized/tests/test_approx_reference.py @@ -63,10 +63,10 @@ def test_approx_pivot(n=500, rho=0.4, randomizer_scale=1.): - while True: + inst, const = gaussian_instance, lasso.gaussian + signal = np.sqrt(signal_fac * 2 * np.log(p)) - inst, const = gaussian_instance, lasso.gaussian - signal = np.sqrt(signal_fac * 2 * np.log(p)) + while True: X, Y, beta = inst(n=n, p=p, @@ -75,55 +75,39 @@ def test_approx_pivot(n=500, equicorrelated=True, rho=rho, sigma=sigma, - random_signs=True)[:3] + random_signs=False)[:3] n, p = X.shape sigma_ = np.std(Y) - if n > p: + if n > (2 * p): dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) else: dispersion = sigma_ ** 2 - # W = 1 * np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ eps = np.random.standard_normal((n, 2000)) * Y.std() - lam_theory = 0.7 * np.median(np.abs(X.T.dot(eps)).max(1)) + lam_theory = 0.6 * np.median(np.abs(X.T.dot(eps)).max(1)) W = lam_theory * np.ones(p) conv = const(X, Y, W, - ridge_term=0.) 
- # randomizer_scale=randomizer_scale * dispersion) + ridge_term=0., + randomizer_scale=randomizer_scale * dispersion) signs = conv.fit() nonzero = signs != 0 - print("number of selected ", nonzero.sum()) if nonzero.sum() > 0: - beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) - if n > p: - (observed_target, - cov_target, - cov_target_score, - alternatives) = selected_targets(conv.loglike, - conv._W, - nonzero, - dispersion=dispersion) - - else: - (observed_target, - cov_target, - cov_target_score, - alternatives) = selected_targets(conv.loglike, - conv._W, - nonzero, - dispersion=sigma ** 2) - - inverse_info = conv.selective_MLE(observed_target, - cov_target, - cov_target_score)[1] + + (observed_target, + cov_target, + cov_target_score, + alternatives) = selected_targets(conv.loglike, + conv._W, + nonzero, + dispersion=dispersion) approximate_grid_inf = approximate_grid_inference(conv, observed_target, @@ -134,6 +118,7 @@ def test_approx_pivot(n=500, return pivot + def test_approx_ci(n=500, p=100, signal_fac=1., @@ -218,9 +203,9 @@ def main(nsim=300, CI = False): if CI is False: _pivot = [] for i in range(nsim): - _pivot.extend(test_approx_pivot(n=100, - p=400, - signal_fac=0.5, + _pivot.extend(test_approx_pivot(n=400, + p=100, + signal_fac=1., s=0, sigma=1., rho=0.30, @@ -244,7 +229,7 @@ def main(nsim=300, CI = False): signal_fac=1., s=5, sigma=3., - rho=0.3, + rho=0.4, randomizer_scale=1.) coverage_ += cov From 93e808f64533a3019ee57dd09cf10e4aaa5e883f Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Sat, 12 Jun 2021 23:54:28 -0400 Subject: [PATCH 094/187] removed interp1d for now to compute reference on a grid --- selectinf/randomized/approx_reference.py | 1 - selectinf/randomized/exact_reference.py | 130 ++++++---- .../randomized/tests/test_approx_reference.py | 16 +- .../randomized/tests/test_exact_reference.py | 224 ++++++------------ 4 files changed, 161 insertions(+), 210 deletions(-) diff --git a/selectinf/randomized/approx_reference.py b/selectinf/randomized/approx_reference.py index a706b6789..62ab28f56 100644 --- a/selectinf/randomized/approx_reference.py +++ b/selectinf/randomized/approx_reference.py @@ -208,7 +208,6 @@ def _approx_pivots(self, alternatives = ['twosided'] * self.ntarget pivot = [] - p = self.target_score_cov.shape[1] for m in range(self.ntarget): diff --git a/selectinf/randomized/exact_reference.py b/selectinf/randomized/exact_reference.py index 5e5c43db8..96fab032e 100644 --- a/selectinf/randomized/exact_reference.py +++ b/selectinf/randomized/exact_reference.py @@ -18,35 +18,25 @@ def __init__(self, """ Produce p-values and confidence intervals for targets of model including selected features - Parameters ---------- - query : `gaussian_query` A Gaussian query which has information to describe implied Gaussian. - observed_target : ndarray Observed estimate of target. - target_cov : ndarray Estimated covaraince of target. - target_score_cov : ndarray Estimated covariance of target and score of randomized query. - solve_args : dict, optional Arguments passed to solver. 
- """ - self.solve_args = solve_args - result, inverse_info = query.selective_MLE(observed_target, target_cov, target_score_cov, solve_args=solve_args)[:2] - mle = result['MLE'] self.linear_part = query.sampler.affine_con.linear_part self.offset = query.sampler.affine_con.offset @@ -62,16 +52,21 @@ def __init__(self, self.init_soln = query.observed_opt_state + self.randomizer_prec = query.sampler.randomizer_prec + self.score_offset = query.observed_score_state + query.sampler.logdens_transform[1] + self.ntarget = ntarget = target_cov.shape[0] - _scale = 4. * np.sqrt(np.diag(inverse_info)) - ngrid = 40 + _scale = 4 * np.sqrt(np.diag(inverse_info)) + ngrid = 1000 self.stat_grid = np.zeros((ntarget, ngrid)) for j in range(ntarget): - self.stat_grid[j, :] = np.linspace(observed_target[j] - 1. * _scale[j], - observed_target[j] + 1. * _scale[j], + self.stat_grid[j, :] = np.linspace(observed_target[j] - 1.5 * _scale[j], + observed_target[j] + 1.5 * _scale[j], num=ngrid) + self.opt_linear = query.opt_linear + def summary(self, alternatives=None, parameter=None, @@ -79,31 +74,26 @@ def summary(self, """ Produce p-values and confidence intervals for targets of model including selected features - Parameters ---------- - alternatives : [str], optional Sequence of strings describing the alternatives, should be values of ['twosided', 'less', 'greater'] - parameter : np.array Hypothesized value for parameter -- defaults to 0. - level : float Confidence level. - """ if parameter is not None: - pivots = self.approx_pivots(parameter, + pivots = self._pivots(parameter, alternatives=alternatives) else: pivots = None - pvalues = self._approx_pivots(np.zeros_like(self.observed_target), + pvalues = self._pivots(np.zeros_like(self.observed_target), alternatives=alternatives) - lower, upper = self._approx_intervals(level=level) + lower, upper = self._intervals(level=level) result = pd.DataFrame({'target': self.observed_target, 'pvalue': pvalues, @@ -152,21 +142,23 @@ def log_reference(self, implied_prec = 1./implied_cov _A = self.cond_cov.dot(eta) * implied_prec + R = np.identity(num_opt) - _A.dot(eta.T) + A = self.linear_part.dot(_A).reshape((-1,)) - b = self.linear_part.dot((-np.identity(num_opt) + _A.dot(eta.T)).dot(self.init_soln)) + b = -self.linear_part.dot(R).dot(self.init_soln) + + trunc_ = np.true_divide((self.offset + b), A) neg_indx = np.asarray([j for j in range(num_con) if A[j] < 0.]) pos_indx = np.asarray([j for j in range(num_con) if A[j] > 0.]) - trunc_ = (self.offset + b) / A - if pos_indx.shape[0]>0 and neg_indx.shape[0]>0: trunc_lower = np.max(trunc_[neg_indx]) trunc_upper = np.min(trunc_[pos_indx]) - lower_limit = (trunc_lower - implied_mean) * implied_prec - upper_limit = (trunc_upper - implied_mean) * implied_prec + lower_limit = (trunc_lower - implied_mean) * np.sqrt(implied_prec) + upper_limit = (trunc_upper - implied_mean) * np.sqrt(implied_prec) ref_hat.append(np.log(ndist.cdf(upper_limit) - ndist.cdf(lower_limit))) @@ -174,7 +166,7 @@ def log_reference(self, trunc_upper = np.min(trunc_[pos_indx]) - upper_limit = (trunc_upper - implied_mean) * implied_prec + upper_limit = (trunc_upper - implied_mean) * np.sqrt(implied_prec) ref_hat.append(np.log(ndist.cdf(upper_limit))) @@ -182,7 +174,7 @@ def log_reference(self, trunc_lower = np.max(trunc_[neg_indx]) - lower_limit = (trunc_lower - implied_mean) * implied_prec + lower_limit = (trunc_lower - implied_mean) * np.sqrt(implied_prec) ref_hat.append(np.log(1. 
- ndist.cdf(lower_limit))) @@ -190,33 +182,30 @@ def log_reference(self, def _construct_families(self): + self._construct_density() + self._families = [] + for m in range(self.ntarget): p = self.target_score_cov.shape[1] observed_target_uni = (self.observed_target[m]).reshape((1,)) + target_cov_uni = (np.diag(self.target_cov)[m]).reshape((1, 1)) - var_target = target_cov_uni[0, 0] target_score_cov_uni = self.target_score_cov[m, :].reshape((1, p)) + var_target = 1. / ((self.precs[m])[0, 0]) + log_ref = self.log_reference(observed_target_uni, target_cov_uni, target_score_cov_uni, self.stat_grid[m]) - grid_approx_fn = interp1d(self.stat_grid[m], - log_ref, - kind='quadratic', - bounds_error=False, - fill_value='extrapolate') - - grid = np.linspace(self.stat_grid[m].min(), self.stat_grid[m].max(), 1000) - logW = (grid_approx_fn(grid) - - 0.5 * (grid - self.observed_target[m]) ** 2 / var_target) + logW = (log_ref - 0.5 * (self.stat_grid[m] - self.observed_target[m]) ** 2 / var_target) logW -= logW.max() # construction of families follows `selectinf.learning.core` - self._families.append(discrete_family(grid, + self._families.append(discrete_family(self.stat_grid[m], np.exp(logW))) def _pivots(self, @@ -228,19 +217,18 @@ def _pivots(self, if alternatives is None: alternatives = ['twosided'] * self.ntarget - else: - alternatives = [alternatives] *self.ntarget + pivot = [] for m in range(self.ntarget): + family = self._families[m] - observed_target = self.observed_target[m] - var_target = self.target_cov[m, m] + var_target = 1. / ((self.precs[m])[0, 0]) + + mean = self.S[m].dot(mean_parameter[m].reshape((1,))) + self.r[m] - # construction of pivot from families follows `selectinf.learning.core` + _cdf = family.cdf((mean[0] - self.observed_target[m]) / var_target, x=self.observed_target[m]) - _cdf = family.cdf((mean_parameter[m] - observed_target) / var_target, - x=observed_target) if alternatives[m] == 'twosided': pivot.append(2 * min(_cdf, 1 - _cdf)) elif alternatives[m] == 'greater': @@ -263,10 +251,54 @@ def _intervals(self, # construction of intervals from families follows `selectinf.learning.core` family = self._families[m] observed_target = self.observed_target[m] + l, u = family.equal_tailed_interval(observed_target, alpha=1 - level) - var_target = self.target_cov[m, m] + + var_target = 1. / ((self.precs[m])[0, 0]) + lower.append(l * var_target + observed_target) upper.append(u * var_target + observed_target) - return np.asarray(lower), np.asarray(upper) \ No newline at end of file + return np.asarray(lower), np.asarray(upper) + + ### Private method + def _construct_density(self): + + precs = {} + S = {} + r = {} + + p = self.target_score_cov.shape[1] + + for m in range(self.ntarget): + observed_target_uni = (self.observed_target[m]).reshape((1,)) + target_cov_uni = (np.diag(self.target_cov)[m]).reshape((1, 1)) + prec_target = 1. / target_cov_uni + target_score_cov_uni = self.target_score_cov[m, :].reshape((1, p)) + + target_linear = target_score_cov_uni.T.dot(prec_target) + target_offset = (self.score_offset - target_linear.dot(observed_target_uni)).reshape( + (target_linear.shape[0],)) + + target_lin = -self.logdens_linear.dot(target_linear) + target_off = (self.cond_mean - target_lin.dot(observed_target_uni)).reshape((target_lin.shape[0],)) + + _prec = prec_target + (target_linear.T.dot(target_linear) * self.randomizer_prec) - target_lin.T.dot( + self.prec_opt).dot(target_lin) + + _P = target_linear.T.dot(target_offset) * self.randomizer_prec + _r = (1. 
/ _prec).dot(target_lin.T.dot(self.prec_opt).dot(target_off) - _P) + _S = np.linalg.inv(_prec).dot(prec_target) + + S[m] = _S + r[m] = _r + precs[m] = _prec + + self.precs = precs + self.S = S + self.r = r + + + + diff --git a/selectinf/randomized/tests/test_approx_reference.py b/selectinf/randomized/tests/test_approx_reference.py index 62c83f7fa..2c942f89d 100644 --- a/selectinf/randomized/tests/test_approx_reference.py +++ b/selectinf/randomized/tests/test_approx_reference.py @@ -93,10 +93,11 @@ def test_approx_pivot(n=500, Y, W, ridge_term=0., - randomizer_scale=randomizer_scale * dispersion) + randomizer_scale=randomizer_scale * sigma_) signs = conv.fit() nonzero = signs != 0 + print("no of variables selected ", nonzero.sum()) if nonzero.sum() > 0: beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) @@ -150,7 +151,7 @@ def test_approx_ci(n=500, conv = const(X, Y, W, - randomizer_scale=randomizer_scale * dispersion) + randomizer_scale=randomizer_scale * sigma_) signs = conv.fit() nonzero = signs != 0 @@ -165,7 +166,6 @@ def test_approx_ci(n=500, nonzero, dispersion=dispersion) - ntarget = observed_target.shape[0] result, inverse_info = conv.selective_MLE(observed_target, cov_target, cov_target_score)[:2] @@ -203,12 +203,12 @@ def main(nsim=300, CI = False): if CI is False: _pivot = [] for i in range(nsim): - _pivot.extend(test_approx_pivot(n=400, + _pivot.extend(test_approx_pivot(n=500, p=100, - signal_fac=1., + signal_fac=0.5, s=0, - sigma=1., - rho=0.30, + sigma=2., + rho=0.50, randomizer_scale=1.)) print("iteration completed ", i) @@ -239,4 +239,4 @@ def main(nsim=300, CI = False): print("iteration completed ", n + 1) if __name__ == "__main__": - main(nsim=50, CI = False) + main(nsim=20, CI = False) diff --git a/selectinf/randomized/tests/test_exact_reference.py b/selectinf/randomized/tests/test_exact_reference.py index c023b0d65..18a061344 100644 --- a/selectinf/randomized/tests/test_exact_reference.py +++ b/selectinf/randomized/tests/test_exact_reference.py @@ -12,167 +12,87 @@ def test_approx_pivot(n=500, rho=0.4, randomizer_scale=1.): - inst, const = gaussian_instance, lasso.gaussian - signal = np.sqrt(signal_fac * 2 * np.log(p)) - - X, Y, beta = inst(n=n, - p=p, - signal=0, - s=s, - equicorrelated=True, - rho=rho, - sigma=sigma, - random_signs=False)[:3] - - n, p = X.shape - - sigma_ = np.std(Y) - #dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) - dispersion = sigma_ ** 2 - - #W = 1 * np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * np.sqrt(dispersion) - eps = np.random.standard_normal((n, 2000)) * Y.std() - lam_theory = 0.7 * np.median(np.abs(X.T.dot(eps)).max(1)) - - conv = const(X, - Y, - lam_theory * np.ones(p), - randomizer_scale=randomizer_scale * dispersion) - - signs = conv.fit() - nonzero = signs != 0 - print("size of selected set ", nonzero.sum()) - - if nonzero.sum()>0: - beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) - - (observed_target, - cov_target, - cov_target_score, - alternatives) = selected_targets(conv.loglike, - conv._W, - nonzero, - dispersion=None) - - exact_grid_inf = exact_grid_inference(conv, - observed_target, - cov_target, - cov_target_score) - - pivot = exact_grid_inf._pivots(beta_target) - - return pivot - -def test_approx_ci(n=500, - p=100, - signal_fac=1., - s=5, - sigma=2., - rho=0.4, - randomizer_scale=1., - level=0.9): - - inst, const = gaussian_instance, lasso.gaussian - signal = np.sqrt(signal_fac * 2 * np.log(p)) - - X, Y, beta = inst(n=n, - p=p, - signal=signal, - s=s, - 
equicorrelated=False, - rho=rho, - sigma=sigma, - random_signs=True)[:3] - - n, p = X.shape - - sigma_ = np.std(Y) - dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) - - W = 1 * np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * np.sqrt(dispersion) - - conv = const(X, - Y, - W, - randomizer_scale=randomizer_scale * dispersion) - - signs = conv.fit() - nonzero = signs != 0 - - if nonzero.sum()>0: - - (observed_target, - cov_target, - cov_target_score, - alternatives) = selected_targets(conv.loglike, - conv._W, - nonzero, - dispersion=dispersion) - - result, inverse_info = conv.selective_MLE(observed_target, + while True: + + inst, const = gaussian_instance, lasso.gaussian + signal = np.sqrt(signal_fac * 2 * np.log(p)) + + X, Y, beta = inst(n=n, + p=p, + signal=signal, + s=s, + equicorrelated=True, + rho=rho, + sigma=sigma, + random_signs=True)[:3] + + n, p = X.shape + + sigma_ = np.std(Y) + + if n > (2 * p): + dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) + else: + dispersion = sigma_ ** 2 + + eps = np.random.standard_normal((n, 2000)) * Y.std() + W = 0.7 * np.median(np.abs(X.T.dot(eps)).max(1)) + + conv = const(X, + Y, + W, + ridge_term=0.) + #randomizer_scale=randomizer_scale * np.sqrt(dispersion)) + + signs = conv.fit() + nonzero = signs != 0 + print("size of selected set ", nonzero.sum()) + + if nonzero.sum() > 0: + beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) + + (observed_target, + cov_target, + cov_target_score, + alternatives) = selected_targets(conv.loglike, + conv._W, + nonzero, + dispersion=dispersion) + + exact_grid_inf = exact_grid_inference(conv, + observed_target, cov_target, - cov_target_score)[:2] + cov_target_score) - exact_grid_inf = exact_grid_inference(conv, - observed_target, - cov_target, - cov_target_score) + pivot = exact_grid_inf._pivots(beta_target) - lci, uci = exact_grid_inf._intervals(level) + return pivot - beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) - coverage = (lci < beta_target) * (uci > beta_target) - length = uci - lci - - return np.mean(coverage), np.mean(length), np.mean(length-(3.3 * np.sqrt(np.diag(inverse_info)))) - -def main(nsim=300, CI=False): +def main(nsim=300): import matplotlib as mpl mpl.use('tkagg') import matplotlib.pyplot as plt from statsmodels.distributions.empirical_distribution import ECDF - if CI is False: - _pivot = [] - for i in range(nsim): - _pivot.extend(test_approx_pivot(n=100, - p=400, - signal_fac=1., - s=0, - sigma=1., - rho=0.30, - randomizer_scale=0.7)) - - print("iteration completed ", i) - - plt.clf() - ecdf_pivot = ECDF(np.asarray(_pivot)) - grid = np.linspace(0, 1, 101) - plt.plot(grid, ecdf_pivot(grid), c='blue', marker='^') - plt.plot(grid, grid, 'k--') - plt.show() - - if CI is True: - coverage_ = 0. - length_ = 0. - length_diff_ = 0. - for n in range(nsim): - cov, len, len_diff = test_approx_ci(n=500, - p=100, - signal_fac=1., - s=5, - sigma=3., - rho=0.50, - randomizer_scale=1.) 
- - coverage_ += cov - length_ += len - length_diff_ += len_diff - print("coverage so far ", coverage_ / (n + 1.)) - print("lengths so far ", length_ / (n + 1.), length_diff_/(n+1.)) - print("iteration completed ", n + 1) - + _pivot = [] + for i in range(nsim): + _pivot.extend(test_approx_pivot(n=400, + p=100, + signal_fac=0.5, + s=0, + sigma=1., + rho=0.30, + randomizer_scale=1.)) + + print("iteration completed ", i) + + plt.clf() + ecdf_pivot = ECDF(np.asarray(_pivot)) + grid = np.linspace(0, 1, 101) + plt.plot(grid, ecdf_pivot(grid), c='blue', marker='^') + plt.plot(grid, grid, 'k--') + plt.show() if __name__ == "__main__": - main(nsim=50, CI=False) \ No newline at end of file + main(nsim=100) \ No newline at end of file From 43d78d2243a661e31a9e0bf451da81fc3815f893 Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Mon, 14 Jun 2021 13:01:30 -0400 Subject: [PATCH 095/187] added option to use interp1d --- selectinf/randomized/approx_reference.py | 56 ++++++++++++++++-------- selectinf/randomized/exact_reference.py | 51 ++++++++++++++------- 2 files changed, 73 insertions(+), 34 deletions(-) diff --git a/selectinf/randomized/approx_reference.py b/selectinf/randomized/approx_reference.py index 62ab28f56..4c14dfad8 100644 --- a/selectinf/randomized/approx_reference.py +++ b/selectinf/randomized/approx_reference.py @@ -14,7 +14,8 @@ def __init__(self, observed_target, target_cov, target_score_cov, - solve_args={'tol': 1.e-12}): + solve_args={'tol': 1.e-12}, + useIP=False): """ Produce p-values and confidence intervals for targets @@ -60,15 +61,24 @@ def __init__(self, self.ntarget = ntarget = target_cov.shape[0] _scale = 4 * np.sqrt(np.diag(inverse_info)) - ngrid = 60 - self.stat_grid = np.zeros((ntarget, ngrid)) - for j in range(ntarget): - self.stat_grid[j, :] = np.linspace(observed_target[j] - 1.5 * _scale[j], - observed_target[j] + 1.5 * _scale[j], - num=ngrid) + if useIP == False: + ngrid = 1000 + self.stat_grid = np.zeros((ntarget, ngrid)) + for j in range(ntarget): + self.stat_grid[j, :] = np.linspace(observed_target[j] - 1.5 * _scale[j], + observed_target[j] + 1.5 * _scale[j], + num=ngrid) + else: + ngrid = 60 + self.stat_grid = np.zeros((ntarget, ngrid)) + for j in range(ntarget): + self.stat_grid[j, :] = np.linspace(observed_target[j] - 1.5 * _scale[j], + observed_target[j] + 1.5 * _scale[j], + num=ngrid) self.opt_linear = query.opt_linear + self.useIP = useIP def summary(self, alternatives=None, @@ -168,21 +178,29 @@ def _construct_families(self): target_score_cov_uni, self.stat_grid[m]) - approx_fn = interp1d(self.stat_grid[m], - approx_log_ref, - kind='quadratic', - bounds_error=False, - fill_value='extrapolate') - grid = np.linspace(self.stat_grid[m].min(), self.stat_grid[m].max(), 1000) - logW = (approx_fn(grid) - - 0.5 * (grid - self.observed_target[m]) ** 2 / var_target) - logW -= logW.max() + if self.useIP == False: + logW = (approx_log_ref - 0.5 * (self.stat_grid[m] - self.observed_target[m]) ** 2 / var_target) + logW -= logW.max() + self._families.append(discrete_family(self.stat_grid[m], + np.exp(logW))) + else: + approx_fn = interp1d(self.stat_grid[m], + approx_log_ref, + kind='quadratic', + bounds_error=False, + fill_value='extrapolate') - # construction of families follows `selectinf.learning.core` + grid = np.linspace(self.stat_grid[m].min(), self.stat_grid[m].max(), 1000) + logW = (approx_fn(grid) - + 0.5 * (grid - self.observed_target[m]) ** 2 / var_target) - self._families.append(discrete_family(grid, - np.exp(logW))) + logW -= logW.max() + 
self._families.append(discrete_family(grid, + np.exp(logW))) + + + # construction of families follows `selectinf.learning.core` # logG = - 0.5 * grid**2 / var_target # logG -= logG.max() diff --git a/selectinf/randomized/exact_reference.py b/selectinf/randomized/exact_reference.py index 96fab032e..80169a9a0 100644 --- a/selectinf/randomized/exact_reference.py +++ b/selectinf/randomized/exact_reference.py @@ -13,7 +13,8 @@ def __init__(self, observed_target, target_cov, target_score_cov, - solve_args={'tol': 1.e-12}): + solve_args={'tol': 1.e-12}, + useIP=False): """ Produce p-values and confidence intervals for targets @@ -57,15 +58,24 @@ def __init__(self, self.ntarget = ntarget = target_cov.shape[0] _scale = 4 * np.sqrt(np.diag(inverse_info)) - ngrid = 1000 - self.stat_grid = np.zeros((ntarget, ngrid)) - for j in range(ntarget): - self.stat_grid[j, :] = np.linspace(observed_target[j] - 1.5 * _scale[j], - observed_target[j] + 1.5 * _scale[j], - num=ngrid) + if useIP == False: + ngrid = 1000 + self.stat_grid = np.zeros((ntarget, ngrid)) + for j in range(ntarget): + self.stat_grid[j, :] = np.linspace(observed_target[j] - 1.5 * _scale[j], + observed_target[j] + 1.5 * _scale[j], + num=ngrid) + else: + ngrid = 60 + self.stat_grid = np.zeros((ntarget, ngrid)) + for j in range(ntarget): + self.stat_grid[j, :] = np.linspace(observed_target[j] - 1.5 * _scale[j], + observed_target[j] + 1.5 * _scale[j], + num=ngrid) self.opt_linear = query.opt_linear + self.useIP = useIP def summary(self, alternatives=None, @@ -199,14 +209,25 @@ def _construct_families(self): target_cov_uni, target_score_cov_uni, self.stat_grid[m]) - - logW = (log_ref - 0.5 * (self.stat_grid[m] - self.observed_target[m]) ** 2 / var_target) - logW -= logW.max() - - # construction of families follows `selectinf.learning.core` - - self._families.append(discrete_family(self.stat_grid[m], - np.exp(logW))) + if self.useIP == False: + logW = (log_ref - 0.5 * (self.stat_grid[m] - self.observed_target[m]) ** 2 / var_target) + logW -= logW.max() + self._families.append(discrete_family(self.stat_grid[m], + np.exp(logW))) + else: + approx_fn = interp1d(self.stat_grid[m], + log_ref, + kind='quadratic', + bounds_error=False, + fill_value='extrapolate') + + grid = np.linspace(self.stat_grid[m].min(), self.stat_grid[m].max(), 1000) + logW = (approx_fn(grid) - + 0.5 * (grid - self.observed_target[m]) ** 2 / var_target) + + logW -= logW.max() + self._families.append(discrete_family(grid, + np.exp(logW))) def _pivots(self, mean_parameter, From 0b623f9feb7bbd5f48c10b05b70b60c50b6f56b8 Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Mon, 14 Jun 2021 13:02:02 -0400 Subject: [PATCH 096/187] updated tests --- .../randomized/tests/test_approx_reference.py | 26 +++++++++++-------- .../randomized/tests/test_exact_reference.py | 19 +++++++++----- 2 files changed, 27 insertions(+), 18 deletions(-) diff --git a/selectinf/randomized/tests/test_approx_reference.py b/selectinf/randomized/tests/test_approx_reference.py index 2c942f89d..a7233a123 100644 --- a/selectinf/randomized/tests/test_approx_reference.py +++ b/selectinf/randomized/tests/test_approx_reference.py @@ -61,7 +61,9 @@ def test_approx_pivot(n=500, s=5, sigma=2., rho=0.4, - randomizer_scale=1.): + randomizer_scale=1., + equicorrelated=False, + useIP=False): inst, const = gaussian_instance, lasso.gaussian signal = np.sqrt(signal_fac * 2 * np.log(p)) @@ -72,7 +74,7 @@ def test_approx_pivot(n=500, p=p, signal=signal, s=s, - equicorrelated=True, + equicorrelated=equicorrelated, rho=rho, sigma=sigma, 
random_signs=False)[:3] @@ -86,14 +88,13 @@ def test_approx_pivot(n=500, dispersion = sigma_ ** 2 eps = np.random.standard_normal((n, 2000)) * Y.std() - lam_theory = 0.6 * np.median(np.abs(X.T.dot(eps)).max(1)) - W = lam_theory * np.ones(p) + W = 0.7 * np.median(np.abs(X.T.dot(eps)).max(1)) conv = const(X, Y, W, - ridge_term=0., - randomizer_scale=randomizer_scale * sigma_) + ridge_term=0.) + #randomizer_scale=randomizer_scale * sigma_) signs = conv.fit() nonzero = signs != 0 @@ -113,7 +114,8 @@ def test_approx_pivot(n=500, approximate_grid_inf = approximate_grid_inference(conv, observed_target, cov_target, - cov_target_score) + cov_target_score, + useIP=useIP) pivot = approximate_grid_inf._approx_pivots(beta_target) @@ -203,13 +205,15 @@ def main(nsim=300, CI = False): if CI is False: _pivot = [] for i in range(nsim): - _pivot.extend(test_approx_pivot(n=500, - p=100, + _pivot.extend(test_approx_pivot(n=100, + p=400, signal_fac=0.5, s=0, sigma=2., - rho=0.50, - randomizer_scale=1.)) + rho=0.30, + randomizer_scale=1., + equicorrelated=True, + useIP=True)) print("iteration completed ", i) diff --git a/selectinf/randomized/tests/test_exact_reference.py b/selectinf/randomized/tests/test_exact_reference.py index 18a061344..ddeb6cee4 100644 --- a/selectinf/randomized/tests/test_exact_reference.py +++ b/selectinf/randomized/tests/test_exact_reference.py @@ -10,7 +10,9 @@ def test_approx_pivot(n=500, s=5, sigma=2., rho=0.4, - randomizer_scale=1.): + randomizer_scale=1., + equicorrelated=False, + useIP=False): while True: @@ -21,7 +23,7 @@ def test_approx_pivot(n=500, p=p, signal=signal, s=s, - equicorrelated=True, + equicorrelated=equicorrelated, rho=rho, sigma=sigma, random_signs=True)[:3] @@ -62,7 +64,8 @@ def test_approx_pivot(n=500, exact_grid_inf = exact_grid_inference(conv, observed_target, cov_target, - cov_target_score) + cov_target_score, + useIP=useIP) pivot = exact_grid_inf._pivots(beta_target) @@ -77,20 +80,22 @@ def main(nsim=300): _pivot = [] for i in range(nsim): - _pivot.extend(test_approx_pivot(n=400, - p=100, + _pivot.extend(test_approx_pivot(n=100, + p=400, signal_fac=0.5, s=0, sigma=1., rho=0.30, - randomizer_scale=1.)) + randomizer_scale=1., + equicorrelated=True, + useIP=False)) print("iteration completed ", i) plt.clf() ecdf_pivot = ECDF(np.asarray(_pivot)) grid = np.linspace(0, 1, 101) - plt.plot(grid, ecdf_pivot(grid), c='blue', marker='^') + plt.plot(grid, ecdf_pivot(grid), c='blue') plt.plot(grid, grid, 'k--') plt.show() From 58792c80666943b755df3a93ad4247d68e981d26 Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Mon, 14 Jun 2021 13:31:59 -0400 Subject: [PATCH 097/187] added barrier affine --- selectinf/algorithms/barrier_affine.py | 139 +++++++++++++++++++++++++ 1 file changed, 139 insertions(+) create mode 100644 selectinf/algorithms/barrier_affine.py diff --git a/selectinf/algorithms/barrier_affine.py b/selectinf/algorithms/barrier_affine.py new file mode 100644 index 000000000..88812c278 --- /dev/null +++ b/selectinf/algorithms/barrier_affine.py @@ -0,0 +1,139 @@ +import numpy as np + +def solve_barrier_affine_py(conjugate_arg, + precision, + feasible_point, + con_linear, + con_offset, + step=1, + nstep=1000, + min_its=200, + tol=1.e-10): + + scaling = np.sqrt(np.diag(con_linear.dot(precision).dot(con_linear.T))) + + if feasible_point is None: + feasible_point = 1. / scaling + + objective = lambda u: -u.T.dot(conjugate_arg) + u.T.dot(precision).dot(u)/2. 
\ + + np.log(1.+ 1./((con_offset - con_linear.dot(u))/ scaling)).sum() + grad = lambda u: -conjugate_arg + precision.dot(u) - con_linear.T.dot(1./(scaling + con_offset - con_linear.dot(u)) - + 1./(con_offset - con_linear.dot(u))) + barrier_hessian = lambda u: con_linear.T.dot(np.diag(-1./((scaling + con_offset-con_linear.dot(u))**2.) + + 1./((con_offset-con_linear.dot(u))**2.))).dot(con_linear) + + current = feasible_point + current_value = np.inf + + for itercount in range(nstep): + cur_grad = grad(current) + + # make sure proposal is feasible + + count = 0 + while True: + count += 1 + proposal = current - step * cur_grad + if np.all(con_offset-con_linear.dot(proposal) > 0): + break + step *= 0.5 + if count >= 40: + raise ValueError('not finding a feasible point') + + # make sure proposal is a descent + + count = 0 + while True: + count += 1 + proposal = current - step * cur_grad + proposed_value = objective(proposal) + if proposed_value <= current_value: + break + step *= 0.5 + if count >= 20: + if not (np.isnan(proposed_value) or np.isnan(current_value)): + break + else: + raise ValueError('value is NaN: %f, %f' % (proposed_value, current_value)) + + # stop if relative decrease is small + + if np.fabs(current_value - proposed_value) < tol * np.fabs(current_value) and itercount >= min_its: + current = proposal + current_value = proposed_value + break + + current = proposal + current_value = proposed_value + + if itercount % 4 == 0: + step *= 2 + + hess = np.linalg.inv(precision + barrier_hessian(current)) + return current_value, current, hess + +def solve_barrier_nonneg(conjugate_arg, + precision, + feasible_point=None, + step=1, + nstep=1000, + tol=1.e-8): + + scaling = np.sqrt(np.diag(precision)) + + if feasible_point is None: + feasible_point = 1. / scaling + + objective = lambda u: -u.T.dot(conjugate_arg) + u.T.dot(precision).dot(u) / 2. + np.log( + 1. + 1. / (u / scaling)).sum() + grad = lambda u: -conjugate_arg + precision.dot(u) + (1. / (scaling + u) - 1. / u) + barrier_hessian = lambda u: (-1. / ((scaling + u) ** 2.) + 1. 
/ (u ** 2.)) + + current = feasible_point + current_value = np.inf + + for itercount in range(nstep): + cur_grad = grad(current) + + # make sure proposal is feasible + + count = 0 + while True: + count += 1 + proposal = current - step * cur_grad + if np.all(proposal > 0): + break + step *= 0.5 + if count >= 40: + raise ValueError('not finding a feasible point') + + # make sure proposal is a descent + + count = 0 + while True: + proposal = current - step * cur_grad + proposed_value = objective(proposal) + if proposed_value <= current_value: + break + step *= 0.5 + if count >= 20: + if not (np.isnan(proposed_value) or np.isnan(current_value)): + break + else: + raise ValueError('value is NaN: %f, %f' % (proposed_value, current_value)) + + # stop if relative decrease is small + + if np.fabs(current_value - proposed_value) < tol * np.fabs(current_value): + current = proposal + current_value = proposed_value + break + + current = proposal + current_value = proposed_value + + if itercount % 4 == 0: + step *= 2 + + hess = np.linalg.inv(precision + np.diag(barrier_hessian(current))) + return current_value, current, hess From 9011fcc59e4b160897a77cd90d4329a9f961608a Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Thu, 17 Jun 2021 09:20:56 -0400 Subject: [PATCH 098/187] fixed a sign --- selectinf/randomized/posterior_inference.py | 12 ++++++------ selectinf/randomized/query.py | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index ef2d184a5..85dc64b5e 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -109,8 +109,8 @@ def log_posterior(self, log_normalizer = -val - mean_marginal.T.dot(prec_marginal).dot(mean_marginal) / 2. - log_lik = -(((self.observed_target - target).T.dot(self._prec).dot( - self.observed_target - target)) / 2. - log_normalizer) + log_lik = -((self.observed_target - target).T.dot(self._prec).dot(self.observed_target - target)) / 2. 
\ + - log_normalizer grad_lik = self.S.T.dot(self._prec.dot(self.observed_target) - self._prec.dot(target) - self.linear_coef.T.dot( prec_marginal.dot(soln) - conjugate_marginal)) @@ -137,7 +137,7 @@ def _set_marginal_parameters(self): target_off = self.cond_mean - target_lin.dot(self.observed_target) self.linear_coef = target_lin - self.offset_coef = self.cond_mean - target_lin.dot(self.observed_target) + self.offset_coef = target_off if np.asarray(self.randomizer_prec).shape in [(), (0,)]: _prec = self.prec_target + (target_linear.T.dot(target_linear) * self.randomizer_prec) \ @@ -149,15 +149,14 @@ def _set_marginal_parameters(self): _P = target_linear.T.dot(self.randomizer_prec).dot(target_offset) _Q = np.linalg.inv(_prec + target_lin.T.dot(self.cond_precision).dot(target_lin)) - self.prec_marginal = self.cond_precision - self.cond_precision.dot(target_lin).dot(_Q).dot(target_lin.T).dot( - self.cond_precision) + self.prec_marginal = self.cond_precision - self.cond_precision.dot(target_lin).dot(_Q).dot(target_lin.T).dot(self.cond_precision) r = np.linalg.inv(_prec).dot(target_lin.T.dot(self.cond_precision).dot(target_off) - _P) S = np.linalg.inv(_prec).dot(self.prec_target) self.r = r self.S = S - # print("check parameters for selected+lasso ", np.allclose(np.diag(S), np.ones(S.shape[0])), np.allclose(r, np.zeros(r.shape[0]))) + #print("check parameters for selected+lasso ", np.allclose(np.diag(S), np.ones(S.shape[0])), np.allclose(r, np.zeros(r.shape[0]))) self._prec = _prec @@ -185,6 +184,7 @@ def langevin_sampler(selective_posterior, for i, sample in enumerate(sampler): sampler.scaling = np.sqrt(selective_posterior.dispersion) samples[i, :] = sample.copy() + #print("sample ", i, samples[i,:]) if i == nsample - 1: break diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index 05afbcd8e..97f9b3e70 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -15,7 +15,7 @@ from .posterior_inference import posterior from .selective_MLE_utils import solve_barrier_affine as solve_barrier_affine_C from .approx_reference import approximate_grid_inference - +from ..algorithms.barrier_affine import solve_barrier_affine_py class query(object): r""" @@ -1433,7 +1433,7 @@ def selective_MLE(observed_target, if useC: solver = solve_barrier_affine_C else: - solver = _solve_barrier_affine_py + solver = solve_barrier_affine_py val, soln, hess = solver(conjugate_arg, prec_opt, From 9e47c0587483d143148f06f80f69f0d26fc02ed0 Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Sun, 27 Jun 2021 14:14:01 -0400 Subject: [PATCH 099/187] update to test --- selectinf/randomized/query.py | 1 + selectinf/randomized/tests/test_exact_reference.py | 9 ++++----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index 97f9b3e70..8d6fb2da8 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -135,6 +135,7 @@ def log_density(logdens_linear, offset, cond_prec, opt, score): opt_offset, cond_precision) + _, randomizer_prec = self.randomizer.cov_prec self.cond_mean, self.cond_cov, self.randomizer_prec = cond_mean, cond_cov, randomizer_prec diff --git a/selectinf/randomized/tests/test_exact_reference.py b/selectinf/randomized/tests/test_exact_reference.py index ddeb6cee4..7cb49ff11 100644 --- a/selectinf/randomized/tests/test_exact_reference.py +++ b/selectinf/randomized/tests/test_exact_reference.py @@ -43,8 +43,7 @@ def test_approx_pivot(n=500, conv = const(X, Y, W, - 
ridge_term=0.) - #randomizer_scale=randomizer_scale * np.sqrt(dispersion)) + randomizer_scale=randomizer_scale * np.sqrt(dispersion)) signs = conv.fit() nonzero = signs != 0 @@ -82,11 +81,11 @@ def main(nsim=300): for i in range(nsim): _pivot.extend(test_approx_pivot(n=100, p=400, - signal_fac=0.5, + signal_fac=1., s=0, - sigma=1., + sigma=2., rho=0.30, - randomizer_scale=1., + randomizer_scale=0.7, equicorrelated=True, useIP=False)) From 05d08e959c7c170167b07c6c0f6720b6d7aa7861 Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Sun, 27 Jun 2021 23:17:52 -0400 Subject: [PATCH 100/187] modified mle and reference code for group lasso --- .../randomized/approx_reference_grouplasso.py | 264 +++++++++++++----- .../tests/test_approx_reference_grouplasso.py | 79 +++--- 2 files changed, 232 insertions(+), 111 deletions(-) diff --git a/selectinf/randomized/approx_reference_grouplasso.py b/selectinf/randomized/approx_reference_grouplasso.py index f028fcbe3..d4c0decdd 100644 --- a/selectinf/randomized/approx_reference_grouplasso.py +++ b/selectinf/randomized/approx_reference_grouplasso.py @@ -12,7 +12,7 @@ import regreg.api as rr from .randomization import randomization from ..base import restricted_estimator -from .query import _solve_barrier_affine_py +from ..algorithms.barrier_affine import solve_barrier_affine_py as solver from ..distributions.discrete_family import discrete_family class group_lasso(object): @@ -75,6 +75,8 @@ def fit(self, tol = 1.e-20 + _, self.randomizer_prec = self.randomizer.cov_prec + # now we are collecting the directions and norms of the active groups for g in sorted(np.unique(self.groups)): # g is group label @@ -175,8 +177,6 @@ def compute_Lg(g): self.linear_part = -np.eye(self.observed_opt_state.shape[0]) self.offset = np.zeros(self.observed_opt_state.shape[0]) - # print("K.K.T. 
map", np.allclose(self._initial_omega, self.observed_score_state + self.opt_linear.dot(self.observed_opt_state) - # + self.opt_offset, rtol=1e-03)) return active_signs, soln def _solve_randomized_problem(self, @@ -302,15 +302,32 @@ def selective_MLE(self, observed_target = np.atleast_1d(observed_target) prec_target = inv(target_cov) + prec_opt = self.cond_precision + + score_offset = self.observed_score_state + self.opt_offset + # target_lin determines how the conditional mean of optimization variables # vary with target # logdens_linear determines how the argument of the optimization density # depends on the score, not how the mean depends on score, hence the minus sign - target_lin = - logdens_linear.dot(target_score_cov.T.dot(prec_target)) - target_offset = cond_mean - target_lin.dot(observed_target) + target_linear = target_score_cov.T.dot(prec_target) + target_offset = score_offset - target_linear.dot(observed_target) - prec_opt = self.cond_precision + target_lin = - logdens_linear.dot(target_linear) + target_off = cond_mean - target_lin.dot(observed_target) + + if np.asarray(self.randomizer_prec).shape in [(), (0,)]: + _P = target_linear.T.dot(target_offset) * self.randomizer_prec + _prec = prec_target + (target_linear.T.dot(target_linear) * self.randomizer_prec) - target_lin.T.dot( + prec_opt).dot( + target_lin) + else: + _P = target_linear.T.dot(self.randomizer_prec).dot(target_offset) + _prec = prec_target + (target_linear.T.dot(self.randomizer_prec).dot(target_linear)) - target_lin.T.dot( + prec_opt).dot(target_lin) + + C = target_cov.dot(_P - target_lin.T.dot(prec_opt).dot(target_off)) conjugate_arg = prec_opt.dot(cond_mean) @@ -324,23 +341,32 @@ def selective_MLE(self, useJacobian, **solve_args) - log_ref = val + conjugate_arg.T.dot(cond_cov).dot(conjugate_arg) / 2. + final_estimator = target_cov.dot(_prec).dot(observed_target) \ + + target_cov.dot(target_lin.T.dot(prec_opt.dot(cond_mean - soln))) + C + + unbiased_estimator = target_cov.dot(_prec).dot(observed_target) + target_cov.dot( + _P - target_lin.T.dot(prec_opt).dot(target_off)) - final_estimator = observed_target + target_cov.dot(target_lin.T.dot(prec_opt.dot(cond_mean - soln))) - ind_unbiased_estimator = observed_target + target_cov.dot(target_lin.T.dot(prec_opt.dot(cond_mean - - init_soln))) L = target_lin.T.dot(prec_opt) - observed_info_natural = prec_target + L.dot(target_lin) - L.dot(hess.dot(L.T)) + observed_info_natural = _prec + L.dot(target_lin) - L.dot(hess.dot(L.T)) + observed_info_mean = target_cov.dot(observed_info_natural.dot(target_cov)) Z_scores = final_estimator / np.sqrt(np.diag(observed_info_mean)) + pvalues = ndist.cdf(Z_scores) + pvalues = 2 * np.minimum(pvalues, 1 - pvalues) - alpha = 1. - level + alpha = 1 - level quantile = ndist.ppf(1 - alpha / 2.) - intervals = np.vstack([final_estimator - quantile * np.sqrt(np.diag(observed_info_mean)), - final_estimator + quantile * np.sqrt(np.diag(observed_info_mean))]).T + + intervals = np.vstack([final_estimator - + quantile * np.sqrt(np.diag(observed_info_mean)), + final_estimator + + quantile * np.sqrt(np.diag(observed_info_mean))]).T + + log_ref = val + conjugate_arg.T.dot(cond_cov).dot(conjugate_arg) / 2. 
result = pd.DataFrame({'MLE': final_estimator, 'SE': np.sqrt(np.diag(observed_info_mean)), @@ -348,7 +374,7 @@ def selective_MLE(self, 'pvalue': pvalues, 'lower_confidence': intervals[:, 0], 'upper_confidence': intervals[:, 1], - 'unbiased': ind_unbiased_estimator}) + 'unbiased': unbiased_estimator}) return result, observed_info_mean, log_ref @@ -383,7 +409,8 @@ class approximate_grid_inference(object): def __init__(self, query, dispersion, - solve_args={'tol': 1.e-12}): + solve_args={'tol': 1.e-12}, + useIP=True): """ Produce p-values and confidence intervals for targets @@ -407,12 +434,6 @@ def __init__(self, result, inverse_info = query.selective_MLE(dispersion=dispersion)[:2] - (observed_target, target_cov, target_score_cov, alternatives) = query.selected_targets(dispersion) - - self.observed_target = observed_target - self.target_score_cov = target_score_cov - self.target_cov = target_cov - self.linear_part = query.linear_part self.offset = query.offset @@ -423,17 +444,37 @@ def __init__(self, self.C = query.C self.active_dirs = query.active_dirs + (observed_target, target_cov, target_score_cov, alternatives) = query.selected_targets(dispersion) + self.observed_target = observed_target + self.target_score_cov = target_score_cov + self.target_cov = target_cov + self.init_soln = query.observed_opt_state + self.randomizer_prec = query.randomizer_prec + self.score_offset = query.observed_score_state + query.opt_offset + self.ntarget = ntarget = target_cov.shape[0] _scale = 4 * np.sqrt(np.diag(inverse_info)) - ngrid = 40 - self.stat_grid = np.zeros((ntarget, ngrid)) - for j in range(ntarget): - self.stat_grid[j, :] = np.linspace(observed_target[j] - 1.5 * _scale[j], - observed_target[j] + 1.5 * _scale[j], - num=ngrid) + if useIP == False: + ngrid = 1000 + self.stat_grid = np.zeros((ntarget, ngrid)) + for j in range(ntarget): + self.stat_grid[j, :] = np.linspace(observed_target[j] - 1.5 * _scale[j], + observed_target[j] + 1.5 * _scale[j], + num=ngrid) + else: + ngrid = 100 + self.stat_grid = np.zeros((ntarget, ngrid)) + for j in range(ntarget): + self.stat_grid[j, :] = np.linspace(observed_target[j] - 1.5 * _scale[j], + observed_target[j] + 1.5 * _scale[j], + num=ngrid) + + self.opt_linear = query.opt_linear + self.useIP = useIP + def summary(self, alternatives=None, parameter=None, @@ -453,7 +494,7 @@ def summary(self, """ if parameter is not None: - pivots = self.approx_pivots(parameter, + pivots = self._approx_pivots(parameter, alternatives=alternatives) else: pivots = None @@ -473,15 +514,16 @@ def summary(self, return result - def _approx_log_reference(self, - observed_target, - target_cov, - target_score_cov, - grid): + def log_reference(self, + observed_target, + target_cov, + target_score_cov, + grid): """ Approximate the log of the reference density on a grid. 
""" + if np.asarray(observed_target).shape in [(), (0,)]: raise ValueError('no target specified') @@ -489,58 +531,88 @@ def _approx_log_reference(self, target_lin = - self.logdens_linear.dot(target_score_cov.T.dot(prec_target)) ref_hat = [] - solver = _solve_barrier_affine_py for k in range(grid.shape[0]): + # in the usual D = N + Gamma theta.hat, + # target_lin is "something" times Gamma, + # where "something" comes from implied Gaussian + # cond_mean is "something" times D + # Gamma is target_score_cov.T.dot(prec_target) + + num_opt = self.prec_opt.shape[0] + num_con = self.linear_part.shape[0] cond_mean_grid = (target_lin.dot(np.atleast_1d(grid[k] - observed_target)) + self.cond_mean) - conjugate_arg = self.prec_opt.dot(cond_mean_grid) - val, soln, _ = solver(conjugate_arg, - self.prec_opt, - self.init_soln, - self.linear_part, - self.offset, - **self.solve_args) + #direction for decomposing o - log_jacob = jacobian_grad_hess(soln, self.C, self.active_dirs) + eta = -self.prec_opt.dot(self.logdens_linear.dot(target_score_cov.T)) - ref_hat.append(-val - (conjugate_arg.T.dot(self.cond_cov).dot(conjugate_arg) / 2.) + log_jacob[0]) + implied_mean = np.asscalar(eta.T.dot(cond_mean_grid)) + implied_cov = np.asscalar(eta.T.dot(self.cond_cov).dot(eta)) + implied_prec = 1./implied_cov + + _A = self.cond_cov.dot(eta) * implied_prec + R = np.identity(num_opt) - _A.dot(eta.T) + + A = self.linear_part.dot(_A).reshape((-1,)) + b = self.offset-self.linear_part.dot(R).dot(self.init_soln) + + conjugate_arg = implied_mean * implied_prec + + val, soln, _ = solver(np.asarray([conjugate_arg]), + np.reshape(implied_prec, (1,1)), + eta.T.dot(self.init_soln), + A.reshape((A.shape[0],1)), + b, + **self.solve_args) + + gamma_ = _A.dot(soln) + R.dot(self.init_soln) + log_jacob = jacobian_grad_hess(gamma_, self.C, self.active_dirs) + + ref_hat.append(-val - ((conjugate_arg ** 2) * implied_cov)/ 2. + log_jacob[0]) return np.asarray(ref_hat) def _construct_families(self): + self._construct_density() + self._families = [] + for m in range(self.ntarget): p = self.target_score_cov.shape[1] observed_target_uni = (self.observed_target[m]).reshape((1,)) + target_cov_uni = (np.diag(self.target_cov)[m]).reshape((1, 1)) - var_target = target_cov_uni[0, 0] target_score_cov_uni = self.target_score_cov[m, :].reshape((1, p)) - approx_log_ref = self._approx_log_reference(observed_target_uni, - target_cov_uni, - target_score_cov_uni, - self.stat_grid[m]) - - approx_fn = interp1d(self.stat_grid[m], - approx_log_ref, - kind='quadratic', - bounds_error=False, - fill_value='extrapolate') - - grid = np.linspace(self.stat_grid[m].min(), self.stat_grid[m].max(), 1000) - logW = (approx_fn(grid) - - 0.5 * (grid - self.observed_target[m]) ** 2 / var_target) - logW -= logW.max() - - # construction of families follows `selectinf.learning.core` + var_target = 1. 
/ ((self.precs[m])[0, 0]) + + log_ref = self.log_reference(observed_target_uni, + target_cov_uni, + target_score_cov_uni, + self.stat_grid[m]) + if self.useIP == False: + logW = (log_ref - 0.5 * (self.stat_grid[m] - self.observed_target[m]) ** 2 / var_target) + logW -= logW.max() + self._families.append(discrete_family(self.stat_grid[m], + np.exp(logW))) + else: + approx_fn = interp1d(self.stat_grid[m], + log_ref, + kind='quadratic', + bounds_error=False, + fill_value='extrapolate') - self._families.append(discrete_family(grid, - np.exp(logW))) + grid = np.linspace(self.stat_grid[m].min(), self.stat_grid[m].max(), 1000) + logW = (approx_fn(grid) - + 0.5 * (grid - self.observed_target[m]) ** 2 / var_target) + logW -= logW.max() + self._families.append(discrete_family(grid, + np.exp(logW))) def _approx_pivots(self, mean_parameter, @@ -555,15 +627,15 @@ def _approx_pivots(self, pivot = [] for m in range(self.ntarget): - print("variable computed ", m) + family = self._families[m] - observed_target = self.observed_target[m] - var_target = self.target_cov[m, m] + var_target = 1. / ((self.precs[m])[0, 0]) - # construction of pivot from families follows `selectinf.learning.core` + mean = self.S[m].dot(mean_parameter[m].reshape((1,))) + self.r[m] + + _cdf = family.cdf((mean[0] - self.observed_target[m]) / var_target, x=self.observed_target[m]) + print("variable completed ", m) - _cdf = family.cdf((mean_parameter[m] - observed_target) / var_target, - x=observed_target) if alternatives[m] == 'twosided': pivot.append(2 * min(_cdf, 1 - _cdf)) elif alternatives[m] == 'greater': @@ -575,7 +647,7 @@ def _approx_pivots(self, return pivot def _approx_intervals(self, - level=0.9): + level=0.9): if not hasattr(self, "_families"): self._construct_families() @@ -586,14 +658,54 @@ def _approx_intervals(self, # construction of intervals from families follows `selectinf.learning.core` family = self._families[m] observed_target = self.observed_target[m] + l, u = family.equal_tailed_interval(observed_target, alpha=1 - level) - var_target = self.target_cov[m, m] + + var_target = 1. / ((self.precs[m])[0, 0]) + lower.append(l * var_target + observed_target) upper.append(u * var_target + observed_target) return np.asarray(lower), np.asarray(upper) + ### Private method + def _construct_density(self): + + precs = {} + S = {} + r = {} + + p = self.target_score_cov.shape[1] + + for m in range(self.ntarget): + observed_target_uni = (self.observed_target[m]).reshape((1,)) + target_cov_uni = (np.diag(self.target_cov)[m]).reshape((1, 1)) + prec_target = 1. / target_cov_uni + target_score_cov_uni = self.target_score_cov[m, :].reshape((1, p)) + + target_linear = target_score_cov_uni.T.dot(prec_target) + target_offset = (self.score_offset - target_linear.dot(observed_target_uni)).reshape( + (target_linear.shape[0],)) + + target_lin = -self.logdens_linear.dot(target_linear) + target_off = (self.cond_mean - target_lin.dot(observed_target_uni)).reshape((target_lin.shape[0],)) + + _prec = prec_target + (target_linear.T.dot(target_linear) * self.randomizer_prec) - target_lin.T.dot( + self.prec_opt).dot(target_lin) + + _P = target_linear.T.dot(target_offset) * self.randomizer_prec + _r = (1. 
/ _prec).dot(target_lin.T.dot(self.prec_opt).dot(target_off) - _P) + _S = np.linalg.inv(_prec).dot(prec_target) + + S[m] = _S + r[m] = _r + precs[m] = _prec + + self.precs = precs + self.S = S + self.r = r + def solve_barrier_affine_jacobian_py(conjugate_arg, precision, @@ -719,13 +831,15 @@ def jacobian_grad_hess(gamma, C, active_dirs): GammaMinus = calc_GammaMinus(gamma, active_dirs) # eigendecomposition - evalues, evectors = eig(GammaMinus + C) + #evalues, evectors = eig(GammaMinus + C) # log Jacobian - J = log(evalues).sum() + #J = log(evalues).sum() + J = np.log(np.linalg.det(GammaMinus + C)) # inverse - GpC_inv = evectors.dot(np.diag(1 / evalues).dot(evectors.T)) + #GpC_inv = evectors.dot(np.diag(1 / evalues).dot(evectors.T)) + GpC_inv = np.linalg.inv(GammaMinus + C) # summing matrix (gamma.size by C.shape[0]) S = block_diag(*[np.ones((1, ug.size - 1)) for ug in active_dirs.values()]) diff --git a/selectinf/randomized/tests/test_approx_reference_grouplasso.py b/selectinf/randomized/tests/test_approx_reference_grouplasso.py index 0b4f53474..5228a410a 100644 --- a/selectinf/randomized/tests/test_approx_reference_grouplasso.py +++ b/selectinf/randomized/tests/test_approx_reference_grouplasso.py @@ -11,54 +11,61 @@ def test_approx_pivot(n=500, sigma=3., rho=0.3, randomizer_scale=1, - weight_frac=1.2): + weight_frac=1.5): - inst, const = gaussian_group_instance, group_lasso.gaussian - signal = np.sqrt(signal_fac * 2 * np.log(p)) + while True: - X, Y, beta = inst(n=n, - p=p, - signal=signal, - sgroup=sgroup, - groups=groups, - equicorrelated=False, - rho=rho, - sigma=sigma, - random_signs=True)[:3] + inst, const = gaussian_group_instance, group_lasso.gaussian + signal = np.sqrt(signal_fac * 2 * np.log(p)) - n, p = X.shape + X, Y, beta = inst(n=n, + p=p, + signal=signal, + sgroup=sgroup, + groups=groups, + equicorrelated=False, + rho=rho, + sigma=sigma, + random_signs=True)[:3] - sigma_ = np.std(Y) - dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) + n, p = X.shape - penalty_weights = dict([(i, weight_frac * sigma_ * np.sqrt(2 * np.log(p))) for i in np.unique(groups)]) + sigma_ = np.std(Y) - conv = const(X, - Y, - groups, - penalty_weights, - randomizer_scale=randomizer_scale * dispersion) + if n > (2 * p): + dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) + else: + dispersion = sigma_ ** 2 - signs, _ = conv.fit() - nonzero = signs != 0 - print("number of selected variables ", nonzero.sum()) + penalty_weights = dict([(i, weight_frac * sigma_ * np.sqrt(2 * np.log(p))) for i in np.unique(groups)]) - if nonzero.sum()>0: + conv = const(X, + Y, + groups, + penalty_weights, + randomizer_scale=randomizer_scale * np.sqrt(dispersion)) - conv._setup_implied_gaussian() + signs, _ = conv.fit() + nonzero = signs != 0 + print("number of selected variables ", nonzero.sum()) - beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) + if nonzero.sum() > 0: + conv._setup_implied_gaussian() - approximate_grid_inf = approximate_grid_inference(conv, - dispersion) + beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) - pivot = approximate_grid_inf._approx_pivots(beta_target) + approximate_grid_inf = approximate_grid_inference(conv, + dispersion) - return pivot + pivot = approximate_grid_inf._approx_pivots(beta_target) + + return pivot def main(nsim=300, CI = False): + import matplotlib as mpl + mpl.use('tkagg') import matplotlib.pyplot as plt from statsmodels.distributions.empirical_distribution import ECDF if CI is False: @@ -66,13 
+73,13 @@ def main(nsim=300, CI = False): for i in range(nsim): _pivot.extend(test_approx_pivot(n=500, p=100, - signal_fac=0.3, - sgroup=3, - groups=np.arange(20).repeat(5), - sigma=1., + signal_fac=1., + sgroup=0, + groups=np.arange(25).repeat(4), + sigma=2., rho=0.20, randomizer_scale=0.5, - weight_frac=1.)) + weight_frac=1.2)) print("iteration completed ", i) From ccfeb0fb5364e4a7984def456c6e2e576e15fbb2 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Tue, 6 Jul 2021 11:13:04 -0700 Subject: [PATCH 101/187] approx reference test --- selectinf/randomized/tests/test_approx_reference.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/selectinf/randomized/tests/test_approx_reference.py b/selectinf/randomized/tests/test_approx_reference.py index b1bb3c8fc..bbfe4b719 100644 --- a/selectinf/randomized/tests/test_approx_reference.py +++ b/selectinf/randomized/tests/test_approx_reference.py @@ -181,16 +181,11 @@ def test_approx_ci(n=500, scale_ = np.max(_scale) ngrid = int(2 * scale_/0.1) - approximate_grid_inf = approximate_grid_inference(observed_target, + approximate_grid_inf = approximate_grid_inference(conv, + observed_target, cov_target, cov_target_score, - inverse_info, - conv.observed_opt_state, - conv.sampler.affine_con.mean, - conv.sampler.affine_con.covariance, - conv.sampler.logdens_transform[0], - conv.sampler.affine_con.linear_part, - conv.sampler.affine_con.offset) + useIP=False) lci, uci = approximate_grid_inf._approx_intervals(level) From d753a92b006bf62fa913dd7e20a365bbf013cd99 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Tue, 6 Jul 2021 11:26:06 -0700 Subject: [PATCH 102/187] removing unused code --- selectinf/randomized/query.py | 119 ---------------------------------- 1 file changed, 119 deletions(-) diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index aefe70698..592065367 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -1433,123 +1433,4 @@ def selective_MLE(observed_target, return result, observed_info_mean, log_ref -def normalizing_constant(target_parameter, - observed_target, - target_cov, - target_score_cov, - feasible_point, - cond_mean, - cond_cov, - logdens_linear, - linear_part, - offset, - useC=False): - """ - Approximation of normalizing constant - in affine constrained Gaussian. - Parameters - ---------- - observed_target : ndarray - Observed estimate of target. - target_cov : ndarray - Estimated covaraince of target. - - target_score_cov : ndarray - Estimated covariance of target and score of randomized query. - - init_soln : ndarray - Feasible point for optimization problem. - cond_mean : ndarray - Conditional mean of optimization variables given target. - cond_cov : ndarray - Conditional covariance of optimization variables given target. - - logdens_linear : ndarray - Describes how conditional mean of optimization - variables varies with target. - - linear_part : ndarray - Linear part of affine constraints: $\{o:Ao \leq b\}$ - offset : ndarray - Offset part of affine constraints: $\{o:Ao \leq b\}$ - solve_args : dict, optional - Arguments passed to solver. - level : float, optional - Confidence level. - useC : bool, optional - Use python or C solver. 
- """ - - target_parameter = np.atleast_1d(target_parameter) - - cond_precision = np.linalg.inv(cond_cov) - prec_target = np.linalg.inv(target_cov) - target_linear = -logdens_linear.dot(target_score_cov.dot(prec_target)) - nuisance_correction = target_linear.dot(observed_target) - corrected_mean = cond_mean - nuisance_correction - - # rest of the objective is the target mahalanobis distance - # plus the mahalanobis distance for optimization variables - # this includes a term linear in the target, i.e. - # the source of `target_linear` - - ntarget = target_cov.shape[0] - nopt = cond_cov.shape[0] - full_Q = np.zeros((ntarget + nopt, - ntarget + nopt)) - full_Q[:ntarget][:, :ntarget] = (prec_target + target_linear.T.dot(cond_precision.dot(target_linear))) - full_Q[:ntarget][:, ntarget:] = -target_linear.dot(cond_precision) - full_Q[ntarget:][:, :ntarget] = (-target_linear.dot(cond_precision)).T - full_Q[ntarget:][:, ntarget:] = cond_precision - - linear_term = np.hstack([-prec_target.dot(target_parameter) + - corrected_mean.dot(cond_precision).dot(target_linear), - -cond_precision.dot(corrected_mean)]) - - constant_term = 0.5 * (np.sum(target_parameter * prec_target.dot(target_parameter)) + - np.sum(corrected_mean * cond_precision.dot(corrected_mean))) - - full_con_linear = np.zeros((linear_part.shape[0], - ntarget + nopt)) - full_con_linear[:, ntarget:] = linear_part - full_feasible = np.zeros(ntarget + nopt) - full_feasible[ntarget:] = feasible_point - - solve_args = {'tol': 1.e-12} - - if useC: - solver = solve_barrier_affine_C - else: - solver = _solve_barrier_affine_py - - value, soln, hess = solver(-linear_term, - full_Q, - full_feasible, - full_con_linear, - offset, - **solve_args) - return (-value + 0.5 * np.sum(target_parameter * prec_target.dot(target_parameter)), - soln[:ntarget], - hess[:ntarget][:, :ntarget]) - - -def _bisect(f, lb, ub, min_iter=20, max_iter=100, tol=1.e-3): - while True: - sign_l = np.sign(f(lb)) - sign_u = np.sign(f(ub)) - mid = 0.5 * (lb + ub) - f_mid = f(mid) - if sign_l == 1: - if f_mid > 0: # we should move closer to upper - lb = mid - else: - ub = mid - else: - if f_mid > 0: # we should move closer to lower - ub = mid - else: - lb = mid - if np.fabs(f_mid) < tol: - break - return mid From 13994bfabb3b05b9df988a1d2e217865c75e446d Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Mon, 12 Jul 2021 13:04:11 -0700 Subject: [PATCH 103/187] renaming logdens_linear --- selectinf/randomized/approx_reference.py | 8 +-- .../randomized/approx_reference_grouplasso.py | 26 ++++---- selectinf/randomized/exact_reference.py | 8 +-- selectinf/randomized/group_lasso.py | 8 +-- selectinf/randomized/lasso.py | 10 +-- selectinf/randomized/posterior_inference.py | 6 +- selectinf/randomized/query.py | 64 ++++++++++--------- 7 files changed, 66 insertions(+), 64 deletions(-) diff --git a/selectinf/randomized/approx_reference.py b/selectinf/randomized/approx_reference.py index 4c14dfad8..ee8d81391 100644 --- a/selectinf/randomized/approx_reference.py +++ b/selectinf/randomized/approx_reference.py @@ -45,7 +45,7 @@ def __init__(self, self.linear_part = query.sampler.affine_con.linear_part self.offset = query.sampler.affine_con.offset - self.logdens_linear = query.sampler.logdens_transform[0] + self.regress_opt = query.sampler.logdens_transform[0] self.cond_mean = query.cond_mean self.prec_opt = np.linalg.inv(query.cond_cov) self.cond_cov = query.cond_cov @@ -132,7 +132,7 @@ def _approx_log_reference(self, raise ValueError('no target specified') prec_target = 
np.linalg.inv(target_cov) - target_lin = - self.logdens_linear.dot(target_score_cov.T.dot(prec_target)) + target_lin = self.regress_opt.dot(target_score_cov.T.dot(prec_target)) ref_hat = [] solver = solve_barrier_affine_py @@ -290,7 +290,7 @@ def _construct_density(self): target_offset = (self.score_offset - target_linear.dot(observed_target_uni)).reshape( (target_linear.shape[0],)) - target_lin = -self.logdens_linear.dot(target_linear) + target_lin = self.regress_opt.dot(target_linear) target_off = (self.cond_mean - target_lin.dot(observed_target_uni)).reshape((target_lin.shape[0],)) _prec = prec_target + (target_linear.T.dot(target_linear) * self.randomizer_prec) - target_lin.T.dot( @@ -306,4 +306,4 @@ def _construct_density(self): self.precs = precs self.S = S - self.r = r \ No newline at end of file + self.r = r diff --git a/selectinf/randomized/approx_reference_grouplasso.py b/selectinf/randomized/approx_reference_grouplasso.py index d4c0decdd..c478d8f45 100644 --- a/selectinf/randomized/approx_reference_grouplasso.py +++ b/selectinf/randomized/approx_reference_grouplasso.py @@ -244,19 +244,19 @@ def _setup_implied_gaussian(self): if np.asarray(prec).shape in [(), (0,)]: cond_precision = self.opt_linear.T.dot(self.opt_linear) * prec cond_cov = inv(cond_precision) - logdens_linear = cond_cov.dot(self.opt_linear.T) * prec + regress_opt = -cond_cov.dot(self.opt_linear.T) * prec else: cond_precision = self.opt_linear.T.dot(prec.dot(self.opt_linear)) cond_cov = inv(cond_precision) - logdens_linear = cond_cov.dot(self.opt_linear.T).dot(prec) + regress_opt = -cond_cov.dot(self.opt_linear.T).dot(prec) - cond_mean = -logdens_linear.dot(self.observed_score_state + self.opt_offset) + cond_mean = regress_opt.dot(self.observed_score_state + self.opt_offset) self.cond_mean = cond_mean self.cond_cov = cond_cov self.cond_precision = cond_precision - self.logdens_linear = logdens_linear + self.regress_opt = regress_opt - return cond_mean, cond_cov, cond_precision, logdens_linear + return cond_mean, cond_cov, cond_precision, regress_opt def selective_MLE(self, solve_args={'tol': 1.e-12}, @@ -277,7 +277,7 @@ def selective_MLE(self, init_soln: (opt_state) initial (observed) value of optimization variables cond_mean: conditional mean of optimization variables (model on _setup_implied_gaussian) cond_cov: conditional variance of optimization variables (model on _setup_implied_gaussian) - logdens_linear: (model on _setup_implied_gaussian) + regress_opt: (model on _setup_implied_gaussian) linear_part: like A_scaling (from lasso) offset: like b_scaling (from lasso) solve_args: passed on to solver @@ -292,7 +292,7 @@ def selective_MLE(self, init_soln = self.observed_opt_state # just the gammas cond_mean = self.cond_mean cond_cov = self.cond_cov - logdens_linear = self.logdens_linear + regress_opt = self.regress_opt linear_part = self.linear_part offset = self.offset @@ -308,13 +308,13 @@ def selective_MLE(self, # target_lin determines how the conditional mean of optimization variables # vary with target - # logdens_linear determines how the argument of the optimization density + # regress_opt determines how the argument of the optimization density # depends on the score, not how the mean depends on score, hence the minus sign target_linear = target_score_cov.T.dot(prec_target) target_offset = score_offset - target_linear.dot(observed_target) - target_lin = - logdens_linear.dot(target_linear) + target_lin = regress_opt.dot(target_linear) target_off = cond_mean - target_lin.dot(observed_target) if 
np.asarray(self.randomizer_prec).shape in [(), (0,)]: @@ -437,7 +437,7 @@ def __init__(self, self.linear_part = query.linear_part self.offset = query.offset - self.logdens_linear = query.logdens_linear + self.regress_opt = query.regress_opt self.cond_mean = query.cond_mean self.prec_opt = np.linalg.inv(query.cond_cov) self.cond_cov = query.cond_cov @@ -528,7 +528,7 @@ def log_reference(self, raise ValueError('no target specified') prec_target = np.linalg.inv(target_cov) - target_lin = - self.logdens_linear.dot(target_score_cov.T.dot(prec_target)) + target_lin = self.regress_opt.dot(target_score_cov.T.dot(prec_target)) ref_hat = [] @@ -547,7 +547,7 @@ def log_reference(self, #direction for decomposing o - eta = -self.prec_opt.dot(self.logdens_linear.dot(target_score_cov.T)) + eta = self.prec_opt.dot(self.regress_opt.dot(target_score_cov.T)) implied_mean = np.asscalar(eta.T.dot(cond_mean_grid)) implied_cov = np.asscalar(eta.T.dot(self.cond_cov).dot(eta)) @@ -688,7 +688,7 @@ def _construct_density(self): target_offset = (self.score_offset - target_linear.dot(observed_target_uni)).reshape( (target_linear.shape[0],)) - target_lin = -self.logdens_linear.dot(target_linear) + target_lin = self.regress_opt.dot(target_linear) target_off = (self.cond_mean - target_lin.dot(observed_target_uni)).reshape((target_lin.shape[0],)) _prec = prec_target + (target_linear.T.dot(target_linear) * self.randomizer_prec) - target_lin.T.dot( diff --git a/selectinf/randomized/exact_reference.py b/selectinf/randomized/exact_reference.py index 80169a9a0..9ca4ebe05 100644 --- a/selectinf/randomized/exact_reference.py +++ b/selectinf/randomized/exact_reference.py @@ -42,7 +42,7 @@ def __init__(self, self.linear_part = query.sampler.affine_con.linear_part self.offset = query.sampler.affine_con.offset - self.logdens_linear = query.sampler.logdens_transform[0] + self.regress_opt = query.sampler.logdens_transform[0] self.cond_mean = query.cond_mean self.prec_opt = np.linalg.inv(query.cond_cov) self.cond_cov = query.cond_cov @@ -126,7 +126,7 @@ def log_reference(self, raise ValueError('no target specified') prec_target = np.linalg.inv(target_cov) - target_lin = - self.logdens_linear.dot(target_score_cov.T.dot(prec_target)) + target_lin = self.regress_opt.dot(target_score_cov.T.dot(prec_target)) ref_hat = [] @@ -145,7 +145,7 @@ def log_reference(self, #direction for decomposing o - eta = -self.prec_opt.dot(self.logdens_linear.dot(target_score_cov.T)) + eta = self.prec_opt.dot(self.regress_opt.dot(target_score_cov.T)) implied_mean = np.asscalar(eta.T.dot(cond_mean_grid)) implied_cov = np.asscalar(eta.T.dot(self.cond_cov).dot(eta)) @@ -302,7 +302,7 @@ def _construct_density(self): target_offset = (self.score_offset - target_linear.dot(observed_target_uni)).reshape( (target_linear.shape[0],)) - target_lin = -self.logdens_linear.dot(target_linear) + target_lin = self.regress_opt.dot(target_linear) target_off = (self.cond_mean - target_lin.dot(observed_target_uni)).reshape((target_lin.shape[0],)) _prec = prec_target + (target_linear.T.dot(target_linear) * self.randomizer_prec) - target_lin.T.dot( diff --git a/selectinf/randomized/group_lasso.py b/selectinf/randomized/group_lasso.py index 595651bee..09c239df0 100644 --- a/selectinf/randomized/group_lasso.py +++ b/selectinf/randomized/group_lasso.py @@ -201,7 +201,7 @@ def fit(self, dispersion = 1. 
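# --- editor's sketch (not part of the patch): the logdens_linear -> regress_opt
# renaming in these commits flips a sign, regress_opt == -logdens_linear, so the
# conditional mean of the optimization variables reads as a regression onto the
# offset score. Shapes and values below are assumed purely for illustration.
import numpy as np

rng = np.random.default_rng(0)
opt_linear = rng.standard_normal((3, 2))      # p=3 score coords, 2 opt variables
prec = 1.3                                    # scalar randomizer precision (assumed)
score, subgrad = rng.standard_normal(3), rng.standard_normal(3)

cond_precision = opt_linear.T @ opt_linear * prec
cond_cov = np.linalg.inv(cond_precision)

logdens_linear = cond_cov @ opt_linear.T * prec     # old name, old sign
regress_opt = -cond_cov @ opt_linear.T * prec       # new name used in these commits

old_cond_mean = -logdens_linear @ (score + subgrad)
new_cond_mean = regress_opt @ (score + subgrad)
assert np.allclose(old_cond_mean, new_cond_mean)    # same implied Gaussian mean
# --- end sketch ---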
(prec_opt_linear, - logdens_linear) = self._get_precision_opt_linear(opt_linear, + regress_opt) = self._get_precision_opt_linear(opt_linear, ordered_vars, dispersion) @@ -231,7 +231,7 @@ def fit(self, self.observed_score_state, log_cond_density, log_det, - (np.atleast_2d(logdens_linear.T[:,idx_g].dot(dir_g).T), + (np.atleast_2d(regress_opt.T[:,idx_g].dot(dir_g).T), opt_offset)) self._samplers[group] = sampler @@ -375,9 +375,9 @@ def _get_precision_opt_linear(self, opt_linear, variables, dispersion=1): cond_precision = opt_linear.T.dot(value) cond_cov = np.linalg.inv(cond_precision) - logdens_linear = cond_cov.dot(value.T) * dispersion # is this last dispersion correct? + regress_opt = -cond_cov.dot(value.T) * dispersion # is this last dispersion correct? - return value, logdens_linear + return value, regress_opt def _solve_randomized_problem(self, perturb=None, diff --git a/selectinf/randomized/lasso.py b/selectinf/randomized/lasso.py index 8133365ce..ff41f46de 100644 --- a/selectinf/randomized/lasso.py +++ b/selectinf/randomized/lasso.py @@ -898,7 +898,7 @@ def _setup_implied_gaussian(self, # because opt_linear has shape p x E with the columns # being those non-zero columns of the solution. Above S_E = np.diag(signs) # the conditional precision is S_E Q[E][:,E] * pi / ((1 - pi) * dispersion) S_E - # and logdens_linear is Q[E][:,E]^{-1} S_E + # and regress_opt is -Q[E][:,E]^{-1} S_E # padded with zeros # to be E x p @@ -916,12 +916,12 @@ def _setup_implied_gaussian(self, assert(np.linalg.norm(cond_precision - cond_precision.T) / np.linalg.norm(cond_precision) < 1.e-6) cond_cov = np.linalg.inv(cond_precision) - logdens_linear = np.zeros((len(ordered_vars), + regress_opt = np.zeros((len(ordered_vars), self.nfeature)) - logdens_linear[:, ordered_vars] = cond_cov * signs[None, :] / (dispersion * ratio) - cond_mean = -logdens_linear.dot(self.observed_score_state + opt_offset) + regress_opt[:, ordered_vars] = -cond_cov * signs[None, :] / (dispersion * ratio) + cond_mean = regress_opt.dot(self.observed_score_state + opt_offset) - return cond_mean, cond_cov, cond_precision, logdens_linear + return cond_mean, cond_cov, cond_precision, regress_opt def _solve_randomized_problem(self, # optional binary vector diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index 85dc64b5e..33f132d17 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -40,7 +40,7 @@ def __init__(self, linear_part = query.sampler.affine_con.linear_part offset = query.sampler.affine_con.offset - logdens_linear = query.sampler.logdens_transform[0] + regress_opt = query.sampler.logdens_transform[0] _, randomizer_prec = query.randomizer.cov_prec score_offset = query.observed_score_state + query.sampler.logdens_transform[1] @@ -59,7 +59,7 @@ def __init__(self, self.observed_target = observed_target self.cov_target_score = cov_target_score - self.logdens_linear = logdens_linear + self.regress_opt = regress_opt self.randomizer_prec = randomizer_prec self.score_offset = score_offset @@ -133,7 +133,7 @@ def _set_marginal_parameters(self): target_linear = self.cov_target_score.T.dot(self.prec_target) target_offset = self.score_offset - target_linear.dot(self.observed_target) - target_lin = -self.logdens_linear.dot(target_linear) + target_lin = self.regress_opt.dot(target_linear) target_off = self.cond_mean - target_lin.dot(self.observed_target) self.linear_coef = target_lin diff --git a/selectinf/randomized/query.py 
b/selectinf/randomized/query.py index 592065367..38b34f2c6 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -118,20 +118,20 @@ def _setup_sampler(self, (cond_mean, cond_cov, cond_precision, - logdens_linear) = self._setup_implied_gaussian(opt_linear, + regress_opt) = self._setup_implied_gaussian(opt_linear, opt_offset, dispersion) - def log_density(logdens_linear, offset, cond_prec, opt, score): + def log_density(regress_opt, offset, cond_prec, opt, score): if score.ndim == 1: - mean_term = logdens_linear.dot(score.T + offset).T + mean_term = regress_opt.dot(score.T + offset).T else: - mean_term = logdens_linear.dot(score.T + offset[:, None]).T - arg = opt + mean_term + mean_term = regress_opt.dot(score.T + offset[:, None]).T + arg = opt - mean_term return - 0.5 * np.sum(arg * cond_prec.dot(arg.T).T, 1) log_density = functools.partial(log_density, - logdens_linear, + regress_opt, opt_offset, cond_precision) @@ -148,7 +148,7 @@ def log_density(logdens_linear, offset, cond_prec, opt, score): self.observed_opt_state, self.observed_score_state, log_density, - (logdens_linear, opt_offset), + (regress_opt, opt_offset), self.randomizer_prec, selection_info=self.selection_variable, useC=self.useC) @@ -166,15 +166,17 @@ def _setup_implied_gaussian(self, if np.asarray(prec).shape in [(), (0,)]: cond_precision = opt_linear.T.dot(opt_linear) * prec cond_cov = np.linalg.inv(cond_precision) - logdens_linear = cond_cov.dot(opt_linear.T) * prec + regress_opt = -cond_cov.dot(opt_linear.T) * prec else: cond_precision = opt_linear.T.dot(prec.dot(opt_linear)) cond_cov = np.linalg.inv(cond_precision) - logdens_linear = cond_cov.dot(opt_linear.T).dot(prec) + regress_opt = -cond_cov.dot(opt_linear.T).dot(prec) - cond_mean = -logdens_linear.dot(self.observed_score_state + opt_offset) + # regress_opt is regression coefficient of opt onto score + u... - return cond_mean, cond_cov, cond_precision, logdens_linear + cond_mean = regress_opt.dot(self.observed_score_state + opt_offset) + + return cond_mean, cond_cov, cond_precision, regress_opt def summary(self, observed_target, @@ -833,7 +835,7 @@ def __init__(self, observed_score_state, log_cond_density, logdens_transform, # described how score enters log_density. - randomizer_prec, + cov_product, # product score_cov.dot(randomizer_prec), selection_info=None, useC=False): @@ -875,7 +877,7 @@ def __init__(self, self._log_cond_density = log_cond_density self.logdens_transform = logdens_transform self.useC = useC - self.randomizer_prec = randomizer_prec + self.cov_product = cov_product def log_cond_density(self, opt_sample, @@ -977,16 +979,16 @@ def _log_density_ray(self, if (not hasattr(self, "_direction") or not np.all(self._direction == direction)): - logdens_lin, logdens_offset = self.logdens_transform + regress_opt, logdens_offset = self.logdens_transform if opt_sample.shape[1] == 1: prec = 1. 
/ self.covariance[0, 0] - quadratic_term = logdens_lin.dot(direction) ** 2 * prec - arg = (logdens_lin.dot(nuisance + logdens_offset) + - logdens_lin.dot(direction) * gaussian_sample + - opt_sample[:, 0]) - linear_term = logdens_lin.dot(direction) * prec * arg + quadratic_term = regress_opt.dot(direction) ** 2 * prec + arg = (opt_sample[:, 0] - + regress_opt.dot(nuisance + logdens_offset) - + regress_opt.dot(direction) * gaussian_sample) + linear_term = -regress_opt.dot(direction) * prec * arg constant_term = arg ** 2 * prec self._cache = {'linear_term': linear_term, @@ -996,22 +998,22 @@ def _log_density_ray(self, self._direction = direction.copy() # density is a Gaussian evaluated at - # O_i + A(N + (Z_i + theta) * gamma + b) + # O_i - A(N + (Z_i + theta) * gamma + b) # b is logdens_offset - # A is logdens_linear + # A is regress_opt # Z_i is gaussian_sample[i] (real-valued) # gamma is direction # O_i is opt_sample[i] # let arg1 = O_i # let arg2 = A(N+b + Z_i \cdot gamma) - # then it is of the form (arg1 + arg2 + theta * A gamma) + # then it is of the form (arg1 - arg2 - theta * A gamma) - logdens_lin, logdens_offset = self.logdens_transform + regress_opt, logdens_offset = self.logdens_transform cov = self.covariance prec = np.linalg.inv(cov) - linear_part = logdens_lin.dot(direction) # A gamma + linear_part = -regress_opt.dot(direction) # -A gamma if 1 in opt_sample.shape: pass # stop3 what's this for? @@ -1020,10 +1022,10 @@ def _log_density_ray(self, quadratic_term = linear_part.T.dot(prec).dot(linear_part) arg1 = opt_sample.T - arg2 = logdens_lin.dot(np.multiply.outer(direction, gaussian_sample) + - (nuisance + logdens_offset)[:, None]) + arg2 = -regress_opt.dot(np.multiply.outer(direction, gaussian_sample) + + (nuisance + logdens_offset)[:, None]) arg = arg1 + arg2 - linear_term = linear_part.T.dot(prec).dot(arg) + linear_term = -regress_opt.T.dot(prec).dot(arg) constant_term = np.sum(prec.dot(arg) * arg, 0) self._cache = {'linear_term': linear_term, @@ -1312,7 +1314,7 @@ def selective_MLE(observed_target, # only for independent estimator cond_mean, cond_cov, - logdens_linear, + regress_opt, linear_part, offset, randomizer_prec, @@ -1337,7 +1339,7 @@ def selective_MLE(observed_target, Conditional mean of optimization variables given target. cond_cov : ndarray Conditional covariance of optimization variables given target. - logdens_linear : ndarray + regress_opt : ndarray Describes how conditional mean of optimization variables varies with target. 
linear_part : ndarray @@ -1362,13 +1364,13 @@ def selective_MLE(observed_target, # target_lin determines how the conditional mean of optimization variables # vary with target - # logdens_linear determines how the argument of the optimization density + # regress_opt determines how the argument of the optimization density # depends on the score, not how the mean depends on score, hence the minus sign target_linear = target_score_cov.T.dot(prec_target) target_offset = score_offset - target_linear.dot(observed_target) - target_lin = - logdens_linear.dot(target_linear) + target_lin = regress_opt.dot(target_linear) target_off = cond_mean - target_lin.dot(observed_target) if np.asarray(randomizer_prec).shape in [(), (0,)]: From 6a9a9012b07233a17e7149215731a15d0777163d Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Mon, 12 Jul 2021 14:06:54 -0700 Subject: [PATCH 104/187] rename target_linear->score_decomp, target_offset->score_resid --- selectinf/randomized/approx_reference.py | 12 ++++----- .../randomized/approx_reference_grouplasso.py | 26 +++++++++---------- selectinf/randomized/exact_reference.py | 12 ++++----- selectinf/randomized/posterior_inference.py | 14 +++++----- selectinf/randomized/query.py | 21 ++++++++------- 5 files changed, 43 insertions(+), 42 deletions(-) diff --git a/selectinf/randomized/approx_reference.py b/selectinf/randomized/approx_reference.py index ee8d81391..62fea3aad 100644 --- a/selectinf/randomized/approx_reference.py +++ b/selectinf/randomized/approx_reference.py @@ -286,17 +286,17 @@ def _construct_density(self): prec_target = 1. / target_cov_uni target_score_cov_uni = self.target_score_cov[m, :].reshape((1, p)) - target_linear = target_score_cov_uni.T.dot(prec_target) - target_offset = (self.score_offset - target_linear.dot(observed_target_uni)).reshape( - (target_linear.shape[0],)) + score_decomp = target_score_cov_uni.T.dot(prec_target) + score_resid = (self.score_offset - score_decomp.dot(observed_target_uni)).reshape( + (score_decomp.shape[0],)) - target_lin = self.regress_opt.dot(target_linear) + target_lin = self.regress_opt.dot(score_decomp) target_off = (self.cond_mean - target_lin.dot(observed_target_uni)).reshape((target_lin.shape[0],)) - _prec = prec_target + (target_linear.T.dot(target_linear) * self.randomizer_prec) - target_lin.T.dot( + _prec = prec_target + (score_decomp.T.dot(score_decomp) * self.randomizer_prec) - target_lin.T.dot( self.prec_opt).dot(target_lin) - _P = target_linear.T.dot(target_offset) * self.randomizer_prec + _P = score_decomp.T.dot(score_resid) * self.randomizer_prec _r = (1. 
/ _prec).dot(target_lin.T.dot(self.prec_opt).dot(target_off) - _P) _S = np.linalg.inv(_prec).dot(prec_target) diff --git a/selectinf/randomized/approx_reference_grouplasso.py b/selectinf/randomized/approx_reference_grouplasso.py index c478d8f45..b7ff1f96c 100644 --- a/selectinf/randomized/approx_reference_grouplasso.py +++ b/selectinf/randomized/approx_reference_grouplasso.py @@ -311,20 +311,20 @@ def selective_MLE(self, # regress_opt determines how the argument of the optimization density # depends on the score, not how the mean depends on score, hence the minus sign - target_linear = target_score_cov.T.dot(prec_target) - target_offset = score_offset - target_linear.dot(observed_target) + score_decomp = target_score_cov.T.dot(prec_target) + score_resid = score_offset - score_decomp.dot(observed_target) - target_lin = regress_opt.dot(target_linear) + target_lin = regress_opt.dot(score_decomp) target_off = cond_mean - target_lin.dot(observed_target) if np.asarray(self.randomizer_prec).shape in [(), (0,)]: - _P = target_linear.T.dot(target_offset) * self.randomizer_prec - _prec = prec_target + (target_linear.T.dot(target_linear) * self.randomizer_prec) - target_lin.T.dot( + _P = score_decomp.T.dot(score_resid) * self.randomizer_prec + _prec = prec_target + (score_decomp.T.dot(score_decomp) * self.randomizer_prec) - target_lin.T.dot( prec_opt).dot( target_lin) else: - _P = target_linear.T.dot(self.randomizer_prec).dot(target_offset) - _prec = prec_target + (target_linear.T.dot(self.randomizer_prec).dot(target_linear)) - target_lin.T.dot( + _P = score_decomp.T.dot(self.randomizer_prec).dot(score_resid) + _prec = prec_target + (score_decomp.T.dot(self.randomizer_prec).dot(score_decomp)) - target_lin.T.dot( prec_opt).dot(target_lin) C = target_cov.dot(_P - target_lin.T.dot(prec_opt).dot(target_off)) @@ -684,17 +684,17 @@ def _construct_density(self): prec_target = 1. / target_cov_uni target_score_cov_uni = self.target_score_cov[m, :].reshape((1, p)) - target_linear = target_score_cov_uni.T.dot(prec_target) - target_offset = (self.score_offset - target_linear.dot(observed_target_uni)).reshape( - (target_linear.shape[0],)) + score_decomp = target_score_cov_uni.T.dot(prec_target) + score_resid = (self.score_offset - score_decomp.dot(observed_target_uni)).reshape( + (score_decomp.shape[0],)) - target_lin = self.regress_opt.dot(target_linear) + target_lin = self.regress_opt.dot(score_decomp) target_off = (self.cond_mean - target_lin.dot(observed_target_uni)).reshape((target_lin.shape[0],)) - _prec = prec_target + (target_linear.T.dot(target_linear) * self.randomizer_prec) - target_lin.T.dot( + _prec = prec_target + (score_decomp.T.dot(score_decomp) * self.randomizer_prec) - target_lin.T.dot( self.prec_opt).dot(target_lin) - _P = target_linear.T.dot(target_offset) * self.randomizer_prec + _P = score_decomp.T.dot(score_resid) * self.randomizer_prec _r = (1. / _prec).dot(target_lin.T.dot(self.prec_opt).dot(target_off) - _P) _S = np.linalg.inv(_prec).dot(prec_target) diff --git a/selectinf/randomized/exact_reference.py b/selectinf/randomized/exact_reference.py index 9ca4ebe05..28f70aa16 100644 --- a/selectinf/randomized/exact_reference.py +++ b/selectinf/randomized/exact_reference.py @@ -298,17 +298,17 @@ def _construct_density(self): prec_target = 1. 
/ target_cov_uni target_score_cov_uni = self.target_score_cov[m, :].reshape((1, p)) - target_linear = target_score_cov_uni.T.dot(prec_target) - target_offset = (self.score_offset - target_linear.dot(observed_target_uni)).reshape( - (target_linear.shape[0],)) + score_decomp = target_score_cov_uni.T.dot(prec_target) + score_resid = (self.score_offset - score_decomp.dot(observed_target_uni)).reshape( + (score_decomp.shape[0],)) - target_lin = self.regress_opt.dot(target_linear) + target_lin = self.regress_opt.dot(score_decomp) target_off = (self.cond_mean - target_lin.dot(observed_target_uni)).reshape((target_lin.shape[0],)) - _prec = prec_target + (target_linear.T.dot(target_linear) * self.randomizer_prec) - target_lin.T.dot( + _prec = prec_target + (score_decomp.T.dot(score_decomp) * self.randomizer_prec) - target_lin.T.dot( self.prec_opt).dot(target_lin) - _P = target_linear.T.dot(target_offset) * self.randomizer_prec + _P = score_decomp.T.dot(score_resid) * self.randomizer_prec _r = (1. / _prec).dot(target_lin.T.dot(self.prec_opt).dot(target_off) - _P) _S = np.linalg.inv(_prec).dot(prec_target) diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index 33f132d17..a63718aea 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -130,23 +130,23 @@ def _set_marginal_parameters(self): implied mean as a function of the true parameters. """ - target_linear = self.cov_target_score.T.dot(self.prec_target) - target_offset = self.score_offset - target_linear.dot(self.observed_target) + score_decomp = self.cov_target_score.T.dot(self.prec_target) + score_resid = self.score_offset - score_decomp.dot(self.observed_target) - target_lin = self.regress_opt.dot(target_linear) + target_lin = self.regress_opt.dot(score_decomp) target_off = self.cond_mean - target_lin.dot(self.observed_target) self.linear_coef = target_lin self.offset_coef = target_off if np.asarray(self.randomizer_prec).shape in [(), (0,)]: - _prec = self.prec_target + (target_linear.T.dot(target_linear) * self.randomizer_prec) \ + _prec = self.prec_target + (score_decomp.T.dot(score_decomp) * self.randomizer_prec) \ - target_lin.T.dot(self.cond_precision).dot(target_lin) - _P = target_linear.T.dot(target_offset) * self.randomizer_prec + _P = score_decomp.T.dot(score_resid) * self.randomizer_prec else: - _prec = self.prec_target + (target_linear.T.dot(self.randomizer_prec).dot(target_linear)) \ + _prec = self.prec_target + (score_decomp.T.dot(self.randomizer_prec).dot(score_decomp)) \ - target_lin.T.dot(self.cond_precision).dot(target_lin) - _P = target_linear.T.dot(self.randomizer_prec).dot(target_offset) + _P = score_decomp.T.dot(self.randomizer_prec).dot(score_resid) _Q = np.linalg.inv(_prec + target_lin.T.dot(self.cond_precision).dot(target_lin)) self.prec_marginal = self.cond_precision - self.cond_precision.dot(target_lin).dot(_Q).dot(target_lin.T).dot(self.cond_precision) diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index 38b34f2c6..12fcbc8aa 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -119,8 +119,8 @@ def _setup_sampler(self, cond_cov, cond_precision, regress_opt) = self._setup_implied_gaussian(opt_linear, - opt_offset, - dispersion) + opt_offset, + dispersion) def log_density(regress_opt, offset, cond_prec, opt, score): if score.ndim == 1: @@ -950,7 +950,8 @@ def selective_MLE(self, solve_args : dict, optional Arguments passed to solver. 
""" - score_offset = self.observed_score_state + self.logdens_transform[1] + + score_offset = self.observed_score_state + self.logdens_transform[1] # logdens_transform[1] is observed_subgrad return selective_MLE(observed_target, target_cov, @@ -1367,19 +1368,19 @@ def selective_MLE(observed_target, # regress_opt determines how the argument of the optimization density # depends on the score, not how the mean depends on score, hence the minus sign - target_linear = target_score_cov.T.dot(prec_target) - target_offset = score_offset - target_linear.dot(observed_target) + score_decomp = target_score_cov.T.dot(prec_target) + score_resid = score_offset - score_decomp.dot(observed_target) - target_lin = regress_opt.dot(target_linear) + target_lin = regress_opt.dot(score_decomp) target_off = cond_mean - target_lin.dot(observed_target) if np.asarray(randomizer_prec).shape in [(), (0,)]: - _P = target_linear.T.dot(target_offset) * randomizer_prec - _prec = prec_target + (target_linear.T.dot(target_linear) * randomizer_prec) - target_lin.T.dot(prec_opt).dot( + _P = score_decomp.T.dot(score_resid) * randomizer_prec + _prec = prec_target + (score_decomp.T.dot(score_decomp) * randomizer_prec) - target_lin.T.dot(prec_opt).dot( target_lin) else: - _P = target_linear.T.dot(randomizer_prec).dot(target_offset) - _prec = prec_target + (target_linear.T.dot(randomizer_prec).dot(target_linear)) - target_lin.T.dot( + _P = score_decomp.T.dot(randomizer_prec).dot(score_resid) + _prec = prec_target + (score_decomp.T.dot(randomizer_prec).dot(score_decomp)) - target_lin.T.dot( prec_opt).dot(target_lin) C = target_cov.dot(_P - target_lin.T.dot(prec_opt).dot(target_off)) From 7fc46cf0e2cb5623424915332f682070c838cdeb Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Mon, 12 Jul 2021 15:44:04 -0700 Subject: [PATCH 105/187] some more renaming --- selectinf/randomized/approx_reference.py | 74 +++--- .../randomized/approx_reference_grouplasso.py | 149 ++++++----- selectinf/randomized/drop_losers.py | 16 +- selectinf/randomized/exact_reference.py | 76 +++--- selectinf/randomized/group_lasso.py | 52 ++-- selectinf/randomized/lasso.py | 33 ++- selectinf/randomized/modelQ.py | 30 +-- selectinf/randomized/posterior_inference.py | 37 +-- selectinf/randomized/query.py | 236 +++++++++--------- selectinf/randomized/screening.py | 22 +- selectinf/randomized/slope.py | 27 +- 11 files changed, 374 insertions(+), 378 deletions(-) diff --git a/selectinf/randomized/approx_reference.py b/selectinf/randomized/approx_reference.py index 62fea3aad..06eb5cd54 100644 --- a/selectinf/randomized/approx_reference.py +++ b/selectinf/randomized/approx_reference.py @@ -12,8 +12,8 @@ class approximate_grid_inference(object): def __init__(self, query, observed_target, - target_cov, - target_score_cov, + cov_target, + cov_target_score, solve_args={'tol': 1.e-12}, useIP=False): @@ -27,9 +27,9 @@ def __init__(self, to describe implied Gaussian. observed_target : ndarray Observed estimate of target. - target_cov : ndarray + cov_target : ndarray Estimated covaraince of target. - target_score_cov : ndarray + cov_target_score : ndarray Estimated covariance of target and score of randomized query. solve_args : dict, optional Arguments passed to solver. 
@@ -38,8 +38,8 @@ def __init__(self, self.solve_args = solve_args result, inverse_info = query.selective_MLE(observed_target, - target_cov, - target_score_cov, + cov_target, + cov_target_score, solve_args=solve_args)[:2] self.linear_part = query.sampler.affine_con.linear_part @@ -51,15 +51,15 @@ def __init__(self, self.cond_cov = query.cond_cov self.observed_target = observed_target - self.target_score_cov = target_score_cov - self.target_cov = target_cov + self.cov_target_score = cov_target_score + self.cov_target = cov_target - self.init_soln = query.observed_opt_state + self.observed_soln = query.observed_opt_state self.randomizer_prec = query.sampler.randomizer_prec self.score_offset = query.observed_score_state + query.sampler.logdens_transform[1] - self.ntarget = ntarget = target_cov.shape[0] + self.ntarget = ntarget = cov_target.shape[0] _scale = 4 * np.sqrt(np.diag(inverse_info)) if useIP == False: @@ -121,8 +121,8 @@ def summary(self, def _approx_log_reference(self, observed_target, - target_cov, - target_score_cov, + cov_target, + cov_target_score, grid): """ @@ -131,25 +131,25 @@ def _approx_log_reference(self, if np.asarray(observed_target).shape in [(), (0,)]: raise ValueError('no target specified') - prec_target = np.linalg.inv(target_cov) - target_lin = self.regress_opt.dot(target_score_cov.T.dot(prec_target)) + prec_target = np.linalg.inv(cov_target) + regress_opt_target = self.regress_opt.dot(cov_target_score.T.dot(prec_target)) ref_hat = [] solver = solve_barrier_affine_py for k in range(grid.shape[0]): # in the usual D = N + Gamma theta.hat, - # target_lin is "something" times Gamma, + # regress_opt_target is "something" times Gamma, # where "something" comes from implied Gaussian # cond_mean is "something" times D - # Gamma is target_score_cov.T.dot(prec_target) + # Gamma is cov_target_score.T.dot(prec_target) - cond_mean_grid = (target_lin.dot(np.atleast_1d(grid[k] - observed_target)) + + cond_mean_grid = (regress_opt_target.dot(np.atleast_1d(grid[k] - observed_target)) + self.cond_mean) conjugate_arg = self.prec_opt.dot(cond_mean_grid) val, _, _ = solver(conjugate_arg, self.prec_opt, - self.init_soln, + self.observed_soln, self.linear_part, self.offset, **self.solve_args) @@ -165,17 +165,17 @@ def _construct_families(self): self._families = [] for m in range(self.ntarget): - p = self.target_score_cov.shape[1] + p = self.cov_target_score.shape[1] observed_target_uni = (self.observed_target[m]).reshape((1,)) - target_cov_uni = (np.diag(self.target_cov)[m]).reshape((1, 1)) - target_score_cov_uni = self.target_score_cov[m, :].reshape((1, p)) + cov_target_uni = (np.diag(self.cov_target)[m]).reshape((1, 1)) + cov_target_score_uni = self.cov_target_score[m, :].reshape((1, p)) var_target = 1. / ((self.precs[m])[0, 0]) approx_log_ref = self._approx_log_reference(observed_target_uni, - target_cov_uni, - target_score_cov_uni, + cov_target_uni, + cov_target_score_uni, self.stat_grid[m]) @@ -278,31 +278,31 @@ def _construct_density(self): S = {} r = {} - p = self.target_score_cov.shape[1] + p = self.cov_target_score.shape[1] for m in range(self.ntarget): observed_target_uni = (self.observed_target[m]).reshape((1,)) - target_cov_uni = (np.diag(self.target_cov)[m]).reshape((1, 1)) - prec_target = 1. / target_cov_uni - target_score_cov_uni = self.target_score_cov[m, :].reshape((1, p)) + cov_target_uni = (np.diag(self.cov_target)[m]).reshape((1, 1)) + prec_target = 1. 
/ cov_target_uni + cov_target_score_uni = self.cov_target_score[m, :].reshape((1, p)) - score_decomp = target_score_cov_uni.T.dot(prec_target) - score_resid = (self.score_offset - score_decomp.dot(observed_target_uni)).reshape( - (score_decomp.shape[0],)) + regress_score_target = cov_target_score_uni.T.dot(prec_target) + resid_score_target = (self.score_offset - regress_score_target.dot(observed_target_uni)).reshape( + (regress_score_target.shape[0],)) - target_lin = self.regress_opt.dot(score_decomp) - target_off = (self.cond_mean - target_lin.dot(observed_target_uni)).reshape((target_lin.shape[0],)) + regress_opt_target = self.regress_opt.dot(regress_score_target) + resid_mean_opt_target = (self.cond_mean - regress_opt_target.dot(observed_target_uni)).reshape((regress_opt_target.shape[0],)) - _prec = prec_target + (score_decomp.T.dot(score_decomp) * self.randomizer_prec) - target_lin.T.dot( - self.prec_opt).dot(target_lin) + prec_target_nosel = prec_target + (regress_score_target.T.dot(regress_score_target) * self.randomizer_prec) - regress_opt_target.T.dot( + self.prec_opt).dot(regress_opt_target) - _P = score_decomp.T.dot(score_resid) * self.randomizer_prec - _r = (1. / _prec).dot(target_lin.T.dot(self.prec_opt).dot(target_off) - _P) + _P = regress_score_target.T.dot(resid_score_target) * self.randomizer_prec + _r = (1. / _prec).dot(regress_opt_target.T.dot(self.prec_opt).dot(resid_mean_opt_target) - _P) _S = np.linalg.inv(_prec).dot(prec_target) S[m] = _S r[m] = _r - precs[m] = _prec + precs[m] = prec_target_nosel self.precs = precs self.S = S diff --git a/selectinf/randomized/approx_reference_grouplasso.py b/selectinf/randomized/approx_reference_grouplasso.py index b7ff1f96c..3909a2a56 100644 --- a/selectinf/randomized/approx_reference_grouplasso.py +++ b/selectinf/randomized/approx_reference_grouplasso.py @@ -60,8 +60,8 @@ def fit(self, perturb=None): # solve the randomized version of group lasso - (self.initial_soln, - self.initial_subgrad) = self._solve_randomized_problem(perturb=perturb, + (self.observed_soln, + self.observed_subgrad) = self._solve_randomized_problem(perturb=perturb, solve_args=solve_args) # initialize variables @@ -81,7 +81,7 @@ def fit(self, for g in sorted(np.unique(self.groups)): # g is group label group_mask = self.groups == g - soln = self.initial_soln # do not need to keep setting this + soln = self.observed_soln # do not need to keep setting this if norm(soln[group_mask]) > tol * norm(soln): # is group g appreciably nonzero ordered_groups.append(g) @@ -127,12 +127,10 @@ def fit(self, for i, var in enumerate(ordered_vars): opt_linearNoU[var, i] += self.ridge_term - opt_offset = self.initial_subgrad - self.observed_score_state = -opt_linearNoU.dot(_beta_unpenalized) self.observed_score_state[~overall] += self.loglike.smooth_objective(beta_bar, 'grad')[~overall] - active_signs = np.sign(self.initial_soln) + active_signs = np.sign(self.observed_soln) active = np.flatnonzero(active_signs) self.active = active @@ -171,7 +169,6 @@ def compute_Lg(g): self.opt_linear = opt_linearNoU.dot(U) self.active_dirs = active_dirs - self.opt_offset = opt_offset self.ordered_vars = ordered_vars self.linear_part = -np.eye(self.observed_opt_state.shape[0]) @@ -198,12 +195,12 @@ def _solve_randomized_problem(self, # if all groups are size 1, set up lasso penalty and run usual lasso solver... (see existing code)... 
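# --- editor's sketch (assumed helper, not from the patch): the group-lasso
# penalty solved above acts by block soft-thresholding, so whole groups are kept
# or zeroed, which is exactly the active/inactive split that `fit` records.
import numpy as np

def group_prox(z, groups, weights, step):
    """argmin_b 0.5 * ||b - z||**2 + step * sum_g weights[g] * ||b_g||_2"""
    out = np.zeros_like(z)
    for g in np.unique(groups):
        idx = groups == g
        norm_g = np.linalg.norm(z[idx])
        if norm_g > step * weights[g]:
            out[idx] = (1 - step * weights[g] / norm_g) * z[idx]
    return out

z = np.array([3.0, -1.0, 0.2, 0.1])
groups = np.array([0, 0, 1, 1])
print(group_prox(z, groups, {0: 1.0, 1: 1.0}, step=1.0))
# the weak second group collapses to zero; the strong first group is only shrunk
# --- end sketch ---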
- initial_soln = problem.solve(quad, **solve_args) - initial_subgrad = -(self.loglike.smooth_objective(initial_soln, + observed_soln = problem.solve(quad, **solve_args) + observed_subgrad = -(self.loglike.smooth_objective(observed_soln, 'grad') + - quad.objective(initial_soln, 'grad')) + quad.objective(observed_soln, 'grad')) - return initial_soln, initial_subgrad + return observed_soln, observed_subgrad @staticmethod def gaussian(X, @@ -250,7 +247,7 @@ def _setup_implied_gaussian(self): cond_cov = inv(cond_precision) regress_opt = -cond_cov.dot(self.opt_linear.T).dot(prec) - cond_mean = regress_opt.dot(self.observed_score_state + self.opt_offset) + cond_mean = regress_opt.dot(self.observed_score_state + self.observed_subgrad) self.cond_mean = cond_mean self.cond_cov = cond_cov self.cond_precision = cond_precision @@ -272,9 +269,9 @@ def selective_MLE(self, Parameters ---------- observed_target: from selected_targets - target_cov: from selected_targets - target_cov_score: from selected_targets - init_soln: (opt_state) initial (observed) value of optimization variables + cov_target: from selected_targets + cov_target_score: from selected_targets + observed_soln: (opt_state) initial (observed) value of optimization variables cond_mean: conditional mean of optimization variables (model on _setup_implied_gaussian) cond_cov: conditional variance of optimization variables (model on _setup_implied_gaussian) regress_opt: (model on _setup_implied_gaussian) @@ -287,9 +284,9 @@ def selective_MLE(self, """ self._setup_implied_gaussian() # Calculate useful quantities - (observed_target, target_cov, target_score_cov, alternatives) = self.selected_targets(dispersion) + (observed_target, cov_target, cov_target_score, alternatives) = self.selected_targets(dispersion) - init_soln = self.observed_opt_state # just the gammas + observed_soln = self.observed_opt_state # just the gammas cond_mean = self.cond_mean cond_cov = self.cond_cov regress_opt = self.regress_opt @@ -300,40 +297,40 @@ def selective_MLE(self, raise ValueError('no target specified') observed_target = np.atleast_1d(observed_target) - prec_target = inv(target_cov) + prec_target = inv(cov_target) prec_opt = self.cond_precision - score_offset = self.observed_score_state + self.opt_offset + score_offset = self.observed_score_state + self.observed_subgrad - # target_lin determines how the conditional mean of optimization variables + # regress_opt_target determines how the conditional mean of optimization variables # vary with target # regress_opt determines how the argument of the optimization density # depends on the score, not how the mean depends on score, hence the minus sign - score_decomp = target_score_cov.T.dot(prec_target) - score_resid = score_offset - score_decomp.dot(observed_target) + regress_score_target = cov_target_score.T.dot(prec_target) + resid_score_target = score_offset - regress_score_target.dot(observed_target) - target_lin = regress_opt.dot(score_decomp) - target_off = cond_mean - target_lin.dot(observed_target) + regress_opt_target = regress_opt.dot(regress_score_target) + resid_mean_opt_target = cond_mean - regress_opt_target.dot(observed_target) if np.asarray(self.randomizer_prec).shape in [(), (0,)]: - _P = score_decomp.T.dot(score_resid) * self.randomizer_prec - _prec = prec_target + (score_decomp.T.dot(score_decomp) * self.randomizer_prec) - target_lin.T.dot( + _P = regress_score_target.T.dot(resid_score_target) * self.randomizer_prec + _prec = prec_target + (regress_score_target.T.dot(regress_score_target) * 
self.randomizer_prec) - regress_opt_target.T.dot( prec_opt).dot( - target_lin) + regress_opt_target) else: - _P = score_decomp.T.dot(self.randomizer_prec).dot(score_resid) - _prec = prec_target + (score_decomp.T.dot(self.randomizer_prec).dot(score_decomp)) - target_lin.T.dot( - prec_opt).dot(target_lin) + _P = regress_score_target.T.dot(self.randomizer_prec).dot(resid_score_target) + _prec = prec_target + (regress_score_target.T.dot(self.randomizer_prec).dot(regress_score_target)) - regress_opt_target.T.dot( + prec_opt).dot(regress_opt_target) - C = target_cov.dot(_P - target_lin.T.dot(prec_opt).dot(target_off)) + C = cov_target.dot(_P - regress_opt_target.T.dot(prec_opt).dot(resid_mean_opt_target)) conjugate_arg = prec_opt.dot(cond_mean) val, soln, hess = solve_barrier_affine_jacobian_py(conjugate_arg, prec_opt, - init_soln, + observed_soln, linear_part, offset, self.C, @@ -341,16 +338,16 @@ def selective_MLE(self, useJacobian, **solve_args) - final_estimator = target_cov.dot(_prec).dot(observed_target) \ - + target_cov.dot(target_lin.T.dot(prec_opt.dot(cond_mean - soln))) + C + final_estimator = cov_target.dot(_prec).dot(observed_target) \ + + cov_target.dot(regress_opt_target.T.dot(prec_opt.dot(cond_mean - soln))) + C - unbiased_estimator = target_cov.dot(_prec).dot(observed_target) + target_cov.dot( - _P - target_lin.T.dot(prec_opt).dot(target_off)) + unbiased_estimator = cov_target.dot(_prec).dot(observed_target) + cov_target.dot( + _P - regress_opt_target.T.dot(prec_opt).dot(resid_mean_opt_target)) - L = target_lin.T.dot(prec_opt) - observed_info_natural = _prec + L.dot(target_lin) - L.dot(hess.dot(L.T)) + L = regress_opt_target.T.dot(prec_opt) + observed_info_natural = _prec + L.dot(regress_opt_target) - L.dot(hess.dot(L.T)) - observed_info_mean = target_cov.dot(observed_info_natural.dot(target_cov)) + observed_info_mean = cov_target.dot(observed_info_natural.dot(cov_target)) Z_scores = final_estimator / np.sqrt(np.diag(observed_info_mean)) @@ -422,9 +419,9 @@ def __init__(self, to describe implied Gaussian. observed_target : ndarray Observed estimate of target. - target_cov : ndarray + cov_target : ndarray Estimated covaraince of target. - target_score_cov : ndarray + cov_target_score : ndarray Estimated covariance of target and score of randomized query. solve_args : dict, optional Arguments passed to solver. 
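# --- editor's sketch (illustration only, not from the patch): given the point
# estimate and observed_info_mean produced by selective_MLE above, the reported
# Z-scores, p-values and intervals are ordinary Wald quantities; the numbers
# below are made-up stand-ins for selected-model output.
import numpy as np
from scipy.stats import norm as ndist

final_estimator = np.array([1.8, -0.4])
observed_info_mean = np.diag([0.25, 0.16])     # assumed covariance of the estimator
se = np.sqrt(np.diag(observed_info_mean))

Z_scores = final_estimator / se
pvalues = 2 * ndist.sf(np.abs(Z_scores))       # two-sided
quantile = ndist.ppf(0.95)                     # 90% two-sided intervals
intervals = np.vstack([final_estimator - quantile * se,
                       final_estimator + quantile * se]).T

print(np.round(Z_scores, 2), np.round(pvalues, 3))
print(np.round(intervals, 2))
# --- end sketch ---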
@@ -444,17 +441,17 @@ def __init__(self, self.C = query.C self.active_dirs = query.active_dirs - (observed_target, target_cov, target_score_cov, alternatives) = query.selected_targets(dispersion) + (observed_target, cov_target, cov_target_score, alternatives) = query.selected_targets(dispersion) self.observed_target = observed_target - self.target_score_cov = target_score_cov - self.target_cov = target_cov + self.cov_target_score = cov_target_score + self.cov_target = cov_target - self.init_soln = query.observed_opt_state + self.observed_soln = query.observed_opt_state self.randomizer_prec = query.randomizer_prec - self.score_offset = query.observed_score_state + query.opt_offset + self.score_offset = query.observed_score_state + query.observed_subgrad - self.ntarget = ntarget = target_cov.shape[0] + self.ntarget = ntarget = cov_target.shape[0] _scale = 4 * np.sqrt(np.diag(inverse_info)) if useIP == False: @@ -516,8 +513,8 @@ def summary(self, def log_reference(self, observed_target, - target_cov, - target_score_cov, + cov_target, + cov_target_score, grid): """ @@ -527,27 +524,27 @@ def log_reference(self, if np.asarray(observed_target).shape in [(), (0,)]: raise ValueError('no target specified') - prec_target = np.linalg.inv(target_cov) - target_lin = self.regress_opt.dot(target_score_cov.T.dot(prec_target)) + prec_target = np.linalg.inv(cov_target) + regress_opt_target = self.regress_opt.dot(cov_target_score.T.dot(prec_target)) ref_hat = [] for k in range(grid.shape[0]): # in the usual D = N + Gamma theta.hat, - # target_lin is "something" times Gamma, + # regress_opt_target is "something" times Gamma, # where "something" comes from implied Gaussian # cond_mean is "something" times D - # Gamma is target_score_cov.T.dot(prec_target) + # Gamma is cov_target_score.T.dot(prec_target) num_opt = self.prec_opt.shape[0] num_con = self.linear_part.shape[0] - cond_mean_grid = (target_lin.dot(np.atleast_1d(grid[k] - observed_target)) + + cond_mean_grid = (regress_opt_target.dot(np.atleast_1d(grid[k] - observed_target)) + self.cond_mean) #direction for decomposing o - eta = self.prec_opt.dot(self.regress_opt.dot(target_score_cov.T)) + eta = self.prec_opt.dot(self.regress_opt.dot(cov_target_score.T)) implied_mean = np.asscalar(eta.T.dot(cond_mean_grid)) implied_cov = np.asscalar(eta.T.dot(self.cond_cov).dot(eta)) @@ -557,18 +554,18 @@ def log_reference(self, R = np.identity(num_opt) - _A.dot(eta.T) A = self.linear_part.dot(_A).reshape((-1,)) - b = self.offset-self.linear_part.dot(R).dot(self.init_soln) + b = self.offset-self.linear_part.dot(R).dot(self.observed_soln) conjugate_arg = implied_mean * implied_prec val, soln, _ = solver(np.asarray([conjugate_arg]), np.reshape(implied_prec, (1,1)), - eta.T.dot(self.init_soln), + eta.T.dot(self.observed_soln), A.reshape((A.shape[0],1)), b, **self.solve_args) - gamma_ = _A.dot(soln) + R.dot(self.init_soln) + gamma_ = _A.dot(soln) + R.dot(self.observed_soln) log_jacob = jacobian_grad_hess(gamma_, self.C, self.active_dirs) ref_hat.append(-val - ((conjugate_arg ** 2) * implied_cov)/ 2. 
+ log_jacob[0]) @@ -582,17 +579,17 @@ def _construct_families(self): self._families = [] for m in range(self.ntarget): - p = self.target_score_cov.shape[1] + p = self.cov_target_score.shape[1] observed_target_uni = (self.observed_target[m]).reshape((1,)) - target_cov_uni = (np.diag(self.target_cov)[m]).reshape((1, 1)) - target_score_cov_uni = self.target_score_cov[m, :].reshape((1, p)) + cov_target_uni = (np.diag(self.cov_target)[m]).reshape((1, 1)) + cov_target_score_uni = self.cov_target_score[m, :].reshape((1, p)) var_target = 1. / ((self.precs[m])[0, 0]) log_ref = self.log_reference(observed_target_uni, - target_cov_uni, - target_score_cov_uni, + cov_target_uni, + cov_target_score_uni, self.stat_grid[m]) if self.useIP == False: logW = (log_ref - 0.5 * (self.stat_grid[m] - self.observed_target[m]) ** 2 / var_target) @@ -676,26 +673,26 @@ def _construct_density(self): S = {} r = {} - p = self.target_score_cov.shape[1] + p = self.cov_target_score.shape[1] for m in range(self.ntarget): observed_target_uni = (self.observed_target[m]).reshape((1,)) - target_cov_uni = (np.diag(self.target_cov)[m]).reshape((1, 1)) - prec_target = 1. / target_cov_uni - target_score_cov_uni = self.target_score_cov[m, :].reshape((1, p)) + cov_target_uni = (np.diag(self.cov_target)[m]).reshape((1, 1)) + prec_target = 1. / cov_target_uni + cov_target_score_uni = self.cov_target_score[m, :].reshape((1, p)) - score_decomp = target_score_cov_uni.T.dot(prec_target) - score_resid = (self.score_offset - score_decomp.dot(observed_target_uni)).reshape( - (score_decomp.shape[0],)) + regress_score_target = cov_target_score_uni.T.dot(prec_target) + resid_score_target = (self.score_offset - regress_score_target.dot(observed_target_uni)).reshape( + (regress_score_target.shape[0],)) - target_lin = self.regress_opt.dot(score_decomp) - target_off = (self.cond_mean - target_lin.dot(observed_target_uni)).reshape((target_lin.shape[0],)) + regress_opt_target = self.regress_opt.dot(regress_score_target) + resid_mean_opt_target = (self.cond_mean - regress_opt_target.dot(observed_target_uni)).reshape((regress_opt_target.shape[0],)) - _prec = prec_target + (score_decomp.T.dot(score_decomp) * self.randomizer_prec) - target_lin.T.dot( - self.prec_opt).dot(target_lin) + _prec = prec_target + (regress_score_target.T.dot(regress_score_target) * self.randomizer_prec) - regress_opt_target.T.dot( + self.prec_opt).dot(regress_opt_target) - _P = score_decomp.T.dot(score_resid) * self.randomizer_prec - _r = (1. / _prec).dot(target_lin.T.dot(self.prec_opt).dot(target_off) - _P) + _P = regress_score_target.T.dot(resid_score_target) * self.randomizer_prec + _r = (1. 
/ _prec).dot(regress_opt_target.T.dot(self.prec_opt).dot(resid_mean_opt_target) - _P) _S = np.linalg.inv(_prec).dot(prec_target) S[m] = _S diff --git a/selectinf/randomized/drop_losers.py b/selectinf/randomized/drop_losers.py index 7c2a7bce6..ac3134144 100644 --- a/selectinf/randomized/drop_losers.py +++ b/selectinf/randomized/drop_losers.py @@ -84,13 +84,13 @@ def MLE_inference(self, observed_target = self.means[self._winners] std_win = self.std.loc[self._winners] - target_cov = np.diag(std_win**2 / (self._n1_win + self._n2_win)) - target_score_cov = -target_cov + cov_target = np.diag(std_win**2 / (self._n1_win + self._n2_win)) + cov_target_score = -cov_target result = gaussian_query.selective_MLE(self, observed_target, - target_cov, - target_score_cov, + cov_target, + cov_target_score, level=level, solve_args=solve_args) result[0].insert(0, 'arm', self._winners) @@ -120,13 +120,13 @@ def summary(self, """ observed_target = self.means[self._winners] std_win = self.std.loc[self._winners] - target_cov = np.diag(std_win**2 / (self._n1_win + self._n2_win)) - target_score_cov = -target_cov + cov_target = np.diag(std_win**2 / (self._n1_win + self._n2_win)) + cov_target_score = -cov_target result = gaussian_query.summary(self, observed_target, - target_cov, - target_score_cov, + cov_target, + cov_target_score, alternatives=['twosided']*self.K, ndraw=ndraw, level=level, diff --git a/selectinf/randomized/exact_reference.py b/selectinf/randomized/exact_reference.py index 28f70aa16..fe7cc0885 100644 --- a/selectinf/randomized/exact_reference.py +++ b/selectinf/randomized/exact_reference.py @@ -11,8 +11,8 @@ class exact_grid_inference(object): def __init__(self, query, observed_target, - target_cov, - target_score_cov, + cov_target, + cov_target_score, solve_args={'tol': 1.e-12}, useIP=False): @@ -26,17 +26,17 @@ def __init__(self, to describe implied Gaussian. observed_target : ndarray Observed estimate of target. - target_cov : ndarray + cov_target : ndarray Estimated covaraince of target. - target_score_cov : ndarray + cov_target_score : ndarray Estimated covariance of target and score of randomized query. solve_args : dict, optional Arguments passed to solver. 
""" result, inverse_info = query.selective_MLE(observed_target, - target_cov, - target_score_cov, + cov_target, + cov_target_score, solve_args=solve_args)[:2] self.linear_part = query.sampler.affine_con.linear_part @@ -48,15 +48,15 @@ def __init__(self, self.cond_cov = query.cond_cov self.observed_target = observed_target - self.target_score_cov = target_score_cov - self.target_cov = target_cov + self.cov_target_score = cov_target_score + self.cov_target = cov_target - self.init_soln = query.observed_opt_state + self.observed_soln = query.observed_opt_state self.randomizer_prec = query.sampler.randomizer_prec self.score_offset = query.observed_score_state + query.sampler.logdens_transform[1] - self.ntarget = ntarget = target_cov.shape[0] + self.ntarget = ntarget = cov_target.shape[0] _scale = 4 * np.sqrt(np.diag(inverse_info)) if useIP == False: @@ -118,34 +118,34 @@ def summary(self, def log_reference(self, observed_target, - target_cov, - target_score_cov, + cov_target, + cov_target_score, grid): if np.asarray(observed_target).shape in [(), (0,)]: raise ValueError('no target specified') - prec_target = np.linalg.inv(target_cov) - target_lin = self.regress_opt.dot(target_score_cov.T.dot(prec_target)) + prec_target = np.linalg.inv(cov_target) + regress_opt_target = self.regress_opt.dot(cov_target_score.T.dot(prec_target)) ref_hat = [] for k in range(grid.shape[0]): # in the usual D = N + Gamma theta.hat, - # target_lin is "something" times Gamma, + # regress_opt_target is "something" times Gamma, # where "something" comes from implied Gaussian # cond_mean is "something" times D - # Gamma is target_score_cov.T.dot(prec_target) + # Gamma is cov_target_score.T.dot(prec_target) num_opt = self.prec_opt.shape[0] num_con = self.linear_part.shape[0] - cond_mean_grid = (target_lin.dot(np.atleast_1d(grid[k] - observed_target)) + + cond_mean_grid = (regress_opt_target.dot(np.atleast_1d(grid[k] - observed_target)) + self.cond_mean) #direction for decomposing o - eta = self.prec_opt.dot(self.regress_opt.dot(target_score_cov.T)) + eta = self.prec_opt.dot(self.regress_opt.dot(cov_target_score.T)) implied_mean = np.asscalar(eta.T.dot(cond_mean_grid)) implied_cov = np.asscalar(eta.T.dot(self.cond_cov).dot(eta)) @@ -155,7 +155,7 @@ def log_reference(self, R = np.identity(num_opt) - _A.dot(eta.T) A = self.linear_part.dot(_A).reshape((-1,)) - b = -self.linear_part.dot(R).dot(self.init_soln) + b = -self.linear_part.dot(R).dot(self.observed_soln) trunc_ = np.true_divide((self.offset + b), A) @@ -197,17 +197,17 @@ def _construct_families(self): self._families = [] for m in range(self.ntarget): - p = self.target_score_cov.shape[1] + p = self.cov_target_score.shape[1] observed_target_uni = (self.observed_target[m]).reshape((1,)) - target_cov_uni = (np.diag(self.target_cov)[m]).reshape((1, 1)) - target_score_cov_uni = self.target_score_cov[m, :].reshape((1, p)) + cov_target_uni = (np.diag(self.cov_target)[m]).reshape((1, 1)) + cov_target_score_uni = self.cov_target_score[m, :].reshape((1, p)) var_target = 1. 
/ ((self.precs[m])[0, 0]) log_ref = self.log_reference(observed_target_uni, - target_cov_uni, - target_score_cov_uni, + cov_target_uni, + cov_target_score_uni, self.stat_grid[m]) if self.useIP == False: logW = (log_ref - 0.5 * (self.stat_grid[m] - self.observed_target[m]) ** 2 / var_target) @@ -290,31 +290,31 @@ def _construct_density(self): S = {} r = {} - p = self.target_score_cov.shape[1] + p = self.cov_target_score.shape[1] for m in range(self.ntarget): observed_target_uni = (self.observed_target[m]).reshape((1,)) - target_cov_uni = (np.diag(self.target_cov)[m]).reshape((1, 1)) - prec_target = 1. / target_cov_uni - target_score_cov_uni = self.target_score_cov[m, :].reshape((1, p)) + cov_target_uni = (np.diag(self.cov_target)[m]).reshape((1, 1)) + prec_target = 1. / cov_target_uni + cov_target_score_uni = self.cov_target_score[m, :].reshape((1, p)) - score_decomp = target_score_cov_uni.T.dot(prec_target) - score_resid = (self.score_offset - score_decomp.dot(observed_target_uni)).reshape( - (score_decomp.shape[0],)) + regress_score_target = cov_target_score_uni.T.dot(prec_target) + resid_score_target = (self.score_offset - regress_score_target.dot(observed_target_uni)).reshape( + (regress_score_target.shape[0],)) - target_lin = self.regress_opt.dot(score_decomp) - target_off = (self.cond_mean - target_lin.dot(observed_target_uni)).reshape((target_lin.shape[0],)) + regress_opt_target = self.regress_opt.dot(regress_score_target) + resid_mean_opt_target = (self.cond_mean - regress_opt_target.dot(observed_target_uni)).reshape((regress_opt_target.shape[0],)) - _prec = prec_target + (score_decomp.T.dot(score_decomp) * self.randomizer_prec) - target_lin.T.dot( - self.prec_opt).dot(target_lin) + prec_target_nosel = prec_target + (regress_score_target.T.dot(regress_score_target) * self.randomizer_prec) - regress_opt_target.T.dot( + self.prec_opt).dot(regress_opt_target) - _P = score_decomp.T.dot(score_resid) * self.randomizer_prec - _r = (1. / _prec).dot(target_lin.T.dot(self.prec_opt).dot(target_off) - _P) + _P = regress_score_target.T.dot(resid_score_target) * self.randomizer_prec + _r = (1. 
/ _prec).dot(regress_opt_target.T.dot(self.prec_opt).dot(resid_mean_opt_target) - _P) _S = np.linalg.inv(_prec).dot(prec_target) S[m] = _S r[m] = _r - precs[m] = _prec + precs[m] = prec_target_nosel self.precs = precs self.S = S diff --git a/selectinf/randomized/group_lasso.py b/selectinf/randomized/group_lasso.py index 09c239df0..4f1860599 100644 --- a/selectinf/randomized/group_lasso.py +++ b/selectinf/randomized/group_lasso.py @@ -103,8 +103,8 @@ def fit(self, p = self.nfeature - (self.initial_soln, - self.initial_subgrad) = self._solve_randomized_problem( + (self.observed_soln, + self.observed_subgrad) = self._solve_randomized_problem( perturb=perturb, solve_args=solve_args) @@ -124,7 +124,7 @@ def fit(self, for g in sorted(np.unique(self.penalty.groups)): group = self.penalty.groups == g - soln = self.initial_soln + soln = self.observed_soln if np.linalg.norm(soln[group]) * tol * np.linalg.norm(soln): ordered_groups.append(g) ordered_vars.extend(np.nonzero(group)[0]) @@ -184,8 +184,6 @@ def fit(self, for i, var in enumerate(ordered_vars): opt_linear[var, i] += self.ridge_term - opt_offset = self.initial_subgrad - # for group LASSO, we will have # a different sampler for each group # based on conditioning on all scalings @@ -209,9 +207,8 @@ def fit(self, ordered_groups, ordered_vars, opt_linear, - opt_offset, self.observed_score_state, - self.initial_subgrad, + self.observed_subgrad, self.penalty, prec_opt_linear).items(): @@ -232,7 +229,7 @@ def fit(self, log_cond_density, log_det, (np.atleast_2d(regress_opt.T[:,idx_g].dot(dir_g).T), - opt_offset)) + self.observed_subgrad)) self._samplers[group] = sampler self._setup = True @@ -242,8 +239,8 @@ def fit(self, def summary(self, observed_target, group_assignments, - target_cov, - target_score_cov, + cov_target, + cov_target_score, alternatives, parameter=None, level=0.9, @@ -268,8 +265,8 @@ def summary(self, intervals_) = self._inference_for_target( observed_target[group_idx], group, - target_cov[group_idx][:, group_idx], - target_score_cov[group_idx], + cov_target[group_idx][:, group_idx], + cov_target_score[group_idx], [alternatives[i] for i in np.nonzero(group_idx)[0]], parameter=parameter[group_idx], level=level, @@ -284,8 +281,8 @@ def summary(self, def _inference_for_target(self, observed_target, group, - target_cov, - target_score_cov, + cov_target, + cov_target_score, alternatives, opt_sample=None, target_sample=None, @@ -332,8 +329,8 @@ def _inference_for_target(self, ndraw = opt_sample.shape[0] pivots = sampler.coefficient_pvalues(observed_target, - target_cov, - target_score_cov, + cov_target, + cov_target_score, parameter=parameter, sample=(opt_sample, logW), normal_sample=target_sample, @@ -341,8 +338,8 @@ def _inference_for_target(self, if not np.all(parameter == 0): pvalues = sampler.coefficient_pvalues(observed_target, - target_cov, - target_score_cov, + cov_target, + cov_target_score, parameter=np.zeros_like(parameter), sample=(opt_sample, logW), normal_sample=target_sample, @@ -354,8 +351,8 @@ def _inference_for_target(self, if compute_intervals: intervals = sampler.confidence_intervals(observed_target, - target_cov, - target_score_cov, + cov_target, + cov_target_score, sample=(opt_sample, logW), normal_sample=target_sample, level=level) @@ -396,12 +393,12 @@ def _solve_randomized_problem(self, problem = rr.simple_problem(self.loglike, self.penalty) - initial_soln = problem.solve(quad, **solve_args) - initial_subgrad = -(self.loglike.smooth_objective(initial_soln, + observed_soln = problem.solve(quad, **solve_args) + 
observed_subgrad = -(self.loglike.smooth_objective(observed_soln, 'grad') + - quad.objective(initial_soln, 'grad')) + quad.objective(observed_soln, 'grad')) - return initial_soln, initial_subgrad + return observed_soln, observed_subgrad @staticmethod def gaussian(X, @@ -858,7 +855,6 @@ def _reference_density_info(soln, ordered_groups, # ordering is used in assumptions about columns opt_linear ordered_variables, opt_linear, - opt_offset, observed_score_state, observed_subgrad, group_lasso_penalty, @@ -1064,12 +1060,12 @@ def sample(self, ndraw): def selective_MLE(self, observed_target, - target_cov, - target_score_cov, + cov_target, + cov_target_score, # initial (observed) value of optimization variables -- # used as a feasible point. # precise value used only for independent estimator - init_soln, + observed_soln, solve_args={'tol':1.e-12}, level=0.9): diff --git a/selectinf/randomized/lasso.py b/selectinf/randomized/lasso.py index ff41f46de..4936896b1 100644 --- a/selectinf/randomized/lasso.py +++ b/selectinf/randomized/lasso.py @@ -105,12 +105,12 @@ def fit(self, p = self.nfeature - (self.initial_soln, - self.initial_subgrad) = self._solve_randomized_problem( + (self.observed_soln, + self.observed_subgrad) = self._solve_randomized_problem( perturb=perturb, solve_args=solve_args) - active_signs = np.sign(self.initial_soln) + active_signs = np.sign(self.observed_soln) active = self._active = active_signs != 0 self._lagrange = self.penalty.weights @@ -133,8 +133,8 @@ def fit(self, # initial state for opt variables - initial_scalings = np.fabs(self.initial_soln[active]) - initial_unpenalized = self.initial_soln[self._unpenalized] + initial_scalings = np.fabs(self.observed_soln[active]) + initial_unpenalized = self.observed_soln[self._unpenalized] self.observed_opt_state = np.concatenate([initial_scalings, initial_unpenalized]) @@ -227,7 +227,6 @@ def signed_basis_vector(p, j, s): + self.ridge_term * unpenalized_directions) - opt_offset = self.initial_subgrad self.opt_linear = opt_linear # now make the constraints and implied gaussian @@ -238,7 +237,7 @@ def signed_basis_vector(p, j, s): self._setup_sampler_data = (A_scaling[:active.sum()], b_scaling[:active.sum()], opt_linear, - opt_offset) + self.observed_subgrad) if num_opt_var > 0: self._setup_sampler(*self._setup_sampler_data) @@ -261,12 +260,12 @@ def _solve_randomized_problem(self, problem = rr.simple_problem(self.loglike, self.penalty) - initial_soln = problem.solve(quad, **solve_args) - initial_subgrad = -(self.loglike.smooth_objective(initial_soln, + observed_soln = problem.solve(quad, **solve_args) + observed_subgrad = -(self.loglike.smooth_objective(observed_soln, 'grad') + - quad.objective(initial_soln, 'grad')) + quad.objective(observed_soln, 'grad')) - return initial_soln, initial_subgrad + return observed_soln, observed_subgrad @staticmethod def gaussian(X, @@ -888,7 +887,7 @@ def fit(self, def _setup_implied_gaussian(self, opt_linear, - opt_offset, + observed_subgrad, dispersion): # key observation is that the covariance of the added noise is @@ -919,7 +918,7 @@ def _setup_implied_gaussian(self, regress_opt = np.zeros((len(ordered_vars), self.nfeature)) regress_opt[:, ordered_vars] = -cond_cov * signs[None, :] / (dispersion * ratio) - cond_mean = regress_opt.dot(self.observed_score_state + opt_offset) + cond_mean = regress_opt.dot(self.observed_score_state + observed_subgrad) return cond_mean, cond_cov, cond_precision, regress_opt @@ -950,12 +949,12 @@ def _solve_randomized_problem(self, randomized_loss.coef *= inv_frac 
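# A note on the renamed `observed_subgrad`: in each `_solve_randomized_problem`
# above the objective is loglike(beta) + penalty(beta) + quad(beta), with
# quad(beta) = ridge_term * ||beta||^2 / 2 - omega^T beta, so by stationarity
# observed_subgrad = -(grad loglike(observed_soln) + grad quad(observed_soln))
# is the subgradient of the penalty at the randomized solution.  A minimal
# numpy-only sketch of that identity for an orthogonal-design Gaussian LASSO
# (the closed-form soft-thresholding solution below is an illustration-only
# assumption, not part of selectinf):

import numpy as np

rng = np.random.default_rng(0)
p, lam, ridge_term = 5, 1.0, 0.1
y = rng.standard_normal(p)
omega = rng.standard_normal(p)                      # randomization

# loss(beta) = ||y - beta||^2 / 2;  quad(beta) = ridge_term * ||beta||^2 / 2 - omega.dot(beta)
z = y + omega
observed_soln = np.sign(z) * np.maximum(np.fabs(z) - lam, 0) / (1 + ridge_term)

grad_loss = observed_soln - y
grad_quad = ridge_term * observed_soln - omega
observed_subgrad = -(grad_loss + grad_quad)

# active coordinates carry lam * sign(observed_soln); inactive ones stay inside [-lam, lam]
active = observed_soln != 0
assert np.allclose(observed_subgrad[active], lam * np.sign(observed_soln[active]))
assert np.all(np.fabs(observed_subgrad[~active]) <= lam + 1e-12)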
problem = rr.simple_problem(randomized_loss, self.penalty) - initial_soln = problem.solve(quad, **solve_args) - initial_subgrad = -(randomized_loss.smooth_objective(initial_soln, + observed_soln = problem.solve(quad, **solve_args) + observed_subgrad = -(randomized_loss.smooth_objective(observed_soln, 'grad') + - quad.objective(initial_soln, 'grad')) + quad.objective(observed_soln, 'grad')) - return initial_soln, initial_subgrad + return observed_soln, observed_subgrad @staticmethod def gaussian(X, diff --git a/selectinf/randomized/modelQ.py b/selectinf/randomized/modelQ.py index d960af043..62aa37b47 100644 --- a/selectinf/randomized/modelQ.py +++ b/selectinf/randomized/modelQ.py @@ -114,9 +114,9 @@ def fit(self, quad = rr.identity_quadratic(self.ridge_term, 0, -self._initial_omega, 0) quad_data = rr.identity_quadratic(0, 0, -self.X.T.dot(self.y), 0) problem = rr.simple_problem(self.loss, self.penalty) - self.initial_soln = problem.solve(quad + quad_data, **solve_args) + self.observed_soln = problem.solve(quad + quad_data, **solve_args) - active_signs = np.sign(self.initial_soln) + active_signs = np.sign(self.observed_soln) active = self._active = active_signs != 0 self._lagrange = self.penalty.weights @@ -135,13 +135,13 @@ def fit(self, # initial state for opt variables - initial_subgrad = -(self.loss.smooth_objective(self.initial_soln, 'grad') + - quad_data.objective(self.initial_soln, 'grad') + - quad.objective(self.initial_soln, 'grad')) - self.initial_subgrad = initial_subgrad + observed_subgrad = -(self.loss.smooth_objective(self.observed_soln, 'grad') + + quad_data.objective(self.observed_soln, 'grad') + + quad.objective(self.observed_soln, 'grad')) + self.observed_subgrad = observed_subgrad - initial_scalings = np.fabs(self.initial_soln[active]) - initial_unpenalized = self.initial_soln[self._unpenalized] + initial_scalings = np.fabs(self.observed_soln[active]) + initial_unpenalized = self.observed_soln[self._unpenalized] self.observed_opt_state = np.concatenate([initial_scalings, initial_unpenalized]) @@ -210,7 +210,7 @@ def signed_basis_vector(p, j, s): # two transforms that encode score and optimization # variable roles - self.opt_transform = (_opt_linear_term, self.initial_subgrad) + self.opt_transform = (_opt_linear_term, self.observed_subgrad) self.score_transform = (_score_linear_term, np.zeros(_score_linear_term.shape[0])) # now store everything needed for the projections @@ -224,7 +224,7 @@ def signed_basis_vector(p, j, s): # compute implied mean and covariance - opt_linear, opt_offset = self.opt_transform + opt_linear, observed_subgrad = self.opt_transform A_scaling = -np.identity(self.num_opt_var) b_scaling = np.zeros(self.num_opt_var) @@ -232,7 +232,7 @@ def signed_basis_vector(p, j, s): self._setup_sampler(A_scaling, b_scaling, opt_linear, - opt_offset) + observed_subgrad) return active_signs @@ -417,9 +417,9 @@ def selected_targets(self, features=None, dispersion=None): Xfeat = X[:,features] Qfeat = self.Q[features][:,features] - Gfeat = self.loss.smooth_objective(self.initial_soln, 'grad')[features] - Xfeat.T.dot(y) + Gfeat = self.loss.smooth_objective(self.observed_soln, 'grad')[features] - Xfeat.T.dot(y) Qfeat_inv = np.linalg.inv(Qfeat) - one_step = self.initial_soln[features] - Qfeat_inv.dot(Gfeat) + one_step = self.observed_soln[features] - Qfeat_inv.dot(Gfeat) cov_target = Qfeat_inv.dot(Xfeat.T.dot(Xfeat)).dot(Qfeat_inv) _score_linear = -self.Q[features] crosscov_target_score = _score_linear.dot(cov_target) @@ -447,9 +447,9 @@ def full_targets(self, 
features=None, dispersion=None): # target is one-step estimator Qfull = self.Q - G = self.loss.smooth_objective(self.initial_soln, 'grad') - X.T.dot(y) + G = self.loss.smooth_objective(self.observed_soln, 'grad') - X.T.dot(y) Qfull_inv = np.linalg.inv(Qfull) - one_step = self.initial_soln - Qfull_inv.dot(G) + one_step = self.observed_soln - Qfull_inv.dot(G) cov_target = Qfull_inv[features][:,features] observed_target = one_step[features] crosscov_target_score = np.zeros((p, cov_target.shape[0])) diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index a63718aea..ea1d1fbf9 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -109,10 +109,11 @@ def log_posterior(self, log_normalizer = -val - mean_marginal.T.dot(prec_marginal).dot(mean_marginal) / 2. - log_lik = -((self.observed_target - target).T.dot(self._prec).dot(self.observed_target - target)) / 2. \ + _prec = self.prec_target_nosel # shorthand + log_lik = -((self.observed_target - target).T.dot(_prec).dot(self.observed_target - target)) / 2. \ - log_normalizer - grad_lik = self.S.T.dot(self._prec.dot(self.observed_target) - self._prec.dot(target) - self.linear_coef.T.dot( + grad_lik = self.S.T.dot(_prec.dot(self.observed_target) - _prec.dot(target) - self.linear_coef.T.dot( prec_marginal.dot(soln) - conjugate_marginal)) log_prior, grad_prior = self.prior(target_parameter) @@ -130,34 +131,34 @@ def _set_marginal_parameters(self): implied mean as a function of the true parameters. """ - score_decomp = self.cov_target_score.T.dot(self.prec_target) - score_resid = self.score_offset - score_decomp.dot(self.observed_target) + regress_score_target = self.cov_target_score.T.dot(self.prec_target) + resid_score_target = self.score_offset - regress_score_target.dot(self.observed_target) - target_lin = self.regress_opt.dot(score_decomp) - target_off = self.cond_mean - target_lin.dot(self.observed_target) + regress_opt_target = self.regress_opt.dot(regress_score_target) + resid_mean_opt_target = self.cond_mean - regress_opt_target.dot(self.observed_target) - self.linear_coef = target_lin - self.offset_coef = target_off + self.linear_coef = regress_opt_target + self.offset_coef = resid_mean_opt_target if np.asarray(self.randomizer_prec).shape in [(), (0,)]: - _prec = self.prec_target + (score_decomp.T.dot(score_decomp) * self.randomizer_prec) \ - - target_lin.T.dot(self.cond_precision).dot(target_lin) - _P = score_decomp.T.dot(score_resid) * self.randomizer_prec + prec_target_nosel = self.prec_target + (regress_score_target.T.dot(regress_score_target) * self.randomizer_prec) \ + - regress_opt_target.T.dot(self.cond_precision).dot(regress_opt_target) + _P = regress_score_target.T.dot(resid_score_target) * self.randomizer_prec else: - _prec = self.prec_target + (score_decomp.T.dot(self.randomizer_prec).dot(score_decomp)) \ - - target_lin.T.dot(self.cond_precision).dot(target_lin) - _P = score_decomp.T.dot(self.randomizer_prec).dot(score_resid) + prec_target_nosel = self.prec_target + (regress_score_target.T.dot(self.randomizer_prec).dot(regress_score_target)) \ + - regress_opt_target.T.dot(self.cond_precision).dot(regress_opt_target) + _P = regress_score_target.T.dot(self.randomizer_prec).dot(resid_score_target) - _Q = np.linalg.inv(_prec + target_lin.T.dot(self.cond_precision).dot(target_lin)) - self.prec_marginal = self.cond_precision - self.cond_precision.dot(target_lin).dot(_Q).dot(target_lin.T).dot(self.cond_precision) + _Q = 
np.linalg.inv(_prec + regress_opt_target.T.dot(self.cond_precision).dot(regress_opt_target)) + self.prec_marginal = self.cond_precision - self.cond_precision.dot(regress_opt_target).dot(_Q).dot(regress_opt_target.T).dot(self.cond_precision) - r = np.linalg.inv(_prec).dot(target_lin.T.dot(self.cond_precision).dot(target_off) - _P) + r = np.linalg.inv(_prec).dot(regress_opt_target.T.dot(self.cond_precision).dot(resid_mean_opt_target) - _P) S = np.linalg.inv(_prec).dot(self.prec_target) self.r = r self.S = S #print("check parameters for selected+lasso ", np.allclose(np.diag(S), np.ones(S.shape[0])), np.allclose(r, np.zeros(r.shape[0]))) - self._prec = _prec + self.prec_target_nosel = prec_target_nosel ### sampling methods diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index 12fcbc8aa..9859f693f 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -106,7 +106,7 @@ def _setup_sampler(self, linear_part, offset, opt_linear, - opt_offset, + observed_subgrad, # optional dispersion parameter # for covariance of randomization dispersion=1): @@ -119,20 +119,20 @@ def _setup_sampler(self, cond_cov, cond_precision, regress_opt) = self._setup_implied_gaussian(opt_linear, - opt_offset, + observed_subgrad, dispersion) - def log_density(regress_opt, offset, cond_prec, opt, score): + def log_density(regress_opt, u, cond_prec, opt, score): # u == subgrad if score.ndim == 1: - mean_term = regress_opt.dot(score.T + offset).T + mean_term = regress_opt.dot(score.T + u).T else: - mean_term = regress_opt.dot(score.T + offset[:, None]).T + mean_term = regress_opt.dot(score.T + u[:, None]).T arg = opt - mean_term return - 0.5 * np.sum(arg * cond_prec.dot(arg.T).T, 1) log_density = functools.partial(log_density, regress_opt, - opt_offset, + observed_subgrad, cond_precision) @@ -148,14 +148,15 @@ def log_density(regress_opt, offset, cond_prec, opt, score): self.observed_opt_state, self.observed_score_state, log_density, - (regress_opt, opt_offset), + regress_opt, + observed_subgrad, self.randomizer_prec, selection_info=self.selection_variable, useC=self.useC) def _setup_implied_gaussian(self, opt_linear, - opt_offset, + observed_subgrad, # optional dispersion parameter # for covariance of randomization dispersion=1): @@ -174,14 +175,14 @@ def _setup_implied_gaussian(self, # regress_opt is regression coefficient of opt onto score + u... - cond_mean = regress_opt.dot(self.observed_score_state + opt_offset) + cond_mean = regress_opt.dot(self.observed_score_state + observed_subgrad) return cond_mean, cond_cov, cond_precision, regress_opt def summary(self, observed_target, - target_cov, - target_score_cov, + cov_target, + cov_target_score, alternatives, opt_sample=None, target_sample=None, @@ -197,9 +198,9 @@ def summary(self, ---------- observed_target : ndarray Observed estimate of target. - target_cov : ndarray + cov_target : ndarray Estimated covaraince of target. - target_score_cov : ndarray + cov_target_score : ndarray Estimated covariance of target and score of randomized query. 
alternatives : [str], optional Sequence of strings describing the alternatives, @@ -232,8 +233,8 @@ def summary(self, ndraw = opt_sample.shape[0] pivots = self.sampler.coefficient_pvalues(observed_target, - target_cov, - target_score_cov, + cov_target, + cov_target_score, parameter=parameter, sample=(opt_sample, logW), normal_sample=target_sample, @@ -241,8 +242,8 @@ def summary(self, if not np.all(parameter == 0): pvalues = self.sampler.coefficient_pvalues(observed_target, - target_cov, - target_score_cov, + cov_target, + cov_target_score, parameter=np.zeros_like(parameter), sample=(opt_sample, logW), normal_sample=target_sample, @@ -255,14 +256,14 @@ def summary(self, if compute_intervals: MLE = self.selective_MLE(observed_target, - target_cov, - target_score_cov)[0] + cov_target, + cov_target_score)[0] MLE_intervals = np.asarray(MLE[['lower_confidence', 'upper_confidence']]) intervals = self.sampler.confidence_intervals( observed_target, - target_cov, - target_score_cov, + cov_target, + cov_target_score, sample=(opt_sample, logW), normal_sample=target_sample, initial_guess=MLE_intervals, @@ -279,8 +280,8 @@ def summary(self, def selective_MLE(self, observed_target, - target_cov, - target_score_cov, + cov_target, + cov_target_score, level=0.9, solve_args={'tol': 1.e-12}): """ @@ -288,9 +289,9 @@ def selective_MLE(self, ---------- observed_target : ndarray Observed estimate of target. - target_cov : ndarray + cov_target : ndarray Estimated covaraince of target. - target_score_cov : ndarray + cov_target_score : ndarray Estimated covariance of target and score of randomized query. level : float, optional Confidence level. @@ -299,16 +300,16 @@ def selective_MLE(self, """ return self.sampler.selective_MLE(observed_target, - target_cov, - target_score_cov, + cov_target, + cov_target_score, self.observed_opt_state, level=level, solve_args=solve_args) def posterior(self, observed_target, - target_cov, - target_score_cov, + cov_target, + cov_target_score, prior=None, dispersion=None, solve_args={'tol': 1.e-12}): @@ -317,9 +318,9 @@ def posterior(self, ---------- observed_target : ndarray Observed estimate of target. - target_cov : ndarray + cov_target : ndarray Estimated covaraince of target. - target_score_cov : ndarray + cov_target_score : ndarray Estimated covariance of target and score of randomized query. prior : callable A callable object that takes a single argument @@ -336,7 +337,7 @@ def posterior(self, print('Using dispersion parameter 1...') if prior is None: - Di = 1. / (200 * np.diag(target_cov)) + Di = 1. / (200 * np.diag(cov_target)) def prior(target_parameter): grad_prior = -target_parameter * Di @@ -345,16 +346,16 @@ def prior(target_parameter): return posterior(self, observed_target, - target_cov, - target_score_cov, + cov_target, + cov_target_score, prior, dispersion, solve_args=solve_args) def approximate_grid_inference(self, observed_target, - target_cov, - target_score_cov, + cov_target, + cov_target_score, alternatives=None, solve_args={'tol': 1.e-12}): @@ -363,9 +364,9 @@ def approximate_grid_inference(self, ---------- observed_target : ndarray Observed estimate of target. - target_cov : ndarray + cov_target : ndarray Estimated covaraince of target. - target_score_cov : ndarray + cov_target_score : ndarray Estimated covariance of target and score of randomized query. 
alternatives : [str], optional Sequence of strings describing the alternatives, @@ -376,8 +377,8 @@ def approximate_grid_inference(self, G = approximate_grid_inference(self, observed_target, - target_cov, - target_score_cov, + cov_target, + cov_target_score, solve_args=solve_args) return G.summary(alternatives=alternatives) @@ -420,8 +421,8 @@ def fit(self): def summary(self, observed_target, - opt_sampling_info, # a sequence of (target_cov, score_cov) - # objects in theory all target_cov + opt_sampling_info, # a sequence of (cov_target, score_cov) + # objects in theory all cov_target # should be about the same... alternatives=None, parameter=None, @@ -613,7 +614,7 @@ def log_cond_density(self, def hypothesis_test(self, test_stat, observed_value, - target_cov, + cov_target, score_cov, sample_args=(), sample=None, @@ -661,7 +662,7 @@ def hypothesis_test(self, sample_test_stat = np.squeeze(np.array([test_stat(x) for x in sample])) - target_inv_cov = np.linalg.inv(target_cov) + target_inv_cov = np.linalg.inv(cov_target) delta = target_inv_cov.dot(parameter - self.reference) W = np.exp(sample.dot(delta) + logW) @@ -677,7 +678,7 @@ def hypothesis_test(self, def confidence_intervals(self, observed_target, - target_cov, + cov_target, score_cov, sample_args=(), sample=None, @@ -725,7 +726,7 @@ def confidence_intervals(self, _intervals = optimization_intervals([(self, sample, logW, - target_cov, + cov_target, score_cov)], observed_target, ndraw, @@ -747,7 +748,7 @@ def confidence_intervals(self, def coefficient_pvalues(self, observed_target, - target_cov, + cov_target, score_cov, parameter=None, sample_args=(), @@ -795,7 +796,7 @@ def coefficient_pvalues(self, _intervals = optimization_intervals([(self, sample, logW, - target_cov, + cov_target, score_cov)], observed_target, ndraw, @@ -834,8 +835,9 @@ def __init__(self, initial_point, observed_score_state, log_cond_density, - logdens_transform, # described how score enters log_density. - cov_product, # product score_cov.dot(randomizer_prec), + regress_opt, + observed_subgrad, + randomizer_prec, selection_info=None, useC=False): @@ -855,9 +857,9 @@ def __init__(self, $-X^Ty$. log_cond_density : callable Density of optimization variables given score - logdens_transform : tuple - Description of how conditional mean - of optimization variables depends on score. + regress_opt: ndarray + Regression coefficient of opt on to score + observed_subgrad : ndarray selection_info : optional Function of optimization variables that will be conditioned on. @@ -875,9 +877,10 @@ def __init__(self, self.observed_score_state = observed_score_state self.selection_info = selection_info self._log_cond_density = log_cond_density - self.logdens_transform = logdens_transform + self.regress_opt = regress_opt + self.observed_subgrad = observed_subgrad self.useC = useC - self.cov_product = cov_product + self.randomizer_prec = randomizer_prec def log_cond_density(self, opt_sample, @@ -924,12 +927,12 @@ def sample(self, ndraw, burnin): def selective_MLE(self, observed_target, - target_cov, - target_score_cov, + cov_target, + cov_target_score, # initial (observed) value of optimization variables -- # used as a feasible point. # precise value used only for independent estimator - init_soln, + observed_soln, solve_args={'tol': 1.e-12}, level=0.9): """ @@ -939,11 +942,11 @@ def selective_MLE(self, ---------- observed_target : ndarray Observed estimate of target. - target_cov : ndarray + cov_target : ndarray Estimated covaraince of target. 
- target_score_cov : ndarray + cov_target_score : ndarray Estimated covariance of target and score of randomized query. - init_soln : ndarray + observed_soln : ndarray Feasible point for optimization problem. level : float, optional Confidence level. @@ -951,15 +954,15 @@ def selective_MLE(self, Arguments passed to solver. """ - score_offset = self.observed_score_state + self.logdens_transform[1] # logdens_transform[1] is observed_subgrad + score_offset = self.observed_score_state + self.observed_subgrad return selective_MLE(observed_target, - target_cov, - target_score_cov, - init_soln, + cov_target, + cov_target_score, + observed_soln, self.mean, self.covariance, - self.logdens_transform[0], + self.regress_opt, self.affine_con.linear_part, self.affine_con.offset, self.randomizer_prec, @@ -980,14 +983,14 @@ def _log_density_ray(self, if (not hasattr(self, "_direction") or not np.all(self._direction == direction)): - regress_opt, logdens_offset = self.logdens_transform + regress_opt, subgrad = self.regress_opt, self.observed_subgrad if opt_sample.shape[1] == 1: prec = 1. / self.covariance[0, 0] quadratic_term = regress_opt.dot(direction) ** 2 * prec arg = (opt_sample[:, 0] - - regress_opt.dot(nuisance + logdens_offset) - + regress_opt.dot(nuisance + subgrad) - regress_opt.dot(direction) * gaussian_sample) linear_term = -regress_opt.dot(direction) * prec * arg constant_term = arg ** 2 * prec @@ -999,19 +1002,19 @@ def _log_density_ray(self, self._direction = direction.copy() # density is a Gaussian evaluated at - # O_i - A(N + (Z_i + theta) * gamma + b) + # O_i - A(N + (Z_i + theta) * gamma + u) - # b is logdens_offset + # u is observed_subgrad # A is regress_opt # Z_i is gaussian_sample[i] (real-valued) # gamma is direction # O_i is opt_sample[i] # let arg1 = O_i - # let arg2 = A(N+b + Z_i \cdot gamma) + # let arg2 = A(N+u + Z_i \cdot gamma) # then it is of the form (arg1 - arg2 - theta * A gamma) - regress_opt, logdens_offset = self.logdens_transform + regress_opt, subgrad = self.regress_opt, self.observed_subgrad cov = self.covariance prec = np.linalg.inv(cov) linear_part = -regress_opt.dot(direction) # -A gamma @@ -1024,7 +1027,7 @@ def _log_density_ray(self, arg1 = opt_sample.T arg2 = -regress_opt.dot(np.multiply.outer(direction, gaussian_sample) + - (nuisance + logdens_offset)[:, None]) + (nuisance + subgrad)[:, None]) arg = arg1 + arg2 linear_term = -regress_opt.T.dot(prec).dot(arg) constant_term = np.sum(prec.dot(arg) * arg, 0) @@ -1048,13 +1051,13 @@ def __init__(self, # (opt_sampler, # opt_sample, # opt_logweights, - # target_cov, + # cov_target, # score_cov) objects - # in theory all target_cov + # in theory all cov_target # should be about the same... 
observed, nsample, # how large a normal sample - target_cov=None, + cov_target=None, normal_sample=None): # not all opt_samples will be of the same size as nsample @@ -1112,16 +1115,16 @@ def __init__(self, # average covariances in case they might be different - if target_cov is None: - self.target_cov = 0 - for _, _, _, target_cov, _ in opt_sampling_info: - self.target_cov += target_cov - self.target_cov /= len(opt_sampling_info) + if cov_target is None: + self.cov_target = 0 + for _, _, _, cov_target, _ in opt_sampling_info: + self.cov_target += cov_target + self.cov_target /= len(opt_sampling_info) if normal_sample is None: self._normal_sample = np.random.multivariate_normal( - mean=np.zeros(self.target_cov.shape[0]), - cov=self.target_cov, + mean=np.zeros(self.cov_target.shape[0]), + cov=self.cov_target, size=(nsample,)) else: self._normal_sample = normal_sample @@ -1144,7 +1147,7 @@ def pivot(self, observed_stat = self.observed.dot(linear_func) sample_stat = self._normal_sample.dot(linear_func) - target_cov = linear_func.dot(self.target_cov.dot(linear_func)) + cov_target = linear_func.dot(self.cov_target.dot(linear_func)) nuisance = [] translate_dirs = [] @@ -1153,18 +1156,18 @@ def pivot(self, opt_sample, _, _, - target_score_cov) in self.opt_sampling_info: - cur_score_cov = linear_func.dot(target_score_cov) + cov_target_score) in self.opt_sampling_info: + cur_score_cov = linear_func.dot(cov_target_score) # cur_nuisance is in the view's score coordinates - cur_nuisance = opt_sampler.observed_score_state - cur_score_cov * observed_stat / target_cov + cur_nuisance = opt_sampler.observed_score_state - cur_score_cov * observed_stat / cov_target nuisance.append(cur_nuisance) - translate_dirs.append(cur_score_cov / target_cov) + translate_dirs.append(cur_score_cov / cov_target) weights = self._weights(sample_stat, # normal sample candidate, # candidate value nuisance, # nuisance sufficient stats for each view - translate_dirs) # points will be moved like sample * target_score_cov + translate_dirs) # points will be moved like sample * cov_target_score pivot = np.mean((sample_stat + candidate <= observed_stat) * weights) / np.mean(weights) @@ -1307,9 +1310,9 @@ def naive_pvalues(diag_cov, observed, parameter): return pvalues def selective_MLE(observed_target, - target_cov, - target_score_cov, - init_soln, # initial (observed) value of + cov_target, + cov_target_score, + observed_soln, # initial (observed) value of # optimization variables -- used as a # feasible point. precise value used # only for independent estimator @@ -1330,11 +1333,11 @@ def selective_MLE(observed_target, ---------- observed_target : ndarray Observed estimate of target. - target_cov : ndarray + cov_target : ndarray Estimated covaraince of target. - target_score_cov : ndarray + cov_target_score : ndarray Estimated covariance of target and score of randomized query. - init_soln : ndarray + observed_soln : ndarray Feasible point for optimization problem. cond_mean : ndarray Conditional mean of optimization variables given target. 
@@ -1359,31 +1362,32 @@ def selective_MLE(observed_target, raise ValueError('no target specified') observed_target = np.atleast_1d(observed_target) - prec_target = np.linalg.inv(target_cov) + prec_target = np.linalg.inv(cov_target) prec_opt = np.linalg.inv(cond_cov) - # target_lin determines how the conditional mean of optimization variables + # regress_opt_target determines how the conditional mean of optimization variables # vary with target # regress_opt determines how the argument of the optimization density # depends on the score, not how the mean depends on score, hence the minus sign - score_decomp = target_score_cov.T.dot(prec_target) - score_resid = score_offset - score_decomp.dot(observed_target) + regress_score_target = cov_target_score.T.dot(prec_target) + resid_score_target = score_offset - regress_score_target.dot(observed_target) + + regress_opt_target = regress_opt.dot(regress_score_target) + resid_mean_opt_target = cond_mean - regress_opt_target.dot(observed_target) - target_lin = regress_opt.dot(score_decomp) - target_off = cond_mean - target_lin.dot(observed_target) if np.asarray(randomizer_prec).shape in [(), (0,)]: - _P = score_decomp.T.dot(score_resid) * randomizer_prec - _prec = prec_target + (score_decomp.T.dot(score_decomp) * randomizer_prec) - target_lin.T.dot(prec_opt).dot( - target_lin) + _P = regress_score_target.T.dot(resid_score_target) * randomizer_prec + prec_target_nosel = prec_target + (regress_score_target.T.dot(regress_score_target) * randomizer_prec) - regress_opt_target.T.dot(prec_opt).dot( + regress_opt_target) else: - _P = score_decomp.T.dot(randomizer_prec).dot(score_resid) - _prec = prec_target + (score_decomp.T.dot(randomizer_prec).dot(score_decomp)) - target_lin.T.dot( - prec_opt).dot(target_lin) + _P = regress_score_target.T.dot(randomizer_prec).dot(resid_score_target) + prec_target_nosel = prec_target + (regress_score_target.T.dot(randomizer_prec).dot(regress_score_target)) - regress_opt_target.T.dot( + prec_opt).dot(regress_opt_target) - C = target_cov.dot(_P - target_lin.T.dot(prec_opt).dot(target_off)) + C = cov_target.dot(_P - regress_opt_target.T.dot(prec_opt).dot(resid_mean_opt_target)) conjugate_arg = prec_opt.dot(cond_mean) @@ -1394,21 +1398,21 @@ def selective_MLE(observed_target, val, soln, hess = solver(conjugate_arg, prec_opt, - init_soln, + observed_soln, linear_part, offset, **solve_args) - final_estimator = target_cov.dot(_prec).dot(observed_target) \ - + target_cov.dot(target_lin.T.dot(prec_opt.dot(cond_mean - soln))) + C + final_estimator = cov_target.dot(prec_target_nosel).dot(observed_target) \ + + cov_target.dot(regress_opt_target.T.dot(prec_opt.dot(cond_mean - soln))) + C - unbiased_estimator = target_cov.dot(_prec).dot(observed_target) + target_cov.dot( - _P - target_lin.T.dot(prec_opt).dot(target_off)) + unbiased_estimator = cov_target.dot(prec_target_nosel).dot(observed_target) + cov_target.dot( + _P - regress_opt_target.T.dot(prec_opt).dot(resid_mean_opt_target)) - L = target_lin.T.dot(prec_opt) - observed_info_natural = _prec + L.dot(target_lin) - L.dot(hess.dot(L.T)) + L = regress_opt_target.T.dot(prec_opt) + observed_info_natural = prec_target_nosel + L.dot(regress_opt_target) - L.dot(hess.dot(L.T)) - observed_info_mean = target_cov.dot(observed_info_natural.dot(target_cov)) + observed_info_mean = cov_target.dot(observed_info_natural.dot(cov_target)) Z_scores = final_estimator / np.sqrt(np.diag(observed_info_mean)) diff --git a/selectinf/randomized/screening.py b/selectinf/randomized/screening.py index 
b87ae0027..db6602cc4 100644 --- a/selectinf/randomized/screening.py +++ b/selectinf/randomized/screening.py @@ -108,9 +108,9 @@ def fit(self, perturb=None): opt_linear = np.zeros((p, self.num_opt_var)) opt_linear[self._selected] = np.diag(active_signs) - opt_offset = np.zeros(p) - opt_offset[self._selected] = active_signs * self.threshold[self._selected] - opt_offset[self._not_selected] = _randomized_score[self._not_selected] + observed_subgrad = np.zeros(p) + observed_subgrad[self._selected] = active_signs * self.threshold[self._selected] + observed_subgrad[self._not_selected] = _randomized_score[self._not_selected] self._setup = True @@ -120,7 +120,7 @@ def fit(self, perturb=None): self._setup_sampler(A_scaling, b_scaling, opt_linear, - opt_offset) + observed_subgrad) return self._selected @@ -211,9 +211,9 @@ def fit(self, perturb=None): for j in range(self.num_opt_var): opt_linear[selected_idx[j], j] = active_signs[j] - opt_offset = np.zeros(p) - opt_offset[self._selected] = active_signs * last_cutoff - opt_offset[self._not_selected] = _randomized_score[self._not_selected] + observed_subgrad = np.zeros(p) + observed_subgrad[self._selected] = active_signs * last_cutoff + observed_subgrad[self._not_selected] = _randomized_score[self._not_selected] self._setup = True @@ -223,7 +223,7 @@ def fit(self, perturb=None): self._setup_sampler(A_scaling, b_scaling, opt_linear, - opt_offset) + observed_subgrad) else: self._selected = np.zeros(p, np.bool) return self._selected @@ -328,7 +328,7 @@ def fit(self, perturb=None): opt_linear = np.zeros((p, self.num_opt_var)) opt_linear[self._selected] = np.diag(topK_signs) - opt_offset = np.zeros(p) + observed_subgrad = np.zeros(p) else: @@ -346,7 +346,7 @@ def fit(self, perturb=None): opt_linear = np.zeros((p, self.num_opt_var)) opt_linear[self._selected] = np.identity(self.num_opt_var) - opt_offset = np.zeros(p) + observed_subgrad = np.zeros(p) # in both cases, this conditioning means we just need to compute # the observed lower bound @@ -360,7 +360,7 @@ def fit(self, perturb=None): self._setup_sampler(A_scaling, b_scaling, opt_linear, - opt_offset) + observed_subgrad) return self._selected diff --git a/selectinf/randomized/slope.py b/selectinf/randomized/slope.py index 854148b54..5f88676e8 100644 --- a/selectinf/randomized/slope.py +++ b/selectinf/randomized/slope.py @@ -81,22 +81,22 @@ def _solve_randomized_problem(self, quad = rr.identity_quadratic(self.ridge_term, 0, -self._initial_omega, 0) problem = rr.simple_problem(self.loglike, self.penalty) - initial_soln = problem.solve(quad, **solve_args) - initial_subgrad = -(self.loglike.smooth_objective(initial_soln, 'grad') + - quad.objective(initial_soln, 'grad')) + observed_soln = problem.solve(quad, **solve_args) + observed_subgrad = -(self.loglike.smooth_objective(observed_soln, 'grad') + + quad.objective(observed_soln, 'grad')) - return initial_soln, initial_subgrad + return observed_soln, observed_subgrad def fit(self, solve_args={'tol': 1.e-12, 'min_its': 50}, perturb=None): - self.initial_soln, self.initial_subgrad = self._solve_randomized_problem(perturb=perturb, solve_args=solve_args) - p = self.initial_soln.shape[0] + self.observed_soln, self.observed_subgrad = self._solve_randomized_problem(perturb=perturb, solve_args=solve_args) + p = self.observed_soln.shape[0] # now we have to work out SLOPE details, clusters, etc. 
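# For intuition about the "clusters" mentioned above: the conditioning for SLOPE
# is on the distinct nonzero magnitudes of `observed_soln`, with one optimization
# variable per cluster of coordinates sharing an absolute value.  A toy,
# numpy-only illustration (the solution vector is made up purely to show the
# bookkeeping; it does not come from an actual SLOPE fit):

import numpy as np

observed_soln = np.array([0.0, -1.5, 0.7, 1.5, 0.0, -0.7])

indices = np.argsort(-np.fabs(observed_soln))       # coordinates ordered by decreasing magnitude
sorted_soln = observed_soln[indices]
active = observed_soln != 0
initial_scalings = np.sort(np.unique(np.fabs(observed_soln[active])))[::-1]

print(initial_scalings)                             # [1.5 0.7] -- one variable per cluster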
- active_signs = np.sign(self.initial_soln) + active_signs = np.sign(self.observed_soln) active = self._active = active_signs != 0 self._overall = overall = active> 0 @@ -107,9 +107,9 @@ def fit(self, 'variables': self._overall} - indices = np.argsort(-np.fabs(self.initial_soln)) - sorted_soln = self.initial_soln[indices] - initial_scalings = np.sort(np.unique(np.fabs(self.initial_soln[active])))[::-1] + indices = np.argsort(-np.fabs(self.observed_soln)) + sorted_soln = self.observed_soln[indices] + initial_scalings = np.sort(np.unique(np.fabs(self.observed_soln[active])))[::-1] self.observed_opt_state = initial_scalings self._unpenalized = np.zeros(p, np.bool) @@ -141,7 +141,7 @@ def fit(self, cur_indx = j + 1 sign_vec = np.zeros(p) sign_vec[np.arange(j + 1 - cur_indx_array[pointer]) + cur_indx_array[pointer]] = \ - np.sign(self.initial_soln[indices[np.arange(j + 1 - cur_indx_array[pointer]) + cur_indx_array[pointer]]]) + np.sign(self.observed_soln[indices[np.arange(j + 1 - cur_indx_array[pointer]) + cur_indx_array[pointer]]]) signs_cluster.append(sign_vec) pointer = pointer + 1 if sorted_soln[j + 1] == 0: @@ -156,7 +156,6 @@ def fit(self, _opt_linear_term = X.T.dot(X_clustered) _, prec = self.randomizer.cov_prec - opt_linear, opt_offset = (_opt_linear_term, self.initial_subgrad) # now make the constraints @@ -170,8 +169,8 @@ def fit(self, self._setup_sampler(A_scaling, b_scaling, - opt_linear, - opt_offset) + _opt_linear_term, + self.observed_subgrad) return active_signs From 175cae75b182bd470fcd7f7d99180abe1202c67a Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Mon, 12 Jul 2021 18:31:12 -0700 Subject: [PATCH 106/187] finished rename, and rewrite in terms of regression parameters for LASSO --- selectinf/randomized/approx_reference.py | 6 +- .../randomized/approx_reference_grouplasso.py | 30 ++-- selectinf/randomized/exact_reference.py | 6 +- selectinf/randomized/lasso.py | 34 +++- selectinf/randomized/posterior_inference.py | 14 +- selectinf/randomized/query.py | 152 ++++++++++-------- .../tests/test_selective_MLE_high.py | 4 +- 7 files changed, 149 insertions(+), 97 deletions(-) diff --git a/selectinf/randomized/approx_reference.py b/selectinf/randomized/approx_reference.py index 06eb5cd54..40e7363c4 100644 --- a/selectinf/randomized/approx_reference.py +++ b/selectinf/randomized/approx_reference.py @@ -56,7 +56,7 @@ def __init__(self, self.observed_soln = query.observed_opt_state - self.randomizer_prec = query.sampler.randomizer_prec + self.prec_randomizer = query.sampler.prec_randomizer self.score_offset = query.observed_score_state + query.sampler.logdens_transform[1] self.ntarget = ntarget = cov_target.shape[0] @@ -293,10 +293,10 @@ def _construct_density(self): regress_opt_target = self.regress_opt.dot(regress_score_target) resid_mean_opt_target = (self.cond_mean - regress_opt_target.dot(observed_target_uni)).reshape((regress_opt_target.shape[0],)) - prec_target_nosel = prec_target + (regress_score_target.T.dot(regress_score_target) * self.randomizer_prec) - regress_opt_target.T.dot( + prec_target_nosel = prec_target + (regress_score_target.T.dot(regress_score_target) * self.prec_randomizer) - regress_opt_target.T.dot( self.prec_opt).dot(regress_opt_target) - _P = regress_score_target.T.dot(resid_score_target) * self.randomizer_prec + _P = regress_score_target.T.dot(resid_score_target) * self.prec_randomizer _r = (1. 
/ _prec).dot(regress_opt_target.T.dot(self.prec_opt).dot(resid_mean_opt_target) - _P) _S = np.linalg.inv(_prec).dot(prec_target) diff --git a/selectinf/randomized/approx_reference_grouplasso.py b/selectinf/randomized/approx_reference_grouplasso.py index 3909a2a56..5d90e981b 100644 --- a/selectinf/randomized/approx_reference_grouplasso.py +++ b/selectinf/randomized/approx_reference_grouplasso.py @@ -75,7 +75,7 @@ def fit(self, tol = 1.e-20 - _, self.randomizer_prec = self.randomizer.cov_prec + _, self.prec_randomizer = self.randomizer.cov_prec # now we are collecting the directions and norms of the active groups for g in sorted(np.unique(self.groups)): # g is group label @@ -314,14 +314,14 @@ def selective_MLE(self, regress_opt_target = regress_opt.dot(regress_score_target) resid_mean_opt_target = cond_mean - regress_opt_target.dot(observed_target) - if np.asarray(self.randomizer_prec).shape in [(), (0,)]: - _P = regress_score_target.T.dot(resid_score_target) * self.randomizer_prec - _prec = prec_target + (regress_score_target.T.dot(regress_score_target) * self.randomizer_prec) - regress_opt_target.T.dot( + if np.asarray(self.prec_randomizer).shape in [(), (0,)]: + _P = regress_score_target.T.dot(resid_score_target) * self.prec_randomizer + prec_target_nosel = prec_target + (regress_score_target.T.dot(regress_score_target) * self.prec_randomizer) - regress_opt_target.T.dot( prec_opt).dot( regress_opt_target) else: - _P = regress_score_target.T.dot(self.randomizer_prec).dot(resid_score_target) - _prec = prec_target + (regress_score_target.T.dot(self.randomizer_prec).dot(regress_score_target)) - regress_opt_target.T.dot( + _P = regress_score_target.T.dot(self.prec_randomizer).dot(resid_score_target) + prec_target_nosel = prec_target + (regress_score_target.T.dot(self.prec_randomizer).dot(regress_score_target)) - regress_opt_target.T.dot( prec_opt).dot(regress_opt_target) C = cov_target.dot(_P - regress_opt_target.T.dot(prec_opt).dot(resid_mean_opt_target)) @@ -338,14 +338,14 @@ def selective_MLE(self, useJacobian, **solve_args) - final_estimator = cov_target.dot(_prec).dot(observed_target) \ + final_estimator = cov_target.dot(prec_target_nosel).dot(observed_target) \ + cov_target.dot(regress_opt_target.T.dot(prec_opt.dot(cond_mean - soln))) + C - unbiased_estimator = cov_target.dot(_prec).dot(observed_target) + cov_target.dot( + unbiased_estimator = cov_target.dot(prec_target_nosel).dot(observed_target) + cov_target.dot( _P - regress_opt_target.T.dot(prec_opt).dot(resid_mean_opt_target)) L = regress_opt_target.T.dot(prec_opt) - observed_info_natural = _prec + L.dot(regress_opt_target) - L.dot(hess.dot(L.T)) + observed_info_natural = prec_target_nosel + L.dot(regress_opt_target) - L.dot(hess.dot(L.T)) observed_info_mean = cov_target.dot(observed_info_natural.dot(cov_target)) @@ -448,7 +448,7 @@ def __init__(self, self.observed_soln = query.observed_opt_state - self.randomizer_prec = query.randomizer_prec + self.prec_randomizer = query.prec_randomizer self.score_offset = query.observed_score_state + query.observed_subgrad self.ntarget = ntarget = cov_target.shape[0] @@ -688,16 +688,16 @@ def _construct_density(self): regress_opt_target = self.regress_opt.dot(regress_score_target) resid_mean_opt_target = (self.cond_mean - regress_opt_target.dot(observed_target_uni)).reshape((regress_opt_target.shape[0],)) - _prec = prec_target + (regress_score_target.T.dot(regress_score_target) * self.randomizer_prec) - regress_opt_target.T.dot( + prec_target_nosel = prec_target + 
(regress_score_target.T.dot(regress_score_target) * self.prec_randomizer) - regress_opt_target.T.dot( self.prec_opt).dot(regress_opt_target) - _P = regress_score_target.T.dot(resid_score_target) * self.randomizer_prec - _r = (1. / _prec).dot(regress_opt_target.T.dot(self.prec_opt).dot(resid_mean_opt_target) - _P) - _S = np.linalg.inv(_prec).dot(prec_target) + _P = regress_score_target.T.dot(resid_score_target) * self.prec_randomizer + _r = (1. / prec_target_nosel).dot(regress_opt_target.T.dot(self.prec_opt).dot(resid_mean_opt_target) - _P) + _S = np.linalg.inv(prec_target_nosel).dot(prec_target) S[m] = _S r[m] = _r - precs[m] = _prec + precs[m] = prec_target_nosel self.precs = precs self.S = S diff --git a/selectinf/randomized/exact_reference.py b/selectinf/randomized/exact_reference.py index fe7cc0885..018d19074 100644 --- a/selectinf/randomized/exact_reference.py +++ b/selectinf/randomized/exact_reference.py @@ -53,7 +53,7 @@ def __init__(self, self.observed_soln = query.observed_opt_state - self.randomizer_prec = query.sampler.randomizer_prec + self.prec_randomizer = query.sampler.prec_randomizer self.score_offset = query.observed_score_state + query.sampler.logdens_transform[1] self.ntarget = ntarget = cov_target.shape[0] @@ -305,10 +305,10 @@ def _construct_density(self): regress_opt_target = self.regress_opt.dot(regress_score_target) resid_mean_opt_target = (self.cond_mean - regress_opt_target.dot(observed_target_uni)).reshape((regress_opt_target.shape[0],)) - prec_target_nosel = prec_target + (regress_score_target.T.dot(regress_score_target) * self.randomizer_prec) - regress_opt_target.T.dot( + prec_target_nosel = prec_target + (regress_score_target.T.dot(regress_score_target) * self.prec_randomizer) - regress_opt_target.T.dot( self.prec_opt).dot(regress_opt_target) - _P = regress_score_target.T.dot(resid_score_target) * self.randomizer_prec + _P = regress_score_target.T.dot(resid_score_target) * self.prec_randomizer _r = (1. 
/ _prec).dot(regress_opt_target.T.dot(self.prec_opt).dot(resid_mean_opt_target) - _P) _S = np.linalg.inv(_prec).dot(prec_target) diff --git a/selectinf/randomized/lasso.py b/selectinf/randomized/lasso.py index 4936896b1..6b473cd56 100644 --- a/selectinf/randomized/lasso.py +++ b/selectinf/randomized/lasso.py @@ -164,10 +164,12 @@ def fit(self, X, y = self.loglike.data linpred = X.dot(beta_bar) n = linpred.shape[0] + if hasattr(self.loglike.saturated_loss, "hessian"): # a GLM -- all we need is W W = self._W = self.loglike.saturated_loss.hessian(linpred) _hessian_active = np.dot(X.T, X[:, active] * W[:, None]) _hessian_unpen = np.dot(X.T, X[:, unpenalized] * W[:, None]) + _hessian = np.dot(X.T, X * W[:, None]) # CAREFUL -- this will be big elif hasattr(self.loglike.saturated_loss, "hessian_mult"): active_right = np.zeros((n, active.sum())) for i, j in enumerate(np.nonzero(active)[0]): @@ -181,6 +183,12 @@ def fit(self, case_weights=self.loglike.saturated_loss.case_weights) _hessian_active = X.T.dot(active_right) _hessian_unpen = X.T.dot(unpen_right) + _hessian = [] + for i in range(p): + _hessian.append(self.loglike.saturated_loss.hessian_mult(linpred, + X[:,i], + case_weights=self.loglike.saturated_loss.case_weights)) + _hessian = X.T.dot(np.array(_hessian).T) else: raise ValueError('saturated_loss has no hessian or hessian_mult method') @@ -238,6 +246,19 @@ def signed_basis_vector(p, j, s): b_scaling[:active.sum()], opt_linear, self.observed_subgrad) + + #### to be fixed -- set the cov_score here without dispersion + + self._cov_randomizer, prec = self.randomizer.cov_prec + self._prod_score_prec_unnorm = _hessian + + if np.asarray(prec).shape in [(), (0,)]: + self._prod_score_prec_unnorm *= prec + else: + self._prod_score_prec_unnorm = self._prod_score_prec_unnorm.dot(prec) + + ##### + if num_opt_var > 0: self._setup_sampler(*self._setup_sampler_data) @@ -721,7 +742,9 @@ def selected_targets(loglike, dispersion = ((y - loglike.saturated_loss.mean_function( Xfeat.dot(observed_target))) ** 2 / W).sum() / (n - Xfeat.shape[1]) - return observed_target, cov_target * dispersion, crosscov_target_score.T * dispersion, alternatives + regress_target_score = np.zeros((cov_target.shape[0], p)) + regress_target_score[:,features] = cov_target + return observed_target, cov_target * dispersion, regress_target_score, alternatives def full_targets(loglike, W, @@ -756,7 +779,8 @@ def full_targets(loglike, (n - p)) alternatives = ['twosided'] * features.sum() - return observed_target, cov_target * dispersion, crosscov_target_score.T * dispersion, alternatives + regress_target_score = Qfull_inv[features] # weights missing? 
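# The target helpers now hand back a regression coefficient rather than a
# covariance: the third return value, `regress_target_score`, is the coefficient
# of the target regressed onto the internally used score, matching the rewrite
# "in terms of regression parameters".  A shape-only sketch of the
# selected-target construction above (`Qfeat_inv` is a fabricated stand-in for
# (X_E^T W X_E)^{-1}, used here only to illustrate the zero padding):

import numpy as np

p = 6
features = np.array([True, False, True, False, False, True])
nfeat = features.sum()

Qfeat_inv = np.linalg.inv(np.eye(nfeat) + 0.1)      # placeholder for (X_E^T W X_E)^{-1}
cov_target = Qfeat_inv

regress_target_score = np.zeros((cov_target.shape[0], p))
regress_target_score[:, features] = cov_target      # zero columns off the selected set

print(regress_target_score.shape)                   # (3, 6)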
+ return observed_target, cov_target * dispersion, regress_target_score, alternatives def debiased_targets(loglike, W, @@ -811,7 +835,7 @@ def debiased_targets(loglike, (n - features.sum())) alternatives = ['twosided'] * features.sum() - return observed_target, cov_target * dispersion, crosscov_target_score.T * dispersion, alternatives + return observed_target, cov_target * dispersion, Qinv_hat, alternatives def form_targets(target, loglike, @@ -920,7 +944,9 @@ def _setup_implied_gaussian(self, regress_opt[:, ordered_vars] = -cond_cov * signs[None, :] / (dispersion * ratio) cond_mean = regress_opt.dot(self.observed_score_state + observed_subgrad) - return cond_mean, cond_cov, cond_precision, regress_opt + prod_score_prec = np.identity(self.nfeature) / ratio + + return cond_mean, cond_cov, cond_precision, regress_opt, prod_score_prec def _solve_randomized_problem(self, # optional binary vector diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index ea1d1fbf9..44f981561 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -41,7 +41,7 @@ def __init__(self, linear_part = query.sampler.affine_con.linear_part offset = query.sampler.affine_con.offset regress_opt = query.sampler.logdens_transform[0] - _, randomizer_prec = query.randomizer.cov_prec + _, prec_randomizer = query.randomizer.cov_prec score_offset = query.observed_score_state + query.sampler.logdens_transform[1] result, self.inverse_info, log_ref = query.selective_MLE(observed_target, @@ -60,7 +60,7 @@ def __init__(self, self.observed_target = observed_target self.cov_target_score = cov_target_score self.regress_opt = regress_opt - self.randomizer_prec = randomizer_prec + self.prec_randomizer = prec_randomizer self.score_offset = score_offset self.feasible_point = query.observed_opt_state @@ -140,14 +140,14 @@ def _set_marginal_parameters(self): self.linear_coef = regress_opt_target self.offset_coef = resid_mean_opt_target - if np.asarray(self.randomizer_prec).shape in [(), (0,)]: - prec_target_nosel = self.prec_target + (regress_score_target.T.dot(regress_score_target) * self.randomizer_prec) \ + if np.asarray(self.prec_randomizer).shape in [(), (0,)]: + prec_target_nosel = self.prec_target + (regress_score_target.T.dot(regress_score_target) * self.prec_randomizer) \ - regress_opt_target.T.dot(self.cond_precision).dot(regress_opt_target) - _P = regress_score_target.T.dot(resid_score_target) * self.randomizer_prec + _P = regress_score_target.T.dot(resid_score_target) * self.prec_randomizer else: - prec_target_nosel = self.prec_target + (regress_score_target.T.dot(self.randomizer_prec).dot(regress_score_target)) \ + prec_target_nosel = self.prec_target + (regress_score_target.T.dot(self.prec_randomizer).dot(regress_score_target)) \ - regress_opt_target.T.dot(self.cond_precision).dot(regress_opt_target) - _P = regress_score_target.T.dot(self.randomizer_prec).dot(resid_score_target) + _P = regress_score_target.T.dot(self.prec_randomizer).dot(resid_score_target) _Q = np.linalg.inv(_prec + regress_opt_target.T.dot(self.cond_precision).dot(regress_opt_target)) self.prec_marginal = self.cond_precision - self.cond_precision.dot(regress_opt_target).dot(_Q).dot(regress_opt_target.T).dot(self.cond_precision) diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index 9859f693f..c284b59ec 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -118,9 +118,10 @@ def _setup_sampler(self, 
(cond_mean, cond_cov, cond_precision, - regress_opt) = self._setup_implied_gaussian(opt_linear, - observed_subgrad, - dispersion) + regress_opt, + prod_score_prec) = self._setup_implied_gaussian(opt_linear, + observed_subgrad, + dispersion) def log_density(regress_opt, u, cond_prec, opt, score): # u == subgrad if score.ndim == 1: @@ -135,9 +136,8 @@ def log_density(regress_opt, u, cond_prec, opt, score): # u == subgrad observed_subgrad, cond_precision) - - _, randomizer_prec = self.randomizer.cov_prec - self.cond_mean, self.cond_cov, self.randomizer_prec = cond_mean, cond_cov, randomizer_prec + cov_randomizer = self._cov_randomizer + self.cond_mean, self.cond_cov, self.cov_randomizer = cond_mean, cond_cov, cov_randomizer affine_con = constraints(A, b, @@ -150,7 +150,9 @@ def log_density(regress_opt, u, cond_prec, opt, score): # u == subgrad log_density, regress_opt, observed_subgrad, - self.randomizer_prec, + cov_randomizer, # \Sigma_{\omega} + opt_linear, # L + prod_score_prec, # \Sigma_S \Theta_{\omega} selection_info=self.selection_variable, useC=self.useC) @@ -164,6 +166,8 @@ def _setup_implied_gaussian(self, _, prec = self.randomizer.cov_prec prec = prec / dispersion + prod_score_prec = self._prod_score_prec_unnorm * dispersion # this is usually unnormalized by dispersion + if np.asarray(prec).shape in [(), (0,)]: cond_precision = opt_linear.T.dot(opt_linear) * prec cond_cov = np.linalg.inv(cond_precision) @@ -172,17 +176,17 @@ def _setup_implied_gaussian(self, cond_precision = opt_linear.T.dot(prec.dot(opt_linear)) cond_cov = np.linalg.inv(cond_precision) regress_opt = -cond_cov.dot(opt_linear.T).dot(prec) - + # regress_opt is regression coefficient of opt onto score + u... cond_mean = regress_opt.dot(self.observed_score_state + observed_subgrad) - return cond_mean, cond_cov, cond_precision, regress_opt + return cond_mean, cond_cov, cond_precision, regress_opt, prod_score_prec def summary(self, observed_target, cov_target, - cov_target_score, + regress_target_score, alternatives, opt_sample=None, target_sample=None, @@ -200,8 +204,8 @@ def summary(self, Observed estimate of target. cov_target : ndarray Estimated covaraince of target. - cov_target_score : ndarray - Estimated covariance of target and score of randomized query. + regress_target_score : ndarray + Estimated regression coefficient of target on score. 
alternatives : [str], optional Sequence of strings describing the alternatives, should be values of ['twosided', 'less', 'greater'] @@ -234,7 +238,7 @@ def summary(self, pivots = self.sampler.coefficient_pvalues(observed_target, cov_target, - cov_target_score, + regress_target_score, parameter=parameter, sample=(opt_sample, logW), normal_sample=target_sample, @@ -243,7 +247,7 @@ def summary(self, if not np.all(parameter == 0): pvalues = self.sampler.coefficient_pvalues(observed_target, cov_target, - cov_target_score, + regress_target_score, parameter=np.zeros_like(parameter), sample=(opt_sample, logW), normal_sample=target_sample, @@ -257,13 +261,13 @@ def summary(self, if compute_intervals: MLE = self.selective_MLE(observed_target, cov_target, - cov_target_score)[0] + regress_target_score)[0] MLE_intervals = np.asarray(MLE[['lower_confidence', 'upper_confidence']]) intervals = self.sampler.confidence_intervals( observed_target, cov_target, - cov_target_score, + regress_target_score, sample=(opt_sample, logW), normal_sample=target_sample, initial_guess=MLE_intervals, @@ -281,7 +285,7 @@ def summary(self, def selective_MLE(self, observed_target, cov_target, - cov_target_score, + regress_target_score, level=0.9, solve_args={'tol': 1.e-12}): """ @@ -291,7 +295,7 @@ def selective_MLE(self, Observed estimate of target. cov_target : ndarray Estimated covaraince of target. - cov_target_score : ndarray + regress_target_score : ndarray Estimated covariance of target and score of randomized query. level : float, optional Confidence level. @@ -301,7 +305,7 @@ def selective_MLE(self, return self.sampler.selective_MLE(observed_target, cov_target, - cov_target_score, + regress_target_score, self.observed_opt_state, level=level, solve_args=solve_args) @@ -309,7 +313,7 @@ def selective_MLE(self, def posterior(self, observed_target, cov_target, - cov_target_score, + regress_target_score, prior=None, dispersion=None, solve_args={'tol': 1.e-12}): @@ -320,7 +324,7 @@ def posterior(self, Observed estimate of target. cov_target : ndarray Estimated covaraince of target. - cov_target_score : ndarray + regress_target_score : ndarray Estimated covariance of target and score of randomized query. prior : callable A callable object that takes a single argument @@ -347,7 +351,7 @@ def prior(target_parameter): return posterior(self, observed_target, cov_target, - cov_target_score, + regress_target_score, prior, dispersion, solve_args=solve_args) @@ -355,7 +359,7 @@ def prior(target_parameter): def approximate_grid_inference(self, observed_target, cov_target, - cov_target_score, + regress_target_score, alternatives=None, solve_args={'tol': 1.e-12}): @@ -366,7 +370,7 @@ def approximate_grid_inference(self, Observed estimate of target. cov_target : ndarray Estimated covaraince of target. - cov_target_score : ndarray + regress_target_score : ndarray Estimated covariance of target and score of randomized query. 
alternatives : [str], optional Sequence of strings describing the alternatives, @@ -378,7 +382,7 @@ def approximate_grid_inference(self, G = approximate_grid_inference(self, observed_target, cov_target, - cov_target_score, + regress_target_score, solve_args=solve_args) return G.summary(alternatives=alternatives) @@ -837,7 +841,9 @@ def __init__(self, log_cond_density, regress_opt, observed_subgrad, - randomizer_prec, + cov_randomizer, # \Sigma_{\omega} + opt_linear, # L + prod_score_prec, # \Sigma_S \Theta_{\omega} selection_info=None, useC=False): @@ -880,7 +886,9 @@ def __init__(self, self.regress_opt = regress_opt self.observed_subgrad = observed_subgrad self.useC = useC - self.randomizer_prec = randomizer_prec + self.cov_randomizer = cov_randomizer + self.opt_linear = opt_linear + self.prod_score_prec = prod_score_prec def log_cond_density(self, opt_sample, @@ -928,7 +936,7 @@ def sample(self, ndraw, burnin): def selective_MLE(self, observed_target, cov_target, - cov_target_score, + regress_target_score, # initial (observed) value of optimization variables -- # used as a feasible point. # precise value used only for independent estimator @@ -944,7 +952,7 @@ def selective_MLE(self, Observed estimate of target. cov_target : ndarray Estimated covaraince of target. - cov_target_score : ndarray + regress_target_score : ndarray Estimated covariance of target and score of randomized query. observed_soln : ndarray Feasible point for optimization problem. @@ -954,19 +962,19 @@ def selective_MLE(self, Arguments passed to solver. """ - score_offset = self.observed_score_state + self.observed_subgrad - return selective_MLE(observed_target, cov_target, - cov_target_score, + regress_target_score, observed_soln, self.mean, self.covariance, self.regress_opt, self.affine_con.linear_part, self.affine_con.offset, - self.randomizer_prec, - score_offset, + self.cov_randomizer, + self.opt_linear, + self.prod_score_prec, + self.observed_score_state + self.observed_subgrad, solve_args=solve_args, level=level, useC=self.useC) @@ -1156,8 +1164,8 @@ def pivot(self, opt_sample, _, _, - cov_target_score) in self.opt_sampling_info: - cur_score_cov = linear_func.dot(cov_target_score) + regress_target_score) in self.opt_sampling_info: + cur_score_cov = linear_func.dot(regress_target_score) # cur_nuisance is in the view's score coordinates cur_nuisance = opt_sampler.observed_score_state - cur_score_cov * observed_stat / cov_target @@ -1167,7 +1175,7 @@ def pivot(self, weights = self._weights(sample_stat, # normal sample candidate, # candidate value nuisance, # nuisance sufficient stats for each view - translate_dirs) # points will be moved like sample * cov_target_score + translate_dirs) # points will be moved like sample * regress_target_score pivot = np.mean((sample_stat + candidate <= observed_stat) * weights) / np.mean(weights) @@ -1311,7 +1319,7 @@ def naive_pvalues(diag_cov, observed, parameter): def selective_MLE(observed_target, cov_target, - cov_target_score, + regress_target_score, observed_soln, # initial (observed) value of # optimization variables -- used as a # feasible point. precise value used @@ -1321,8 +1329,10 @@ def selective_MLE(observed_target, regress_opt, linear_part, offset, - randomizer_prec, - score_offset, + cov_randomizer, + opt_linear, + prod_score_prec, + observed_score, solve_args={'tol': 1.e-12}, level=0.9, useC=False): @@ -1335,8 +1345,8 @@ def selective_MLE(observed_target, Observed estimate of target. cov_target : ndarray Estimated covaraince of target. 
- cov_target_score : ndarray - Estimated covariance of target and score of randomized query. + regress_target_score : ndarray + Estimated regression coefficient of target on score. observed_soln : ndarray Feasible point for optimization problem. cond_mean : ndarray @@ -1371,23 +1381,37 @@ def selective_MLE(observed_target, # regress_opt determines how the argument of the optimization density # depends on the score, not how the mean depends on score, hence the minus sign - regress_score_target = cov_target_score.T.dot(prec_target) - resid_score_target = score_offset - regress_score_target.dot(observed_target) - - regress_opt_target = regress_opt.dot(regress_score_target) - resid_mean_opt_target = cond_mean - regress_opt_target.dot(observed_target) - - - if np.asarray(randomizer_prec).shape in [(), (0,)]: - _P = regress_score_target.T.dot(resid_score_target) * randomizer_prec - prec_target_nosel = prec_target + (regress_score_target.T.dot(regress_score_target) * randomizer_prec) - regress_opt_target.T.dot(prec_opt).dot( - regress_opt_target) - else: - _P = regress_score_target.T.dot(randomizer_prec).dot(resid_score_target) - prec_target_nosel = prec_target + (regress_score_target.T.dot(randomizer_prec).dot(regress_score_target)) - regress_opt_target.T.dot( - prec_opt).dot(regress_opt_target) - - C = cov_target.dot(_P - regress_opt_target.T.dot(prec_opt).dot(resid_mean_opt_target)) + ## regress_score_target = cov_target_score.T.dot(prec_target) + ## resid_score_target = score_offset - regress_score_target.dot(observed_target) + + ## regress_opt_target = regress_opt.dot(regress_score_target) + ## resid_mean_opt_target = cond_mean - regress_opt_target.dot(observed_target) + + # M1, M2, M3 can be computed quickly (assumption) -- we can make this + # faster later + # shorthand + + M1 = prod_score_prec.dot(cov_randomizer).dot(prod_score_prec.T) + M2 = prod_score_prec.dot(opt_linear.dot(cond_cov).dot(opt_linear.T)).dot(prod_score_prec.T) + M3 = prod_score_prec + + # this is specific to target + + T1 = regress_target_score.T.dot(prec_target) + T2 = T1.T.dot(M1.dot(T1)) + T3 = T1.T.dot(M2.dot(T1)) + + prec_target_nosel = prec_target + T2 - T3 + _P = T1.T.dot(M3.dot(observed_score)) - T2.dot(observed_target) + + T4 = M3.T.dot(T1) + T5 = opt_linear.T.dot(T4) + T6 = cond_cov.dot(T5) + T7 = opt_linear.dot(T6) + T8 = M3.dot(T7) + T9 = T8.dot(observed_target) + M3.dot(opt_linear.dot(cond_mean)) + T10 = T1.T.dot(T9) + C = cov_target.dot(T10) conjugate_arg = prec_opt.dot(cond_mean) @@ -1403,14 +1427,16 @@ def selective_MLE(observed_target, offset, **solve_args) + T11 = regress_target_score.dot(M3.dot(opt_linear)) final_estimator = cov_target.dot(prec_target_nosel).dot(observed_target) \ - + cov_target.dot(regress_opt_target.T.dot(prec_opt.dot(cond_mean - soln))) + C + + T11.dot(cond_mean - soln) + C + T12 = prec_target.dot(T11) + T13 = T3 unbiased_estimator = cov_target.dot(prec_target_nosel).dot(observed_target) + cov_target.dot( - _P - regress_opt_target.T.dot(prec_opt).dot(resid_mean_opt_target)) + _P - T12.dot(cond_mean) + T13.dot(observed_target)) - L = regress_opt_target.T.dot(prec_opt) - observed_info_natural = prec_target_nosel + L.dot(regress_opt_target) - L.dot(hess.dot(L.T)) + observed_info_natural = prec_target_nosel + T3 - T12.dot(hess.dot(T12.T)) observed_info_mean = cov_target.dot(observed_info_natural.dot(cov_target)) diff --git a/selectinf/randomized/tests/test_selective_MLE_high.py b/selectinf/randomized/tests/test_selective_MLE_high.py index b133735f6..da592da87 100644 --- 
a/selectinf/randomized/tests/test_selective_MLE_high.py +++ b/selectinf/randomized/tests/test_selective_MLE_high.py @@ -428,7 +428,7 @@ def test_cox(n=2000, if nonzero.sum() > 0: cox_full = rr.glm.cox(X, T, S) - full_hess = cox_full.hessian(conv.initial_soln) + full_hess = cox_full.hessian(conv.observed_soln) (observed_target, cov_target, @@ -488,7 +488,7 @@ def test_cox_split(n=2000, if nonzero.sum() > 0: cox_full = rr.glm.cox(X, T, S) - full_hess = cox_full.hessian(conv.initial_soln) + full_hess = cox_full.hessian(conv.observed_soln) (observed_target, cov_target, From 6b93957974680857ec2cd710987daa717cab7e76 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Mon, 12 Jul 2021 18:31:44 -0700 Subject: [PATCH 107/187] doc describing rename --- doc/Gaussian queries.Rmd | 167 +++++++++++++++++++++++++++++ doc/Gaussian queries.ipynb | 209 +++++++++++++++++++++++++++++++++++++ 2 files changed, 376 insertions(+) create mode 100644 doc/Gaussian queries.Rmd create mode 100644 doc/Gaussian queries.ipynb diff --git a/doc/Gaussian queries.Rmd b/doc/Gaussian queries.Rmd new file mode 100644 index 000000000..e86125149 --- /dev/null +++ b/doc/Gaussian queries.Rmd @@ -0,0 +1,167 @@ +--- +jupyter: + jupytext: + formats: ipynb,Rmd + text_representation: + extension: .Rmd + format_name: rmarkdown + format_version: '1.2' + jupytext_version: 1.10.2 + kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +## KKT conditions + +$$ +\omega = \nabla \ell(o) + u + \epsilon o. +$$ + +## Current terms used in selective MLE + +- `observed_score_state`: for LASSO this is $S=-X^TY$ (and for any linear regression), in general it should be +$\nabla \ell(\beta^*) - Q(\beta^*)\beta^*$, call this $A$ + +- `opt_offset`: this is $\hat{u}$ or (changed everywhere to `observed_subgrad`) + +- `opt_linear`: this is $\nabla^2 \ell(\hat{\beta}) + \epsilon I$ restricted to "selected" subspace, call this $L$ + +## Rewrite of KKT + +$$ +\omega = Lo + S + u. +$$ + +## More terms in the code + +- Randomization precision `randomizer_prec` call this $\Theta_{\omega}=\Sigma_{\omega}^{-1}$ so $\omega \sim N(0, \Theta^{-1})$. + +- `cond_cov`= $\Sigma_{o|S,u}$, `cond_mean`, `cond_precision`=$\Sigma_{o|S,u}^{-1}=\Theta_{o|S,u}$: +describe implied law of $o|S,u$. These are computed in `_setup_implied_gaussian`. Specifically, we have + +$$ +\begin{aligned} +\Sigma_{o|S,u} = (L^T\Theta L)^{-1} +\end{aligned} +$$ + +- `regress_opt` (formerly `logdens_linear`) call this $A$: this is the regression of $o$ onto $S+u$, in the implied +Gaussian given $u,S$ i.e. + +$$ +E[o|S,u] = A(S+u) = -\Sigma_{o|S,u} L^T \Theta_{\omega}(S+u). +$$ + +- `cond_mean` is the conditional mean of $o|S,u$ evaluated at observed $S,u$: $A(S+u)_{obs}$. Or, `regress_opt_score(observed_score_state + observed_subgrad)` + + +## Target related + +- `observed_target, target_cov, target_prec`: not much explanation needed $\hat{\theta}, \Sigma_{\hat{\theta}}, \Theta_{\hat{\theta}} = \Sigma_{\hat{\theta}}^{-1}$ + +- `target_score_cov`: $\Sigma_{\hat{\theta},S}$ + +- `regress_target`: regression of target onto score, formally this would be $\Sigma_{\hat{\theta},S}\Theta_S $ (transpose of usual way of writing regression, not in code yet), let's call it $B$ for now + +- `cov_product`: $\Sigma_S \Theta_{\omega}$: product of score covariance and randomization precision. 
+ +- `cov_score`: $\Sigma_S$ + +- `score_offset = observed_score_state + observed_subgrad`=$S+u$ + +### In `selective_MLE` + +- `target_linear`: $\Sigma_{S,\hat{\theta}}\Theta_{\hat{\theta}}= \Sigma_S B^T\Theta_{\hat{\theta}}$ (changed name to `regress_score_target`) + +- `target_offset`: $S+u-\Sigma_S B^T \Theta_{\hat{\theta}} \hat{\theta} = S+u - \Sigma_{S,\hat{\theta}} \Theta_{\hat{\theta}} \hat{\theta}$ (changed name to `resid_score_target`) + +- `target_lin`: $A\Sigma_S B^T \Theta_{\hat{\theta}} = -(L^T\Theta_{\omega}L)^{-1} L^T\Theta_{\omega} \Sigma_S B^T \Theta_{\hat{\theta}}$ (changed name to `regress_opt_target` + +- `target_off`: $A(S+u - \Sigma_S B^T \Theta_{\hat{\theta}} \hat{\theta})$ `resid_opt_target` + +- `_P`: $\Theta_{\hat{\theta}} B\Sigma_S \Theta_{\omega} (S+u-\Sigma_S B^T \Theta_{\hat{\theta}} \hat{\theta}) = \Theta_{\hat{\theta}} B\Sigma_S \Theta_{\omega} (S+u) - \Theta_{\hat{\theta}} B\Sigma_S \Theta_{\omega} \Sigma_S B^T \Theta_{\hat{\theta}} \hat{\theta} = \Theta_{\hat{\theta}} B\Sigma_S \Theta_{\omega} (S+u) - \Theta_{\hat{\theta}} B\Sigma_S \Theta_{\omega} \Sigma_{\omega} \Theta_{\omega} \Sigma_S B^T \Theta_{\hat{\theta}} \hat{\theta} $. +Let's call `_P` $\xi$ + +- `_prec`: $\Theta_{\hat{\theta}} + \Theta_{\hat{\theta}} B\Sigma_S \Theta_{\omega} \Sigma_S B^T \Theta_{\hat{\theta}} +- \Theta_{\hat{\theta}} B \Sigma_S A^T \Theta_{o|S,u} A \Sigma_S B^T \Theta_{\hat{\theta}}$ + +- `C`: something that can be computed with all of the above... I guess (but am not sure) that `_prec` is +the precision of the (best case, no-selection) unbiased estimate of our target when we condition on $N,u$ + +- More precisely, + +$$ +\begin{aligned} +\Theta_{\hat{\theta}} C &= \xi + (A\Sigma_S B^T \Theta_{\hat{\theta}})^T L^T \Theta_{\omega} L (A\Sigma_S B^T \Theta_{\hat{\theta}})^T \hat{\theta} - (A\Sigma_S B^T \Theta_{\hat{\theta}})^T L^T \Theta_{\omega} L A(S+u) \\ +&= \xi + \Theta_{\hat{\theta}}B \left(\Sigma_S A^T L^T\Theta_{\omega} L A \Sigma_S B^T \Theta_{\hat{\theta}} \hat{\theta} - \Sigma_S A^T L^T\Theta_{\omega} L A(S+u) \right) \\ +&= \xi + \Theta_{\hat{\theta}}B \left(\Sigma_S \Theta_{\omega} L (L^T\Theta_{\omega} L)^{-1} L^T \Theta_{\omega} \Sigma_S B^T \Theta_{\hat{\theta}} \hat{\theta} + \Sigma_S \Theta_{\omega}L A(S+u) \right) \\ +\end{aligned} +$$ + +The expression $A(S+u)$ is `cond_mean` and the other term can be computed straightforwardly. We've used the fact +$$ +A\Sigma_S = -\Sigma_{o|S,u}L^T\Theta_{\omega} \Sigma_S =- (L^T\Theta_{\omega}L)^{-1}L^T\Theta_{\omega}\Sigma_S +$$ + + + + + +- Don't know what to sensibly call the last three things... but `_P` and `_prec` are the arguments to the +optimization problem so these are what needs computing. I did change `_prec` to `prec_target_nosel` + +- `cov_target.dot(regress_opt_target.T.dot(prec_opt))`. This is + +$$-\Sigma_{\hat{\theta}} \Theta_{\hat{\theta}}B \Sigma_S\Theta_{\omega} L (L^T\Theta_{\omega}L)^{-1} (L^T\Theta_{\omega} L) = B \Sigma_S\Theta_{\omega} L$$ + +- `regress_opt_target.T.dot(prec_opt)`. This is + +$$-\Theta_{\hat{\theta}}B \Sigma_S\Theta_{\omega} L (L^T\Theta_{\omega}L)^{-1} (L^T\Theta_{\omega} L) = \Theta_{\hat{\theta}} B \Sigma_S\Theta_{\omega} L$$ + +- `regress_opt_target.T.dot(prec_opt).dot(regress_opt_target)`: This is + +$$ +\Theta_{\hat{\theta}}B \Sigma_S\Theta_{\omega} L (L^T\Theta_{\omega}L)^{-1} L^T\Theta_{\omega} \Sigma_S B^T \Theta_{\hat{\theta}} +$$ + + +### Computational considerations? 
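+Before turning to efficiency, here is a small numerical sanity check of the displays above. This is an editorial sketch, not code from `selectinf`: every object below ($B$, $\Theta_{\hat{\theta}}$, $\Sigma_S$, $\Theta_{\omega}$, $L$, the observed $S+u$ and $\hat{\theta}$) is a random stand-in of the right shape. It verifies that the two ways of writing the last term of `_prec` -- once through $A$ and once through $L(L^T\Theta_{\omega}L)^{-1}L^T$ -- agree, and it forms $\xi$ as displayed.
+
+```{python}
+import numpy as np
+
+rng = np.random.default_rng(0)
+p, E, k = 6, 3, 2                      # dim of score, of opt variables, of target
+
+def rand_spd(d):
+    # random symmetric positive definite matrix, used only as a stand-in
+    M = rng.standard_normal((d, d))
+    return M @ M.T + d * np.eye(d)
+
+Sigma_S = rand_spd(p)                  # stand-in for the score covariance
+Sigma_omega = rand_spd(p)              # stand-in for the randomization covariance
+Theta_omega = np.linalg.inv(Sigma_omega)
+L = rng.standard_normal((p, E))        # stand-in for opt_linear
+Sigma_target = rand_spd(k)             # stand-in for the target covariance
+Theta_target = np.linalg.inv(Sigma_target)
+B = rng.standard_normal((k, p))        # stand-in for the regression of target on score
+
+cond_cov = np.linalg.inv(L.T @ Theta_omega @ L)   # Sigma_{o|S,u}
+A = -cond_cov @ L.T @ Theta_omega                 # regress_opt
+
+mid = Sigma_S @ B.T @ Theta_target                # \Sigma_S B^T \Theta_{\hat\theta}
+
+# `_prec` written with A, then rewritten using A Sigma_S = -(L^T Theta L)^{-1} L^T Theta Sigma_S
+prec1 = (Theta_target + mid.T @ Theta_omega @ mid
+         - mid.T @ A.T @ (L.T @ Theta_omega @ L) @ A @ mid)
+prec2 = (Theta_target
+         + mid.T @ (Theta_omega - Theta_omega @ L @ cond_cov @ L.T @ Theta_omega) @ mid)
+assert np.allclose(prec1, prec2)
+
+# `_P` (called xi above), at some observed S+u and observed target
+score_plus_subgrad = rng.standard_normal(p)       # stand-in for S + u
+theta_hat = rng.standard_normal(k)                # stand-in for the observed target
+xi = Theta_target @ B @ Sigma_S @ Theta_omega @ (score_plus_subgrad - mid @ theta_hat)
+```
+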
+ + +#### Case 1: $\Theta_{\omega}^{1/2}$ is known + + +Another potential downside to all this is that these matrices will generally be $p \times p$. I think in `price_of_selection` I had written some way of doing part of this without having to form all of these matrices +explicitly. However, the difference of the last two matrices in `_prec` can be computed (if we know $\Sigma_{\omega}^{\pm 1/2}$ as identity minus rank $E$ matrix I think and +$$ +A^T\Sigma_{o|S,u}A = \Theta_{\omega} L^T \Sigma_{o|S,u} L \Theta_{\omega} +$$ +so we want to compute +$$ +\Theta_{\omega} - \Theta_{\omega} L^T \Sigma_{o|S,u} L \Theta_{\omega} = \Theta_{\omega}^{1/2}(P - \Theta_{\omega}^{1/2}L^T (L^T\Theta_{\omega} L)^{-1} L\Theta_{\omega}^{1/2}) \Theta_{\omega}^{1/2} +$$ +with $P$ projection onto $\text{row}(\Sigma_{\omega})$. So we need to compute projection on to a $E$-dimensional +subspace of $\text{row}(\Sigma_{\omega})$. Morally, this makes sense even if $\Sigma_{\omega}$ is not full rank but seems a little sketchy. + +We might also try computing +$$ +\begin{aligned} +\Sigma_S\Theta_{\omega}\Sigma_S - \Sigma_S\Theta_{\omega} L^T \Sigma_{o|S,u} L \Theta_{\omega} \Sigma_S &= \Sigma_S \Theta_{\omega}^{1/2}(P - \Theta_{\omega}^{1/2}L^T (L^T\Theta_{\omega} L)^{-1} L\Theta_{\omega}^{1/2}) \Theta_{\omega}^{1/2} \Sigma_S \\ +&= \Sigma_S \Theta_{\omega} \Theta_{\omega}^{-1/2}(P - \Theta_{\omega}^{1/2}L^T (L^T\Theta_{\omega} L)^{-1} L\Theta_{\omega}^{1/2}) \Theta_{\omega}^{-1/2} \Theta_{\omega} \Sigma_S \\ +&= \Sigma_S \Theta_{\omega} \Sigma_{\omega}^{1/2}(P - \Theta_{\omega}^{1/2}L^T (L^T\Theta_{\omega} L)^{-1} L\Theta_{\omega}^{1/2}) \Sigma_{\omega}^{1/2} \Theta_{\omega} \Sigma_S \\ +&= \Sigma_S \Theta_{\omega} (\Sigma_{\omega} - PL^T (L^T\Theta_{\omega} L)^{-1} LP) \Theta_{\omega} \Sigma_S \\ +&= \Sigma_S \Theta_{\omega} (\Sigma_{\omega} - L^T (L^T\Theta_{\omega} L)^{-1} L) \Theta_{\omega} \Sigma_S \\ +\end{aligned} +$$ + +So, to compute `_prec` we need to compute this above matrix and apply it to $B\Theta_{\hat{\theta}}$. **If we suppose that $\Sigma_{\omega}$ and $\Sigma_S \Theta_{\omega}$ can be computed without $p^2$ memory then we only +have to store $L$ and $(L^T\Theta_{\omega}L)^{-1}$.** We are already storing $(L^T\Theta_{\omega}L)^{-1}$ as the conditional covariance in the affine constraint. + +This matrix might be easier to compute for both data splitting and general case (when we know $\Sigma_{\omega}$). 
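+The chain of equalities above can be checked numerically in the full-rank case, where the projection $P$ is the identity. Again this is just an editorial sketch with random stand-ins; the factors are written $L(L^T\Theta_{\omega}L)^{-1}L^T$ so the shapes are consistent with $L$ being $p \times E$.
+
+```{python}
+import numpy as np
+
+rng = np.random.default_rng(1)
+p, E = 6, 3
+
+def rand_spd(d):
+    # random symmetric positive definite stand-in
+    M = rng.standard_normal((d, d))
+    return M @ M.T + d * np.eye(d)
+
+Sigma_S, Sigma_omega = rand_spd(p), rand_spd(p)
+Theta_omega = np.linalg.inv(Sigma_omega)
+L = rng.standard_normal((p, E))
+cond_cov = np.linalg.inv(L.T @ Theta_omega @ L)          # Sigma_{o|S,u}
+
+lhs = (Sigma_S @ Theta_omega @ Sigma_S
+       - Sigma_S @ Theta_omega @ L @ cond_cov @ L.T @ Theta_omega @ Sigma_S)
+rhs = (Sigma_S @ Theta_omega
+       @ (Sigma_omega - L @ cond_cov @ L.T)
+       @ Theta_omega @ Sigma_S)
+assert np.allclose(lhs, rhs)
+```
+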
+ + + + +In order to compute `_P` suppose wee have stored $PL^T(L^T\Theta_{\omega}L)^{-1}LP$ as well as diff --git a/doc/Gaussian queries.ipynb b/doc/Gaussian queries.ipynb new file mode 100644 index 000000000..84788d447 --- /dev/null +++ b/doc/Gaussian queries.ipynb @@ -0,0 +1,209 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## KKT conditions\n", + "\n", + "$$\n", + "\\omega = \\nabla \\ell(o) + u + \\epsilon o.\n", + "$$\n", + "\n", + "## Current terms used in selective MLE\n", + "\n", + "- `observed_score_state`: for LASSO this is $S=-X^TY$ (and for any linear regression), in general it should be\n", + "$\\nabla \\ell(\\beta^*) - Q(\\beta^*)\\beta^*$, call this $A$\n", + "\n", + "- `opt_offset`: this is $\\hat{u}$ or (changed everywhere to `observed_subgrad`)\n", + "\n", + "- `opt_linear`: this is $\\nabla^2 \\ell(\\hat{\\beta}) + \\epsilon I$ restricted to \"selected\" subspace, call this $L$\n", + "\n", + "## Rewrite of KKT\n", + "\n", + "$$\n", + "\\omega = Lo + S + u.\n", + "$$\n", + "\n", + "## More terms in the code\n", + "\n", + "- Randomization precision `randomizer_prec` call this $\\Theta_{\\omega}=\\Sigma_{\\omega}^{-1}$ so $\\omega \\sim N(0, \\Theta^{-1})$.\n", + "\n", + "- `cond_cov`= $\\Sigma_{o|S,u}$, `cond_mean`, `cond_precision`=$\\Sigma_{o|S,u}^{-1}=\\Theta_{o|S,u}$:\n", + "describe implied law of $o|S,u$. These are computed in `_setup_implied_gaussian`. Specifically, we have\n", + "\n", + "$$\n", + "\\begin{aligned}\n", + "\\Sigma_{o|S,u} = (L^T\\Theta L)^{-1}\n", + "\\end{aligned}\n", + "$$\n", + "\n", + "- `regress_opt` (formerly `logdens_linear`) call this $A$: this is the regression of $o$ onto $S+u$, in the implied\n", + "Gaussian given $u,S$ i.e.\n", + "\n", + "$$\n", + "E[o|S,u] = A(S+u) = -\\Sigma_{o|S,u} L^T \\Theta_{\\omega}(S+u).\n", + "$$\n", + "\n", + "- `cond_mean` is the conditional mean of $o|S,u$ evaluated at observed $S,u$: $A(S+u)_{obs}$. 
Or, `regress_opt_score(observed_score_state + observed_subgrad)`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Target related\n", + "\n", + "- `observed_target, target_cov, target_prec`: not much explanation needed $\\hat{\\theta}, \\Sigma_{\\hat{\\theta}}, \\Theta_{\\hat{\\theta}} = \\Sigma_{\\hat{\\theta}}^{-1}$\n", + "\n", + "- `target_score_cov`: $\\Sigma_{\\hat{\\theta},S}$\n", + "\n", + "- `regress_target`: regression of target onto score, formally this would be $\\Sigma_{\\hat{\\theta},S}\\Theta_S $ (transpose of usual way of writing regression, not in code yet), let's call it $B$ for now\n", + "\n", + "- `cov_product`: $\\Sigma_S \\Theta_{\\omega}$: product of score covariance and randomization precision.\n", + "\n", + "- `cov_score`: $\\Sigma_S$\n", + "\n", + "- `score_offset = observed_score_state + observed_subgrad`=$S+u$\n", + "\n", + "### In `selective_MLE`\n", + "\n", + "- `target_linear`: $\\Sigma_{S,\\hat{\\theta}}\\Theta_{\\hat{\\theta}}= \\Sigma_S B^T\\Theta_{\\hat{\\theta}}$ (changed name to `regress_score_target`)\n", + "\n", + "- `target_offset`: $S+u-\\Sigma_S B^T \\Theta_{\\hat{\\theta}} \\hat{\\theta} = S+u - \\Sigma_{S,\\hat{\\theta}} \\Theta_{\\hat{\\theta}} \\hat{\\theta}$ (changed name to `resid_score_target`)\n", + "\n", + "- `target_lin`: $A\\Sigma_S B^T \\Theta_{\\hat{\\theta}} = -(L^T\\Theta_{\\omega}L)^{-1} L^T\\Theta_{\\omega} \\Sigma_S B^T \\Theta_{\\hat{\\theta}}$ (changed name to `regress_opt_target`\n", + "\n", + "- `target_off`: $A(S+u - \\Sigma_S B^T \\Theta_{\\hat{\\theta}} \\hat{\\theta})$ `resid_opt_target`\n", + "\n", + "- `_P`: $\\Theta_{\\hat{\\theta}} B\\Sigma_S \\Theta_{\\omega} (S+u-\\Sigma_S B^T \\Theta_{\\hat{\\theta}} \\hat{\\theta}) = \\Theta_{\\hat{\\theta}} B\\Sigma_S \\Theta_{\\omega} (S+u) - \\Theta_{\\hat{\\theta}} B\\Sigma_S \\Theta_{\\omega} \\Sigma_S B^T \\Theta_{\\hat{\\theta}} \\hat{\\theta} = \\Theta_{\\hat{\\theta}} B\\Sigma_S \\Theta_{\\omega} (S+u) - \\Theta_{\\hat{\\theta}} B\\Sigma_S \\Theta_{\\omega} \\Sigma_{\\omega} \\Theta_{\\omega} \\Sigma_S B^T \\Theta_{\\hat{\\theta}} \\hat{\\theta} $.\n", + "Let's call `_P` $\\xi$\n", + "\n", + "- `_prec`: $\\Theta_{\\hat{\\theta}} + \\Theta_{\\hat{\\theta}} B\\Sigma_S \\Theta_{\\omega} \\Sigma_S B^T \\Theta_{\\hat{\\theta}}\n", + "- \\Theta_{\\hat{\\theta}} B \\Sigma_S A^T \\Theta_{o|S,u} A \\Sigma_S B^T \\Theta_{\\hat{\\theta}}$\n", + "\n", + "- `C`: something that can be computed with all of the above... 
I guess (but am not sure) that `_prec` is \n", + "the precision of the (best case, no-selection) unbiased estimate of our target when we condition on $N,u$ \n", + "\n", + "- More precisely,\n", + "\n", + "$$\n", + "\\begin{aligned}\n", + "\\Theta_{\\hat{\\theta}} C &= \\xi + (A\\Sigma_S B^T \\Theta_{\\hat{\\theta}})^T L^T \\Theta_{\\omega} L (A\\Sigma_S B^T \\Theta_{\\hat{\\theta}})^T \\hat{\\theta} - (A\\Sigma_S B^T \\Theta_{\\hat{\\theta}})^T L^T \\Theta_{\\omega} L A(S+u) \\\\\n", + "&= \\xi + \\Theta_{\\hat{\\theta}}B \\left(\\Sigma_S A^T L^T\\Theta_{\\omega} L A \\Sigma_S B^T \\Theta_{\\hat{\\theta}} \\hat{\\theta} - \\Sigma_S A^T L^T\\Theta_{\\omega} L A(S+u) \\right) \\\\\n", + "&= \\xi + \\Theta_{\\hat{\\theta}}B \\left(\\Sigma_S \\Theta_{\\omega} L (L^T\\Theta_{\\omega} L)^{-1} L^T \\Theta_{\\omega} \\Sigma_S B^T \\Theta_{\\hat{\\theta}} \\hat{\\theta} + \\Sigma_S \\Theta_{\\omega}L A(S+u) \\right) \\\\\n", + "\\end{aligned}\n", + "$$\n", + "\n", + "The expression $A(S+u)$ is `cond_mean` and the other term can be computed straightforwardly. We've used the fact\n", + "$$\n", + "A\\Sigma_S = -\\Sigma_{o|S,u}L^T\\Theta_{\\omega} \\Sigma_S =- (L^T\\Theta_{\\omega}L)^{-1}L^T\\Theta_{\\omega}\\Sigma_S\n", + "$$" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "- Don't know what to sensibly call the last three things... but `_P` and `_prec` are the arguments to the\n", + "optimization problem so these are what needs computing. I did change `_prec` to `prec_target_nosel`\n", + "\n", + "- `cov_target.dot(regress_opt_target.T.dot(prec_opt))`. This is\n", + "\n", + "$$-\\Sigma_{\\hat{\\theta}} \\Theta_{\\hat{\\theta}}B \\Sigma_S\\Theta_{\\omega} L (L^T\\Theta_{\\omega}L)^{-1} (L^T\\Theta_{\\omega} L) = B \\Sigma_S\\Theta_{\\omega} L$$\n", + "\n", + "- `regress_opt_target.T.dot(prec_opt)`. This is\n", + "\n", + "$$-\\Theta_{\\hat{\\theta}}B \\Sigma_S\\Theta_{\\omega} L (L^T\\Theta_{\\omega}L)^{-1} (L^T\\Theta_{\\omega} L) = \\Theta_{\\hat{\\theta}} B \\Sigma_S\\Theta_{\\omega} L$$\n", + "\n", + "- `regress_opt_target.T.dot(prec_opt).dot(regress_opt_target)`: This is\n", + "\n", + "$$\n", + "\\Theta_{\\hat{\\theta}}B \\Sigma_S\\Theta_{\\omega} L (L^T\\Theta_{\\omega}L)^{-1} L^T\\Theta_{\\omega} \\Sigma_S B^T \\Theta_{\\hat{\\theta}}\n", + "$$" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Computational considerations?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Case 1: $\\Theta_{\\omega}^{1/2}$ is known" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Another potential downside to all this is that these matrices will generally be $p \\times p$. I think in `price_of_selection` I had written some way of doing part of this without having to form all of these matrices\n", + "explicitly. However, the difference of the last two matrices in `_prec` can be computed (if we know $\\Sigma_{\\omega}^{\\pm 1/2}$ as identity minus rank $E$ matrix I think and\n", + "$$\n", + "A^T\\Sigma_{o|S,u}A = \\Theta_{\\omega} L^T \\Sigma_{o|S,u} L \\Theta_{\\omega}\n", + "$$\n", + "so we want to compute\n", + "$$\n", + "\\Theta_{\\omega} - \\Theta_{\\omega} L^T \\Sigma_{o|S,u} L \\Theta_{\\omega} = \\Theta_{\\omega}^{1/2}(P - \\Theta_{\\omega}^{1/2}L^T (L^T\\Theta_{\\omega} L)^{-1} L\\Theta_{\\omega}^{1/2}) \\Theta_{\\omega}^{1/2}\n", + "$$\n", + "with $P$ projection onto $\\text{row}(\\Sigma_{\\omega})$. 
So we need to compute projection on to a $E$-dimensional\n", + "subspace of $\\text{row}(\\Sigma_{\\omega})$. Morally, this makes sense even if $\\Sigma_{\\omega}$ is not full rank but seems a little sketchy.\n", + "\n", + "We might also try computing\n", + "$$\n", + "\\begin{aligned}\n", + "\\Sigma_S\\Theta_{\\omega}\\Sigma_S - \\Sigma_S\\Theta_{\\omega} L^T \\Sigma_{o|S,u} L \\Theta_{\\omega} \\Sigma_S &= \\Sigma_S \\Theta_{\\omega}^{1/2}(P - \\Theta_{\\omega}^{1/2}L^T (L^T\\Theta_{\\omega} L)^{-1} L\\Theta_{\\omega}^{1/2}) \\Theta_{\\omega}^{1/2} \\Sigma_S \\\\\n", + "&= \\Sigma_S \\Theta_{\\omega} \\Theta_{\\omega}^{-1/2}(P - \\Theta_{\\omega}^{1/2}L^T (L^T\\Theta_{\\omega} L)^{-1} L\\Theta_{\\omega}^{1/2}) \\Theta_{\\omega}^{-1/2} \\Theta_{\\omega} \\Sigma_S \\\\\n", + "&= \\Sigma_S \\Theta_{\\omega} \\Sigma_{\\omega}^{1/2}(P - \\Theta_{\\omega}^{1/2}L^T (L^T\\Theta_{\\omega} L)^{-1} L\\Theta_{\\omega}^{1/2}) \\Sigma_{\\omega}^{1/2} \\Theta_{\\omega} \\Sigma_S \\\\\n", + "&= \\Sigma_S \\Theta_{\\omega} (\\Sigma_{\\omega} - PL^T (L^T\\Theta_{\\omega} L)^{-1} LP) \\Theta_{\\omega} \\Sigma_S \\\\\n", + "&= \\Sigma_S \\Theta_{\\omega} (\\Sigma_{\\omega} - L^T (L^T\\Theta_{\\omega} L)^{-1} L) \\Theta_{\\omega} \\Sigma_S \\\\\n", + "\\end{aligned}\n", + "$$\n", + "\n", + "So, to compute `_prec` we need to compute this above matrix and apply it to $B\\Theta_{\\hat{\\theta}}$. **If we suppose that $\\Sigma_{\\omega}$ and $\\Sigma_S \\Theta_{\\omega}$ can be computed without $p^2$ memory then we only\n", + "have to store $L$ and $(L^T\\Theta_{\\omega}L)^{-1}$.** We are already storing $(L^T\\Theta_{\\omega}L)^{-1}$ as the conditional covariance in the affine constraint.\n", + "\n", + "This matrix might be easier to compute for both data splitting and general case (when we know $\\Sigma_{\\omega}$).\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In order to compute `_P` suppose wee have stored $PL^T(L^T\\Theta_{\\omega}L)^{-1}LP$ as well as " + ] + } + ], + "metadata": { + "jupytext": { + "formats": "ipynb,Rmd" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From dd4597a563392c1bf4e9414fe41ce6f6ab42b314 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Mon, 12 Jul 2021 20:02:49 -0700 Subject: [PATCH 108/187] small comment --- selectinf/randomized/query.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index c284b59ec..8072cd75f 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -1391,6 +1391,8 @@ def selective_MLE(observed_target, # faster later # shorthand + # these could be done by the query at `fit` time + M1 = prod_score_prec.dot(cov_randomizer).dot(prod_score_prec.T) M2 = prod_score_prec.dot(opt_linear.dot(cond_cov).dot(opt_linear.T)).dot(prod_score_prec.T) M3 = prod_score_prec From 82cb60d4f17046991b3c83ecd0fdbb1700e2a53a Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Mon, 12 Jul 2021 23:38:43 -0700 Subject: [PATCH 109/187] computing M1, M2, M3 within query, so data splitting runs now --- selectinf/randomized/lasso.py | 21 ++++++----- selectinf/randomized/query.py | 67 
+++++++++++++++++++++-------------- 2 files changed, 53 insertions(+), 35 deletions(-) diff --git a/selectinf/randomized/lasso.py b/selectinf/randomized/lasso.py index 6b473cd56..397fce7ef 100644 --- a/selectinf/randomized/lasso.py +++ b/selectinf/randomized/lasso.py @@ -249,13 +249,7 @@ def signed_basis_vector(p, j, s): #### to be fixed -- set the cov_score here without dispersion - self._cov_randomizer, prec = self.randomizer.cov_prec - self._prod_score_prec_unnorm = _hessian - - if np.asarray(prec).shape in [(), (0,)]: - self._prod_score_prec_unnorm *= prec - else: - self._prod_score_prec_unnorm = self._prod_score_prec_unnorm.dot(prec) + self._hessian = _hessian ##### @@ -946,7 +940,18 @@ def _setup_implied_gaussian(self, prod_score_prec = np.identity(self.nfeature) / ratio - return cond_mean, cond_cov, cond_precision, regress_opt, prod_score_prec + cov_rand = self._hessian * dispersion + M1 = prod_score_prec.dot(cov_rand).dot(prod_score_prec.T) + M2 = prod_score_prec.dot(opt_linear.dot(cond_cov).dot(opt_linear.T)).dot(prod_score_prec.T) + M3 = prod_score_prec + + return (cond_mean, + cond_cov, + cond_precision, + regress_opt, + M1, + M2, + M3) def _solve_randomized_problem(self, # optional binary vector diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index 8072cd75f..dc59f6d9e 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -119,9 +119,11 @@ def _setup_sampler(self, cond_cov, cond_precision, regress_opt, - prod_score_prec) = self._setup_implied_gaussian(opt_linear, - observed_subgrad, - dispersion) + M1, + M2, + M3) = self._setup_implied_gaussian(opt_linear, + observed_subgrad, + dispersion) def log_density(regress_opt, u, cond_prec, opt, score): # u == subgrad if score.ndim == 1: @@ -136,8 +138,7 @@ def log_density(regress_opt, u, cond_prec, opt, score): # u == subgrad observed_subgrad, cond_precision) - cov_randomizer = self._cov_randomizer - self.cond_mean, self.cond_cov, self.cov_randomizer = cond_mean, cond_cov, cov_randomizer + self.cond_mean, self.cond_cov = cond_mean, cond_cov affine_con = constraints(A, b, @@ -148,11 +149,12 @@ def log_density(regress_opt, u, cond_prec, opt, score): # u == subgrad self.observed_opt_state, self.observed_score_state, log_density, - regress_opt, + regress_opt, # not needed? 
observed_subgrad, - cov_randomizer, # \Sigma_{\omega} opt_linear, # L - prod_score_prec, # \Sigma_S \Theta_{\omega} + M1, + M2, + M3, selection_info=self.selection_variable, useC=self.useC) @@ -163,10 +165,15 @@ def _setup_implied_gaussian(self, # for covariance of randomization dispersion=1): - _, prec = self.randomizer.cov_prec - prec = prec / dispersion + cov_rand, prec = self.randomizer.cov_prec + prec = prec / dispersion # why do we do this here -- prec is just known - prod_score_prec = self._prod_score_prec_unnorm * dispersion # this is usually unnormalized by dispersion + if np.asarray(prec).shape in [(), (0,)]: + _prod_score_prec_unnorm = self._hessian * prec + else: + _prod_score_prec_unnorm = self._hessian.dot(prec) + + prod_score_prec = _prod_score_prec_unnorm * dispersion if np.asarray(prec).shape in [(), (0,)]: cond_precision = opt_linear.T.dot(opt_linear) * prec @@ -181,7 +188,17 @@ def _setup_implied_gaussian(self, cond_mean = regress_opt.dot(self.observed_score_state + observed_subgrad) - return cond_mean, cond_cov, cond_precision, regress_opt, prod_score_prec + M1 = prod_score_prec.dot(cov_rand).dot(prod_score_prec.T) + M2 = prod_score_prec.dot(opt_linear.dot(cond_cov).dot(opt_linear.T)).dot(prod_score_prec.T) + M3 = prod_score_prec + + return (cond_mean, + cond_cov, + cond_precision, + regress_opt, + M1, + M2, + M3) def summary(self, observed_target, @@ -841,9 +858,10 @@ def __init__(self, log_cond_density, regress_opt, observed_subgrad, - cov_randomizer, # \Sigma_{\omega} - opt_linear, # L - prod_score_prec, # \Sigma_S \Theta_{\omega} + opt_linear, + M1, + M2, + M3, selection_info=None, useC=False): @@ -886,9 +904,8 @@ def __init__(self, self.regress_opt = regress_opt self.observed_subgrad = observed_subgrad self.useC = useC - self.cov_randomizer = cov_randomizer self.opt_linear = opt_linear - self.prod_score_prec = prod_score_prec + self.M1, self.M2, self.M3 = M1, M2, M3 def log_cond_density(self, opt_sample, @@ -968,12 +985,12 @@ def selective_MLE(self, observed_soln, self.mean, self.covariance, - self.regress_opt, self.affine_con.linear_part, self.affine_con.offset, - self.cov_randomizer, self.opt_linear, - self.prod_score_prec, + self.M1, + self.M2, + self.M3, self.observed_score_state + self.observed_subgrad, solve_args=solve_args, level=level, @@ -1326,12 +1343,12 @@ def selective_MLE(observed_target, # only for independent estimator cond_mean, cond_cov, - regress_opt, linear_part, offset, - cov_randomizer, opt_linear, - prod_score_prec, + M1, + M2, + M3, observed_score, solve_args={'tol': 1.e-12}, level=0.9, @@ -1393,10 +1410,6 @@ def selective_MLE(observed_target, # these could be done by the query at `fit` time - M1 = prod_score_prec.dot(cov_randomizer).dot(prod_score_prec.T) - M2 = prod_score_prec.dot(opt_linear.dot(cond_cov).dot(opt_linear.T)).dot(prod_score_prec.T) - M3 = prod_score_prec - # this is specific to target T1 = regress_target_score.T.dot(prec_target) From d91f463c211df75246e9806383c82f7ac2fae32f Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Tue, 13 Jul 2021 00:15:25 -0700 Subject: [PATCH 110/187] update doc --- doc/Gaussian queries.Rmd | 16 +++++++------- doc/Gaussian queries.ipynb | 21 +++++++++---------- selectinf/randomized/lasso.py | 7 ++++--- selectinf/randomized/query.py | 39 ++++++++++------------------------- 4 files changed, 34 insertions(+), 49 deletions(-) diff --git a/doc/Gaussian queries.Rmd b/doc/Gaussian queries.Rmd index e86125149..3dfb026f8 100644 --- a/doc/Gaussian queries.Rmd +++ b/doc/Gaussian queries.Rmd @@ -156,12 
+156,14 @@ $$ \end{aligned} $$ -So, to compute `_prec` we need to compute this above matrix and apply it to $B\Theta_{\hat{\theta}}$. **If we suppose that $\Sigma_{\omega}$ and $\Sigma_S \Theta_{\omega}$ can be computed without $p^2$ memory then we only -have to store $L$ and $(L^T\Theta_{\omega}L)^{-1}$.** We are already storing $(L^T\Theta_{\omega}L)^{-1}$ as the conditional covariance in the affine constraint. +## Three matrices -This matrix might be easier to compute for both data splitting and general case (when we know $\Sigma_{\omega}$). +- All the computations above can be expressed of some target specific info like $B, \Theta_{\hat{\theta}}, \Sigma_{\hat{\theta}}, \hat{\theta}$ and - - - -In order to compute `_P` suppose wee have stored $PL^T(L^T\Theta_{\omega}L)^{-1}LP$ as well as +$$ +\begin{aligned} +M_1 &= \Sigma_S \Theta_{\omega} \\ +M_2 &= M_1 \Sigma_{\omega} M_1^T \\ +M_3 &= M_1 L (L^T\Sigma_{\omega}L)^{-1} L M_1^T +\end{aligned} +$$ \ No newline at end of file diff --git a/doc/Gaussian queries.ipynb b/doc/Gaussian queries.ipynb index 84788d447..89d0cbc46 100644 --- a/doc/Gaussian queries.ipynb +++ b/doc/Gaussian queries.ipynb @@ -167,18 +167,17 @@ "\\end{aligned}\n", "$$\n", "\n", - "So, to compute `_prec` we need to compute this above matrix and apply it to $B\\Theta_{\\hat{\\theta}}$. **If we suppose that $\\Sigma_{\\omega}$ and $\\Sigma_S \\Theta_{\\omega}$ can be computed without $p^2$ memory then we only\n", - "have to store $L$ and $(L^T\\Theta_{\\omega}L)^{-1}$.** We are already storing $(L^T\\Theta_{\\omega}L)^{-1}$ as the conditional covariance in the affine constraint.\n", + "## Three matrices\n", "\n", - "This matrix might be easier to compute for both data splitting and general case (when we know $\\Sigma_{\\omega}$).\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In order to compute `_P` suppose wee have stored $PL^T(L^T\\Theta_{\\omega}L)^{-1}LP$ as well as " + "- All the computations above can be expressed of some target specific info like $B, \\Theta_{\\hat{\\theta}}, \\Sigma_{\\hat{\\theta}}, \\hat{\\theta}$ and\n", + "\n", + "$$\n", + "\\begin{aligned}\n", + "M_1 &= \\Sigma_S \\Theta_{\\omega} \\\\\n", + "M_2 &= M_1 \\Sigma_{\\omega} M_1^T \\\\\n", + "M_3 &= M_1 L (L^T\\Sigma_{\\omega}L)^{-1} L M_1^T\n", + "\\end{aligned}\n", + "$$" ] } ], diff --git a/selectinf/randomized/lasso.py b/selectinf/randomized/lasso.py index 397fce7ef..d1fa9bf9e 100644 --- a/selectinf/randomized/lasso.py +++ b/selectinf/randomized/lasso.py @@ -941,9 +941,10 @@ def _setup_implied_gaussian(self, prod_score_prec = np.identity(self.nfeature) / ratio cov_rand = self._hessian * dispersion - M1 = prod_score_prec.dot(cov_rand).dot(prod_score_prec.T) - M2 = prod_score_prec.dot(opt_linear.dot(cond_cov).dot(opt_linear.T)).dot(prod_score_prec.T) - M3 = prod_score_prec + + M1 = prod_score_prec + M2 = M1.dot(cov_rand).dot(M1.T) + M3 = M1.dot(opt_linear.dot(cond_cov).dot(opt_linear.T)).dot(M1.T) return (cond_mean, cond_cov, diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index dc59f6d9e..31909ac00 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -188,9 +188,9 @@ def _setup_implied_gaussian(self, cond_mean = regress_opt.dot(self.observed_score_state + observed_subgrad) - M1 = prod_score_prec.dot(cov_rand).dot(prod_score_prec.T) - M2 = prod_score_prec.dot(opt_linear.dot(cond_cov).dot(opt_linear.T)).dot(prod_score_prec.T) - M3 = prod_score_prec + M1 = prod_score_prec + M2 = 
M1.dot(cov_rand).dot(M1.T) + M3 = M1.dot(opt_linear.dot(cond_cov).dot(opt_linear.T)).dot(M1.T) return (cond_mean, cond_cov, @@ -1346,7 +1346,7 @@ def selective_MLE(observed_target, linear_part, offset, opt_linear, - M1, + M1, M2, M3, observed_score, @@ -1393,38 +1393,21 @@ def selective_MLE(observed_target, prec_opt = np.linalg.inv(cond_cov) - # regress_opt_target determines how the conditional mean of optimization variables - # vary with target - # regress_opt determines how the argument of the optimization density - # depends on the score, not how the mean depends on score, hence the minus sign - - ## regress_score_target = cov_target_score.T.dot(prec_target) - ## resid_score_target = score_offset - regress_score_target.dot(observed_target) - - ## regress_opt_target = regress_opt.dot(regress_score_target) - ## resid_mean_opt_target = cond_mean - regress_opt_target.dot(observed_target) - - # M1, M2, M3 can be computed quickly (assumption) -- we can make this - # faster later - # shorthand - - # these could be done by the query at `fit` time - # this is specific to target T1 = regress_target_score.T.dot(prec_target) - T2 = T1.T.dot(M1.dot(T1)) - T3 = T1.T.dot(M2.dot(T1)) + T2 = T1.T.dot(M2.dot(T1)) + T3 = T1.T.dot(M3.dot(T1)) prec_target_nosel = prec_target + T2 - T3 - _P = T1.T.dot(M3.dot(observed_score)) - T2.dot(observed_target) + _P = T1.T.dot(M1.dot(observed_score)) - T2.dot(observed_target) - T4 = M3.T.dot(T1) + T4 = M1.T.dot(T1) T5 = opt_linear.T.dot(T4) T6 = cond_cov.dot(T5) T7 = opt_linear.dot(T6) - T8 = M3.dot(T7) - T9 = T8.dot(observed_target) + M3.dot(opt_linear.dot(cond_mean)) + T8 = M1.dot(T7) + T9 = T8.dot(observed_target) + M1.dot(opt_linear.dot(cond_mean)) T10 = T1.T.dot(T9) C = cov_target.dot(T10) @@ -1442,7 +1425,7 @@ def selective_MLE(observed_target, offset, **solve_args) - T11 = regress_target_score.dot(M3.dot(opt_linear)) + T11 = regress_target_score.dot(M1.dot(opt_linear)) final_estimator = cov_target.dot(prec_target_nosel).dot(observed_target) \ + T11.dot(cond_mean - soln) + C From a762a480bf93755d8e5ec42f02a4d4f6ca19a37a Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Sun, 18 Jul 2021 22:41:37 -0400 Subject: [PATCH 111/187] commit changes so far --- selectinf/randomized/exact_reference.py | 1 + .../randomized/tests/test_exact_reference.py | 115 ++++++++++++------ 2 files changed, 76 insertions(+), 40 deletions(-) diff --git a/selectinf/randomized/exact_reference.py b/selectinf/randomized/exact_reference.py index 018d19074..429278d1e 100644 --- a/selectinf/randomized/exact_reference.py +++ b/selectinf/randomized/exact_reference.py @@ -76,6 +76,7 @@ def __init__(self, self.opt_linear = query.opt_linear self.useIP = useIP + self.inverse_info = inverse_info def summary(self, alternatives=None, diff --git a/selectinf/randomized/tests/test_exact_reference.py b/selectinf/randomized/tests/test_exact_reference.py index 7cb49ff11..7c1a5efb4 100644 --- a/selectinf/randomized/tests/test_exact_reference.py +++ b/selectinf/randomized/tests/test_exact_reference.py @@ -4,15 +4,16 @@ from ..lasso import lasso, selected_targets from ..exact_reference import exact_grid_inference -def test_approx_pivot(n=500, - p=100, - signal_fac=1., - s=5, - sigma=2., - rho=0.4, - randomizer_scale=1., - equicorrelated=False, - useIP=False): +def test_inf(n=500, + p=100, + signal_fac=1., + s=5, + sigma=2., + rho=0.4, + randomizer_scale=1., + equicorrelated=False, + useIP=False, + CI=True): while True: @@ -66,37 +67,71 @@ def test_approx_pivot(n=500, cov_target_score, useIP=useIP) - pivot = 
exact_grid_inf._pivots(beta_target) + if CI is False: + pivot = exact_grid_inf._pivots(beta_target) + return pivot + + else: + lci, uci = exact_grid_inf._intervals(level=0.90) + coverage = (lci < beta_target) * (uci > beta_target) + length = uci - lci + mle_length = 1.65*2 * np.sqrt(np.diag(exact_grid_inf.inverse_info)) + return np.mean(coverage), np.mean(length), np.mean(mle_length) + +def main(nsim=300, CI = False): + + if CI is False: + + import matplotlib as mpl + mpl.use('tkagg') + import matplotlib.pyplot as plt + from statsmodels.distributions.empirical_distribution import ECDF + + _pivot = [] + for i in range(nsim): + _pivot.extend(test_inf(n=100, + p=400, + signal_fac=1., + s=0, + sigma=2., + rho=0.30, + randomizer_scale=0.7, + equicorrelated=True, + useIP=False, + CI=False)) + + print("iteration completed ", i) + + plt.clf() + ecdf_pivot = ECDF(np.asarray(_pivot)) + grid = np.linspace(0, 1, 101) + plt.plot(grid, ecdf_pivot(grid), c='blue') + plt.plot(grid, grid, 'k--') + plt.show() + + else: + coverage_ = 0. + length_ = 0. + mle_length_= 0. + for n in range(nsim): + cov, len, mle_len = test_inf(n=400, + p=100, + signal_fac=0.5, + s=5, + sigma=2., + rho=0.30, + randomizer_scale=0.7, + equicorrelated=True, + useIP=False, + CI=True) + + coverage_ += cov + length_ += len + mle_length_ += mle_len + print("coverage so far ", coverage_ / (n + 1.)) + print("lengths so far ", length_ / (n + 1.), mle_length_/ (n + 1.)) + print("iteration completed ", n + 1) - return pivot - -def main(nsim=300): - - import matplotlib as mpl - mpl.use('tkagg') - import matplotlib.pyplot as plt - from statsmodels.distributions.empirical_distribution import ECDF - - _pivot = [] - for i in range(nsim): - _pivot.extend(test_approx_pivot(n=100, - p=400, - signal_fac=1., - s=0, - sigma=2., - rho=0.30, - randomizer_scale=0.7, - equicorrelated=True, - useIP=False)) - - print("iteration completed ", i) - - plt.clf() - ecdf_pivot = ECDF(np.asarray(_pivot)) - grid = np.linspace(0, 1, 101) - plt.plot(grid, ecdf_pivot(grid), c='blue') - plt.plot(grid, grid, 'k--') - plt.show() if __name__ == "__main__": - main(nsim=100) \ No newline at end of file + main(nsim=100, CI=True) \ No newline at end of file From 7856e7ac19db87b6d24e8415fa149299472e7af4 Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Mon, 19 Jul 2021 10:42:19 -0400 Subject: [PATCH 112/187] commit before switch --- selectinf/randomized/lasso.py | 2 +- selectinf/randomized/tests/test_split_lasso.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/selectinf/randomized/lasso.py b/selectinf/randomized/lasso.py index d1fa9bf9e..53d5ff1fc 100644 --- a/selectinf/randomized/lasso.py +++ b/selectinf/randomized/lasso.py @@ -942,7 +942,7 @@ def _setup_implied_gaussian(self, cov_rand = self._hessian * dispersion - M1 = prod_score_prec + M1 = prod_score_prec M2 = M1.dot(cov_rand).dot(M1.T) M3 = M1.dot(opt_linear.dot(cond_cov).dot(opt_linear.T)).dot(M1.T) diff --git a/selectinf/randomized/tests/test_split_lasso.py b/selectinf/randomized/tests/test_split_lasso.py index 78df932a9..f994c05cc 100644 --- a/selectinf/randomized/tests/test_split_lasso.py +++ b/selectinf/randomized/tests/test_split_lasso.py @@ -163,4 +163,3 @@ def main(nsim=500, n=100, p=200, target='selected', sigma=3, s=3): plt.savefig("plot.pdf") plt.show() - From 73137f81f68d1770d1ac2fc4954cc91dd40ab1bc Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Mon, 19 Jul 2021 16:25:31 -0400 Subject: [PATCH 113/187] some sign fixes --- selectinf/randomized/query.py | 8 +- 
.../tests/test_selective_MLE_high.py | 586 ++---------------- 2 files changed, 67 insertions(+), 527 deletions(-) diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index 31909ac00..fa4560ffb 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -1400,16 +1400,18 @@ def selective_MLE(observed_target, T3 = T1.T.dot(M3.dot(T1)) prec_target_nosel = prec_target + T2 - T3 - _P = T1.T.dot(M1.dot(observed_score)) - T2.dot(observed_target) + _P = -(T1.T.dot(M1.dot(observed_score)) + T2.dot(observed_target)) ##flipped sign of second term here T4 = M1.T.dot(T1) T5 = opt_linear.T.dot(T4) T6 = cond_cov.dot(T5) T7 = opt_linear.dot(T6) T8 = M1.dot(T7) - T9 = T8.dot(observed_target) + M1.dot(opt_linear.dot(cond_mean)) + T9 = (-T8.dot(observed_target) + M1.dot(opt_linear.dot(cond_mean))) + #T9 = M1.dot(opt_linear.dot(cond_mean)) T10 = T1.T.dot(T9) - C = cov_target.dot(T10) + C = cov_target.dot(_P - T10) + print("check within MLE ", np.allclose(T2 - T3, np.zeros((T2.shape[0], T2.shape[1]))), np.allclose((_P-T10), np.zeros(T10.shape[0]))) conjugate_arg = prec_opt.dot(cond_mean) diff --git a/selectinf/randomized/tests/test_selective_MLE_high.py b/selectinf/randomized/tests/test_selective_MLE_high.py index da592da87..1df9b2930 100644 --- a/selectinf/randomized/tests/test_selective_MLE_high.py +++ b/selectinf/randomized/tests/test_selective_MLE_high.py @@ -24,7 +24,7 @@ def test_full_targets(n=200, randomizer_scale=0.7, full_dispersion=False): """ - Run approx MLE with full targets on Gaussian data + Compare to R randomized lasso """ inst, const = gaussian_instance, lasso.gaussian @@ -86,30 +86,31 @@ def test_full_targets(n=200, cov_target_score)[0] pval = result['pvalue'] estimate = result['MLE'] - intervals = np.asarray(result[['lower_confidence', - 'upper_confidence']]) - + intervals = np.asarray(result[['lower_confidence', 'upper_confidence']]) print("estimate, intervals", estimate, intervals) coverage = (beta[nonzero] > intervals[:, 0]) * (beta[nonzero] < intervals[:, 1]) return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0], coverage, intervals -def test_selected_targets(n=2000, - p=200, - signal_fac=10., - s=5, - sigma=3, - rho=0.4, - randomizer_scale=1, + +def test_selected_targets(seedn, + n=2000, + p=200, + signal_fac=1.2, + s=5, + sigma=2, + rho=0.7, + randomizer_scale=1., full_dispersion=True): """ - Run approx MLE with selected targets on Gaussian data + Compare to R randomized lasso """ inst, const = gaussian_instance, lasso.gaussian signal = np.sqrt(signal_fac * 2 * np.log(p)) while True: + np.random.seed(seed=seedn) X, Y, beta = inst(n=n, p=p, signal=signal, @@ -156,522 +157,17 @@ def test_selected_targets(n=2000, cov_target_score)[0] estimate = result['MLE'] pval = result['pvalue'] + intervals = np.asarray(result[['lower_confidence', 'upper_confidence']]) - intervals = np.asarray(result[['lower_confidence', - 'upper_confidence']]) - beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) coverage = (beta_target > intervals[:, 0]) * (beta_target < intervals[:, 1]) - return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0], coverage, intervals - -def test_logistic(n=2000, - p=200, - signal_fac=10., - s=5, - rho=0.4, - randomizer_scale=1): - """ - Run approx MLE with selected targets on binomial data - """ - - inst, const = logistic_instance, lasso.logistic - signal = np.sqrt(signal_fac * 2 * np.log(p)) - - while True: - X, Y, beta = inst(n=n, - p=p, - signal=signal, - s=s, - equicorrelated=False, - rho=rho, - random_signs=True)[:3] - 
- n, p = X.shape - - sigma_ = np.std(Y) - W = np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ - - conv = const(X, - Y, - W, - randomizer_scale=randomizer_scale * sigma_) - - signs = conv.fit() - nonzero = signs != 0 - print("dimensions", n, p, nonzero.sum()) - - if nonzero.sum() > 0: - - (observed_target, - cov_target, - cov_target_score, - alternatives) = selected_targets(conv.loglike, - conv._W, - nonzero, - dispersion=1) - - result = conv.selective_MLE(observed_target, - cov_target, - cov_target_score)[0] - estimate = result['MLE'] - pval = result['pvalue'] - intervals = np.asarray(result[['lower_confidence', - 'upper_confidence']]) - - return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0], intervals - -def test_logistic_split(n=2000, - p=200, - signal_fac=10., - s=5, - rho=0.4, - randomizer_scale=1): - """ - Run approx MLE with selected targets on binomial data with data splitting - """ - - inst, const = logistic_instance, split_lasso.logistic - signal = np.sqrt(signal_fac * 2 * np.log(p)) - - while True: - X, Y, beta = inst(n=n, - p=p, - signal=signal, - s=s, - equicorrelated=False, - rho=rho, - random_signs=True)[:3] - - n, p = X.shape - - sigma_ = np.std(Y) - W = np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ - - conv = const(X, - Y, - W, - proportion=0.7) - - signs = conv.fit() - nonzero = signs != 0 - print("dimensions", n, p, nonzero.sum()) - - if nonzero.sum() > 0: - - (observed_target, - cov_target, - cov_target_score, - alternatives) = selected_targets(conv.loglike, - conv._W, - nonzero, - dispersion=1) - - result = conv.selective_MLE(observed_target, - cov_target, - cov_target_score)[0] - estimate = result['MLE'] - pval = result['pvalue'] - intervals = np.asarray(result[['lower_confidence', - 'upper_confidence']]) - - return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0], intervals - -def test_poisson(n=2000, - p=200, - signal_fac=10., - s=5, - rho=0.4, - randomizer_scale=1): - """ - Run approx MLE with selected targets on Poisson data - """ - - inst, const = poisson_instance, lasso.poisson - signal = np.sqrt(signal_fac * 2 * np.log(p)) - - while True: - X, Y, beta = inst(n=n, - p=p, - signal=signal, - s=s, - equicorrelated=False, - rho=rho, - random_signs=True)[:3] - - n, p = X.shape - - sigma_ = np.std(Y) - W = np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ - - conv = const(X, - Y, - W, - randomizer_scale=randomizer_scale * sigma_) - - signs = conv.fit() - nonzero = signs != 0 - print("dimensions", n, p, nonzero.sum()) - - if nonzero.sum() > 0: - - (observed_target, - cov_target, - cov_target_score, - alternatives) = selected_targets(conv.loglike, - conv._W, - nonzero, - dispersion=1) - - result = conv.selective_MLE(observed_target, - cov_target, - cov_target_score)[0] - estimate = result['MLE'] - pval = result['pvalue'] - intervals = np.asarray(result[['lower_confidence', - 'upper_confidence']]) - - return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0], intervals - -def test_poisson_split(n=2000, - p=200, - signal_fac=10., - s=5, - rho=0.4, - randomizer_scale=1): - """ - Run approx MLE with selected targets on Poisson data with data splitting - """ - - inst, const = poisson_instance, split_lasso.poisson - signal = np.sqrt(signal_fac * 2 * np.log(p)) - - while True: - X, Y, beta = inst(n=n, - p=p, - signal=signal, - s=s, - equicorrelated=False, - rho=rho, - random_signs=True)[:3] - - n, p = X.shape - - sigma_ = np.std(Y) - W = np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ - - conv = const(X, - Y, - W, - proportion=0.7) - - signs = 
conv.fit() - nonzero = signs != 0 - print("dimensions", n, p, nonzero.sum()) - - if nonzero.sum() > 0: - - (observed_target, - cov_target, - cov_target_score, - alternatives) = selected_targets(conv.loglike, - conv._W, - nonzero, - dispersion=1) - - result = conv.selective_MLE(observed_target, - cov_target, - cov_target_score)[0] - estimate = result['MLE'] - pval = result['pvalue'] - intervals = np.asarray(result[['lower_confidence', - 'upper_confidence']]) - - return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0], intervals - -def test_cox(n=2000, - p=200, - signal_fac=10., - s=5, - rho=0.4, - randomizer_scale=1): - """ - Run approx MLE with selected targets on survival data - """ - - inst, const = cox_instance, lasso.coxph - signal = np.sqrt(signal_fac * 2 * np.log(p)) - - while True: - X, T, S, beta = inst(n=n, - p=p, - signal=signal, - s=s, - equicorrelated=False, - rho=rho, - random_signs=True)[:4] - - n, p = X.shape - - W = np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) - - conv = const(X, - T, - S, - W, - randomizer_scale=randomizer_scale) - - signs = conv.fit() - nonzero = signs != 0 - print("dimensions", n, p, nonzero.sum()) - - if nonzero.sum() > 0: - - cox_full = rr.glm.cox(X, T, S) - full_hess = cox_full.hessian(conv.observed_soln) - - (observed_target, - cov_target, - cov_target_score, - alternatives) = selected_targets(conv.loglike, - None, - nonzero, - hessian=full_hess, - dispersion=1) - - result = conv.selective_MLE(observed_target, - cov_target, - cov_target_score)[0] - estimate = result['MLE'] - pval = result['pvalue'] - intervals = np.asarray(result[['lower_confidence', - 'upper_confidence']]) - - return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0], intervals - -def test_cox_split(n=2000, - p=200, - signal_fac=10., - s=5, - rho=0.4, - randomizer_scale=1): - """ - Run approx MLE with selected targets on survival data with data splitting - """ - - inst, const = cox_instance, split_lasso.coxph - signal = np.sqrt(signal_fac * 2 * np.log(p)) - - while True: - X, T, S, beta = inst(n=n, - p=p, - signal=signal, - s=s, - equicorrelated=False, - rho=rho, - random_signs=True)[:4] - - n, p = X.shape - - W = np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) - - conv = const(X, - T, - S, - W, - proportion=0.7) - - signs = conv.fit() - nonzero = signs != 0 - print("dimensions", n, p, nonzero.sum()) - - if nonzero.sum() > 0: - - cox_full = rr.glm.cox(X, T, S) - full_hess = cox_full.hessian(conv.observed_soln) - (observed_target, - cov_target, - cov_target_score, - alternatives) = selected_targets(conv.loglike, - None, - nonzero, - hessian=full_hess, - dispersion=1) - - result = conv.selective_MLE(observed_target, - cov_target, - cov_target_score)[0] - estimate = result['MLE'] - pval = result['pvalue'] - intervals = np.asarray(result[['lower_confidence', - 'upper_confidence']]) - - return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0], intervals - -def test_scale_invariant_split(n=200, - p=20, - signal_fac=10., - s=5, - sigma=3, - rho=0.4, - randomizer_scale=1, - full_dispersion=True, - seed=2): - """ - Confirm Gaussian version is appropriately scale invariant with data splitting - """ - - inst, const = gaussian_instance, split_lasso.gaussian - signal = np.sqrt(signal_fac * 2 * np.log(p)) - - results = [] - - scales = [1, 5] - for scale in scales: - - np.random.seed(seed) - X, Y, beta = inst(n=n, - p=p, - signal=signal, - s=s, - equicorrelated=False, - rho=rho, - sigma=sigma, - random_signs=True)[:3] - - Y *= scale; beta *= scale - n, p = X.shape - - sigma_ = np.std(Y) - W = 
np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ - print('W', W[0]/scale) - conv = const(X, - Y, - W, - proportion=0.7) - - signs = conv.fit() - nonzero = signs != 0 - print('nonzero', np.where(nonzero)[0]) - print('feature_weights', conv.feature_weights[0] / scale) - dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) - - (observed_target, - cov_target, - cov_target_score, - alternatives) = selected_targets(conv.loglike, - conv._W, - nonzero, - dispersion=dispersion) - - print('dispersion', dispersion/scale**2) - print('target', observed_target[0]/scale) - print('cov_target', cov_target[0,0]/scale**2) - print('cov_target_score', cov_target_score[0,0]/scale**2) - - result = conv.selective_MLE(observed_target, - cov_target, - cov_target_score)[0] - - print(result['MLE'] / scale) - results.append(result) - - assert np.allclose(results[0]['MLE'] / scales[0], - results[1]['MLE'] / scales[1]) - assert np.allclose(results[0]['SE'] / scales[0], - results[1]['SE'] / scales[1]) - assert np.allclose(results[0]['upper_confidence'] / scales[0], - results[1]['upper_confidence'] / scales[1]) - assert np.allclose(results[0]['lower_confidence'] / scales[0], - results[1]['lower_confidence'] / scales[1]) - assert np.allclose(results[0]['Zvalue'], - results[1]['Zvalue']) - assert np.allclose(results[0]['pvalue'], - results[1]['pvalue']) - -def test_scale_invariant(n=200, - p=20, - signal_fac=10., - s=5, - sigma=3, - rho=0.4, - randomizer_scale=1, - full_dispersion=True, - seed=2): - """ - Confirm Gaussian version is appropriately scale invariant - """ - - inst, const = gaussian_instance, lasso.gaussian - signal = np.sqrt(signal_fac * 2 * np.log(p)) - - results = [] - - scales = [1, 5] - for scale in scales: - - np.random.seed(seed) - X, Y, beta = inst(n=n, - p=p, - signal=signal, - s=s, - equicorrelated=False, - rho=rho, - sigma=sigma, - random_signs=True)[:3] + # print("check ", np.asarray(result['MLE']), np.asarray(result['unbiased'])) - Y *= scale; beta *= scale - n, p = X.shape - - sigma_ = np.std(Y) - W = np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ - print('W', W[0]/scale) - conv = const(X, - Y, - W, - randomizer_scale=randomizer_scale * sigma_) + #return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0], coverage, intervals + return result['MLE'], result['lower_confidence'], result['upper_confidence'] - signs = conv.fit() - nonzero = signs != 0 - print('nonzero', np.where(nonzero)[0]) - print('feature_weights', conv.feature_weights[0] / scale) - print('perturb', conv._initial_omega[0] / scale) - dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) - - (observed_target, - cov_target, - cov_target_score, - alternatives) = selected_targets(conv.loglike, - conv._W, - nonzero, - dispersion=dispersion) - - print('dispersion', dispersion/scale**2) - print('target', observed_target[0]/scale) - print('cov_target', cov_target[0,0]/scale**2) - print('cov_target_score', cov_target_score[0,0]/scale**2) - - result = conv.selective_MLE(observed_target, - cov_target, - cov_target_score)[0] - - print(result['MLE'] / scale) - results.append(result) - - assert np.allclose(results[0]['MLE'] / scales[0], - results[1]['MLE'] / scales[1]) - assert np.allclose(results[0]['SE'] / scales[0], - results[1]['SE'] / scales[1]) - assert np.allclose(results[0]['upper_confidence'] / scales[0], - results[1]['upper_confidence'] / scales[1]) - assert np.allclose(results[0]['lower_confidence'] / scales[0], - results[1]['lower_confidence'] / scales[1]) - assert 
np.allclose(results[0]['Zvalue'], - results[1]['Zvalue']) - assert np.allclose(results[0]['pvalue'], - results[1]['pvalue']) - def test_instance(): n, p, s = 500, 100, 5 @@ -705,8 +201,7 @@ def test_instance(): cov_target_score)[0] estimate = result['MLE'] pval = result['pvalue'] - intervals = np.asarray(result[['lower_confidence', - 'upper_confidence']]) + intervals = np.asarray(result[['lower_confidence', 'upper_confidence']]) beta_target = np.linalg.pinv(X[:, M]).dot(X.dot(beta)) @@ -716,6 +211,17 @@ def test_instance(): return coverage + +# def main(nsim=500): +# +# cover = [] +# for i in range(nsim): +# +# cover_ = test_instance() +# cover.extend(cover_) +# print(np.mean(cover), 'coverage so far ') + + def test_selected_targets_disperse(n=500, p=100, signal_fac=1., @@ -786,7 +292,7 @@ def test_selected_targets_disperse(n=500, return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0], coverage, intervals -def main(nsim=500, full=False): +def test_inf(nsim=500, full=False): P0, PA, cover, length_int = [], [], [], [] from statsmodels.distributions import ECDF @@ -808,9 +314,41 @@ def main(nsim=500, full=False): cover.extend(cover_) P0.extend(p0) PA.extend(pA) + # print( + # np.array(PA) < 0.1, np.mean(P0), np.std(P0), np.mean(np.array(P0) < 0.1), np.mean(np.array(PA) < 0.1), np.mean(cover), + # np.mean(avg_length), 'null pvalue + power + length') print("coverage and lengths ", np.mean(cover), np.mean(avg_length)) +def main(nsim =50): + + import pandas as pd + column_names = ["Experiment Replicate", "MLE", "Lower Conf", "Upper Conf"] + master_DF = pd.DataFrame(columns=column_names) + DF = pd.DataFrame(columns=column_names) + + n, p, s = 500, 100, 5 + for i in range(nsim): + full_dispersion = True + mle, lower_conf, upper_conf = test_selected_targets(n=n, p=p, s=s, signal_fac=1.2, full_dispersion=full_dispersion, seedn=i) + #print("check ", mle, lower_conf, upper_conf) + DF["MLE"] = pd.Series(mle) + DF["Lower Conf"] = pd.Series(lower_conf) + DF["Upper Conf"] = pd.Series(upper_conf) + DF["Experiment Replicate"] = pd.Series((i*np.ones(len(mle),int)).tolist()) + + master_DF = DF.append(master_DF, ignore_index=True) + + import os + outpath = os.path.dirname(__file__) + + outfile_mse_html = os.path.join(outpath, "compare_mle.html") + outfile_mse_csv = os.path.join(outpath, "compare_mle.csv") + + master_DF.to_html(outfile_mse_html, index=False) + master_DF.to_csv(outfile_mse_csv, index=False) + if __name__ == "__main__": main(nsim=50) + From 5449179e1605071abd145e3c3a763139fb59e806 Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Mon, 19 Jul 2021 23:19:05 -0400 Subject: [PATCH 114/187] commit before switch --- selectinf/randomized/query.py | 6 +- .../tests/test_selective_MLE_high.py | 66 ++++++++++--------- 2 files changed, 38 insertions(+), 34 deletions(-) diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index fa4560ffb..8afe3e5aa 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -1408,10 +1408,9 @@ def selective_MLE(observed_target, T7 = opt_linear.dot(T6) T8 = M1.dot(T7) T9 = (-T8.dot(observed_target) + M1.dot(opt_linear.dot(cond_mean))) - #T9 = M1.dot(opt_linear.dot(cond_mean)) T10 = T1.T.dot(T9) C = cov_target.dot(_P - T10) - print("check within MLE ", np.allclose(T2 - T3, np.zeros((T2.shape[0], T2.shape[1]))), np.allclose((_P-T10), np.zeros(T10.shape[0]))) + print("check within MLE ", np.allclose(T2 - T3, np.zeros((T2.shape[0], T2.shape[1]))), np.allclose(C, np.zeros(C.shape[0]))) conjugate_arg = prec_opt.dot(cond_mean) @@ 
-1463,7 +1462,8 @@ def selective_MLE(observed_target, 'lower_confidence': intervals[:, 0], 'upper_confidence': intervals[:, 1], 'unbiased': unbiased_estimator}) - return result, observed_info_mean, log_ref + return result, observed_info_mean, log_ref,\ + T11[:,0], cond_mean - soln, cov_target.dot(prec_target_nosel).dot(observed_target), C diff --git a/selectinf/randomized/tests/test_selective_MLE_high.py b/selectinf/randomized/tests/test_selective_MLE_high.py index 1df9b2930..143ac5c1a 100644 --- a/selectinf/randomized/tests/test_selective_MLE_high.py +++ b/selectinf/randomized/tests/test_selective_MLE_high.py @@ -152,9 +152,9 @@ def test_selected_targets(seedn, nonzero, dispersion=dispersion) - result = conv.selective_MLE(observed_target, + result, _, _, X1, X2, X3, X4 = conv.selective_MLE(observed_target, cov_target, - cov_target_score)[0] + cov_target_score) estimate = result['MLE'] pval = result['pvalue'] intervals = np.asarray(result[['lower_confidence', 'upper_confidence']]) @@ -166,7 +166,7 @@ def test_selected_targets(seedn, # print("check ", np.asarray(result['MLE']), np.asarray(result['unbiased'])) #return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0], coverage, intervals - return result['MLE'], result['lower_confidence'], result['upper_confidence'] + return result['MLE'], result['lower_confidence'], result['upper_confidence'], X1, X2, X3, X4 def test_instance(): @@ -292,50 +292,54 @@ def test_selected_targets_disperse(n=500, return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0], coverage, intervals -def test_inf(nsim=500, full=False): - P0, PA, cover, length_int = [], [], [], [] - from statsmodels.distributions import ECDF - - n, p, s = 500, 100, 0 - - for i in range(nsim): - if full: - if n > p: - full_dispersion = True - else: - full_dispersion = False - p0, pA, cover_, intervals = test_full_targets(n=n, p=p, s=s, full_dispersion=full_dispersion) - avg_length = intervals[:, 1] - intervals[:, 0] - else: - full_dispersion = True - p0, pA, cover_, intervals = test_selected_targets(n=n, p=p, s=s, full_dispersion=full_dispersion) - avg_length = intervals[:, 1] - intervals[:, 0] - - cover.extend(cover_) - P0.extend(p0) - PA.extend(pA) - # print( - # np.array(PA) < 0.1, np.mean(P0), np.std(P0), np.mean(np.array(P0) < 0.1), np.mean(np.array(PA) < 0.1), np.mean(cover), - # np.mean(avg_length), 'null pvalue + power + length') - print("coverage and lengths ", np.mean(cover), np.mean(avg_length)) +# def main(nsim=500, full=False): +# P0, PA, cover, length_int = [], [], [], [] +# from statsmodels.distributions import ECDF +# +# n, p, s = 500, 100, 0 +# +# for i in range(nsim): +# if full: +# if n > p: +# full_dispersion = True +# else: +# full_dispersion = False +# p0, pA, cover_, intervals = test_full_targets(n=n, p=p, s=s, full_dispersion=full_dispersion) +# avg_length = intervals[:, 1] - intervals[:, 0] +# else: +# full_dispersion = True +# p0, pA, cover_, intervals = test_selected_targets(n=n, p=p, s=s, full_dispersion=full_dispersion) +# avg_length = intervals[:, 1] - intervals[:, 0] +# +# cover.extend(cover_) +# P0.extend(p0) +# PA.extend(pA) +# # print( +# # np.array(PA) < 0.1, np.mean(P0), np.std(P0), np.mean(np.array(P0) < 0.1), np.mean(np.array(PA) < 0.1), np.mean(cover), +# # np.mean(avg_length), 'null pvalue + power + length') +# print("coverage and lengths ", np.mean(cover), np.mean(avg_length)) def main(nsim =50): import pandas as pd - column_names = ["Experiment Replicate", "MLE", "Lower Conf", "Upper Conf"] + column_names = ["Experiment Replicate", "MLE", "Lower 
Conf", "Upper Conf", "X1", "X2", "X3", "X4"] master_DF = pd.DataFrame(columns=column_names) DF = pd.DataFrame(columns=column_names) n, p, s = 500, 100, 5 for i in range(nsim): full_dispersion = True - mle, lower_conf, upper_conf = test_selected_targets(n=n, p=p, s=s, signal_fac=1.2, full_dispersion=full_dispersion, seedn=i) + mle, lower_conf, upper_conf, X1, X2, X3, X4 = test_selected_targets(seedn=i, n=n, p=p, s=s, signal_fac=1.2, full_dispersion=full_dispersion) #print("check ", mle, lower_conf, upper_conf) DF["MLE"] = pd.Series(mle) DF["Lower Conf"] = pd.Series(lower_conf) DF["Upper Conf"] = pd.Series(upper_conf) DF["Experiment Replicate"] = pd.Series((i*np.ones(len(mle),int)).tolist()) + DF["X1"] = pd.Series(X1) + DF["X2"] = pd.Series(X2) + DF["X3"] = pd.Series(X3) + DF["X4"] = pd.Series(X4) master_DF = DF.append(master_DF, ignore_index=True) From cce962e70bf406b156e55b8d3048aed8b51bc185 Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Tue, 20 Jul 2021 09:00:22 -0400 Subject: [PATCH 115/187] regress_target_score scaled by dispersion --- selectinf/randomized/lasso.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/selectinf/randomized/lasso.py b/selectinf/randomized/lasso.py index 53d5ff1fc..bbb9cb7c2 100644 --- a/selectinf/randomized/lasso.py +++ b/selectinf/randomized/lasso.py @@ -738,7 +738,7 @@ def selected_targets(loglike, regress_target_score = np.zeros((cov_target.shape[0], p)) regress_target_score[:,features] = cov_target - return observed_target, cov_target * dispersion, regress_target_score, alternatives + return observed_target, cov_target * dispersion, regress_target_score * dispersion, alternatives def full_targets(loglike, W, From cb73217c7d9a876909abe9865e481c46e3659c0e Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Tue, 20 Jul 2021 09:36:23 -0400 Subject: [PATCH 116/187] changes to selective mle + target: added comments --- selectinf/randomized/lasso.py | 2 +- selectinf/randomized/query.py | 11 ++- .../tests/test_selective_MLE_high.py | 87 +++++++++++++++---- 3 files changed, 77 insertions(+), 23 deletions(-) diff --git a/selectinf/randomized/lasso.py b/selectinf/randomized/lasso.py index bbb9cb7c2..f5b27936e 100644 --- a/selectinf/randomized/lasso.py +++ b/selectinf/randomized/lasso.py @@ -737,7 +737,7 @@ def selected_targets(loglike, Xfeat.dot(observed_target))) ** 2 / W).sum() / (n - Xfeat.shape[1]) regress_target_score = np.zeros((cov_target.shape[0], p)) - regress_target_score[:,features] = cov_target + regress_target_score[:,features] = cov_target ##scale by dispersion while returning the value return observed_target, cov_target * dispersion, regress_target_score * dispersion, alternatives def full_targets(loglike, diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index 8afe3e5aa..43306aa1c 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -166,7 +166,7 @@ def _setup_implied_gaussian(self, dispersion=1): cov_rand, prec = self.randomizer.cov_prec - prec = prec / dispersion # why do we do this here -- prec is just known + prec = prec if np.asarray(prec).shape in [(), (0,)]: _prod_score_prec_unnorm = self._hessian * prec @@ -1407,10 +1407,9 @@ def selective_MLE(observed_target, T6 = cond_cov.dot(T5) T7 = opt_linear.dot(T6) T8 = M1.dot(T7) - T9 = (-T8.dot(observed_target) + M1.dot(opt_linear.dot(cond_mean))) + T9 = (-T8.dot(observed_target) + M1.dot(opt_linear.dot(cond_mean))) ##flipped sign of first term here T10 = T1.T.dot(T9) - C = cov_target.dot(_P - T10) - print("check within 
MLE ", np.allclose(T2 - T3, np.zeros((T2.shape[0], T2.shape[1]))), np.allclose(C, np.zeros(C.shape[0]))) + C = cov_target.dot(_P - T10) ##added missing _P in computing C conjugate_arg = prec_opt.dot(cond_mean) @@ -1462,8 +1461,8 @@ def selective_MLE(observed_target, 'lower_confidence': intervals[:, 0], 'upper_confidence': intervals[:, 1], 'unbiased': unbiased_estimator}) - return result, observed_info_mean, log_ref,\ - T11[:,0], cond_mean - soln, cov_target.dot(prec_target_nosel).dot(observed_target), C + + return result, observed_info_mean, log_ref diff --git a/selectinf/randomized/tests/test_selective_MLE_high.py b/selectinf/randomized/tests/test_selective_MLE_high.py index 143ac5c1a..b53340b9c 100644 --- a/selectinf/randomized/tests/test_selective_MLE_high.py +++ b/selectinf/randomized/tests/test_selective_MLE_high.py @@ -93,8 +93,7 @@ def test_full_targets(n=200, return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0], coverage, intervals -def test_selected_targets(seedn, - n=2000, +def test_selected_targets(n=2000, p=200, signal_fac=1.2, s=5, @@ -110,7 +109,6 @@ def test_selected_targets(seedn, signal = np.sqrt(signal_fac * 2 * np.log(p)) while True: - np.random.seed(seed=seedn) X, Y, beta = inst(n=n, p=p, signal=signal, @@ -152,9 +150,9 @@ def test_selected_targets(seedn, nonzero, dispersion=dispersion) - result, _, _, X1, X2, X3, X4 = conv.selective_MLE(observed_target, + result = conv.selective_MLE(observed_target, cov_target, - cov_target_score) + cov_target_score)[0] estimate = result['MLE'] pval = result['pvalue'] intervals = np.asarray(result[['lower_confidence', 'upper_confidence']]) @@ -163,10 +161,7 @@ def test_selected_targets(seedn, coverage = (beta_target > intervals[:, 0]) * (beta_target < intervals[:, 1]) - # print("check ", np.asarray(result['MLE']), np.asarray(result['unbiased'])) - - #return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0], coverage, intervals - return result['MLE'], result['lower_confidence'], result['upper_confidence'], X1, X2, X3, X4 + return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0], coverage, intervals def test_instance(): @@ -320,26 +315,86 @@ def test_selected_targets_disperse(n=500, # print("coverage and lengths ", np.mean(cover), np.mean(avg_length)) +def test_selected_instance(seedn, + n=2000, + p=200, + signal_fac=1.2, + s=5, + sigma=2, + rho=0.7, + randomizer_scale=1., + full_dispersion=True): + """ + Compare to R randomized lasso + """ + + inst, const = gaussian_instance, lasso.gaussian + signal = np.sqrt(signal_fac * 2 * np.log(p)) + + while True: + np.random.seed(seed=seedn) + X, Y, beta = inst(n=n, + p=p, + signal=signal, + s=s, + equicorrelated=True, + rho=rho, + sigma=sigma, + random_signs=True)[:3] + + idx = np.arange(p) + sigmaX = rho ** np.abs(np.subtract.outer(idx, idx)) + print("snr", beta.T.dot(sigmaX).dot(beta) / ((sigma ** 2.) 
* n)) + + n, p = X.shape + + sigma_ = np.std(Y) + W = 0.8 * np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ + + conv = const(X, + Y, + W, + ridge_term=0., + randomizer_scale=randomizer_scale * sigma_) + + signs = conv.fit() + nonzero = signs != 0 + print("dimensions", n, p, nonzero.sum()) + + if nonzero.sum() > 0: + dispersion = None + if full_dispersion: + dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) + + (observed_target, + cov_target, + cov_target_score, + alternatives) = selected_targets(conv.loglike, + conv._W, + nonzero, + dispersion=dispersion) + + result = conv.selective_MLE(observed_target, + cov_target, + cov_target_score)[0] + + return result['MLE'], result['lower_confidence'], result['upper_confidence'] + def main(nsim =50): import pandas as pd - column_names = ["Experiment Replicate", "MLE", "Lower Conf", "Upper Conf", "X1", "X2", "X3", "X4"] + column_names = ["Experiment Replicate", "MLE", "Lower Conf", "Upper Conf"] master_DF = pd.DataFrame(columns=column_names) DF = pd.DataFrame(columns=column_names) n, p, s = 500, 100, 5 for i in range(nsim): full_dispersion = True - mle, lower_conf, upper_conf, X1, X2, X3, X4 = test_selected_targets(seedn=i, n=n, p=p, s=s, signal_fac=1.2, full_dispersion=full_dispersion) - #print("check ", mle, lower_conf, upper_conf) + mle, lower_conf, upper_conf = test_selected_instance(seedn=i, n=n, p=p, s=s, signal_fac=1.2, full_dispersion=full_dispersion) DF["MLE"] = pd.Series(mle) DF["Lower Conf"] = pd.Series(lower_conf) DF["Upper Conf"] = pd.Series(upper_conf) DF["Experiment Replicate"] = pd.Series((i*np.ones(len(mle),int)).tolist()) - DF["X1"] = pd.Series(X1) - DF["X2"] = pd.Series(X2) - DF["X3"] = pd.Series(X3) - DF["X4"] = pd.Series(X4) master_DF = DF.append(master_DF, ignore_index=True) From e99d75eb185b58d741e9f61d23f510858c878a7b Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Sun, 25 Jul 2021 16:13:30 -0400 Subject: [PATCH 117/187] updated query: moved calculations for M1, M2, M3 --- selectinf/randomized/lasso.py | 8 +-- selectinf/randomized/query.py | 71 +++++++++---------- .../tests/test_selective_MLE_high.py | 51 +++++++++---- 3 files changed, 73 insertions(+), 57 deletions(-) diff --git a/selectinf/randomized/lasso.py b/selectinf/randomized/lasso.py index f5b27936e..9917d229c 100644 --- a/selectinf/randomized/lasso.py +++ b/selectinf/randomized/lasso.py @@ -737,8 +737,8 @@ def selected_targets(loglike, Xfeat.dot(observed_target))) ** 2 / W).sum() / (n - Xfeat.shape[1]) regress_target_score = np.zeros((cov_target.shape[0], p)) - regress_target_score[:,features] = cov_target ##scale by dispersion while returning the value - return observed_target, cov_target * dispersion, regress_target_score * dispersion, alternatives + regress_target_score[:,features] = cov_target + return observed_target, cov_target * dispersion, regress_target_score, dispersion, alternatives def full_targets(loglike, W, @@ -774,7 +774,7 @@ def full_targets(loglike, alternatives = ['twosided'] * features.sum() regress_target_score = Qfull_inv[features] # weights missing? 
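# Aside (not part of the patch): the main() drivers in this series grow
# master_DF with DataFrame.append inside the loop and reuse DF across
# replicates.  append is deprecated in recent pandas, and reusing DF can carry
# rows over from a longer previous replicate (they surface as NaN).  A sketch
# of an equivalent accumulation that avoids both, assuming the same
# test_selected_instance driver as above:
import pandas as pd

nsim, (n, p, s) = 50, (500, 100, 5)
frames = []
for i in range(nsim):
    mle, lower_conf, upper_conf = test_selected_instance(seedn=i, n=n, p=p, s=s,
                                                         signal_fac=1.2,
                                                         full_dispersion=True)
    frames.append(pd.DataFrame({"Experiment Replicate": i,
                                "MLE": mle,
                                "Lower Conf": lower_conf,
                                "Upper Conf": upper_conf}))
master_DF = pd.concat(frames, ignore_index=True)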
- return observed_target, cov_target * dispersion, regress_target_score, alternatives + return observed_target, cov_target * dispersion, regress_target_score, dispersion, alternatives def debiased_targets(loglike, W, @@ -829,7 +829,7 @@ def debiased_targets(loglike, (n - features.sum())) alternatives = ['twosided'] * features.sum() - return observed_target, cov_target * dispersion, Qinv_hat, alternatives + return observed_target, cov_target * dispersion, Qinv_hat, dispersion, alternatives def form_targets(target, loglike, diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index 43306aa1c..e17be21a8 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -106,24 +106,22 @@ def _setup_sampler(self, linear_part, offset, opt_linear, - observed_subgrad, - # optional dispersion parameter - # for covariance of randomization - dispersion=1): + observed_subgrad): A, b = linear_part, offset if not np.all(A.dot(self.observed_opt_state) - b <= 0): raise ValueError('constraints not satisfied') + cov_rand, prec = self.randomizer.cov_prec + (cond_mean, cond_cov, cond_precision, regress_opt, - M1, - M2, - M3) = self._setup_implied_gaussian(opt_linear, - observed_subgrad, - dispersion) + prod_score_prec_unnorm) = self._setup_implied_gaussian(opt_linear, + observed_subgrad, + cov_rand, + prec) def log_density(regress_opt, u, cond_prec, opt, score): # u == subgrad if score.ndim == 1: @@ -149,31 +147,25 @@ def log_density(regress_opt, u, cond_prec, opt, score): # u == subgrad self.observed_opt_state, self.observed_score_state, log_density, - regress_opt, # not needed? + regress_opt, observed_subgrad, opt_linear, # L - M1, - M2, - M3, + prod_score_prec_unnorm, + cov_rand, selection_info=self.selection_variable, useC=self.useC) def _setup_implied_gaussian(self, opt_linear, observed_subgrad, - # optional dispersion parameter - # for covariance of randomization - dispersion=1): + cov_rand, + prec): - cov_rand, prec = self.randomizer.cov_prec - prec = prec if np.asarray(prec).shape in [(), (0,)]: - _prod_score_prec_unnorm = self._hessian * prec + prod_score_prec_unnorm = self._hessian * prec else: - _prod_score_prec_unnorm = self._hessian.dot(prec) - - prod_score_prec = _prod_score_prec_unnorm * dispersion + prod_score_prec_unnorm = self._hessian.dot(prec) if np.asarray(prec).shape in [(), (0,)]: cond_precision = opt_linear.T.dot(opt_linear) * prec @@ -187,23 +179,18 @@ def _setup_implied_gaussian(self, # regress_opt is regression coefficient of opt onto score + u... 
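# Reading aid (not part of the patch): the three matrices that this series of
# patches moves between _setup_implied_gaussian and selective_MLE.  With a
# Gaussian randomizer of covariance cov_rand and precision prec, and cond_cov
# the conditional covariance of the optimization variables,
#
#   M1 = hessian.dot(prec)                      # score times randomizer precision
#   M2 = M1.dot(cov_rand).dot(M1.T)             # contribution of the randomization
#   M3 = M1.dot(opt_linear.dot(cond_cov).dot(opt_linear.T)).dot(M1.T)
#                                               # contribution of the optimization variables
#
# These are the quadratic forms reused below by selective_MLE, the posterior
# code and the exact reference.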
cond_mean = regress_opt.dot(self.observed_score_state + observed_subgrad) - - M1 = prod_score_prec - M2 = M1.dot(cov_rand).dot(M1.T) - M3 = M1.dot(opt_linear.dot(cond_cov).dot(opt_linear.T)).dot(M1.T) return (cond_mean, cond_cov, cond_precision, regress_opt, - M1, - M2, - M3) + prod_score_prec_unnorm) def summary(self, observed_target, cov_target, regress_target_score, + dispersion, alternatives, opt_sample=None, target_sample=None, @@ -303,6 +290,7 @@ def selective_MLE(self, observed_target, cov_target, regress_target_score, + dispersion, level=0.9, solve_args={'tol': 1.e-12}): """ @@ -323,6 +311,7 @@ def selective_MLE(self, return self.sampler.selective_MLE(observed_target, cov_target, regress_target_score, + dispersion, self.observed_opt_state, level=level, solve_args=solve_args) @@ -858,10 +847,9 @@ def __init__(self, log_cond_density, regress_opt, observed_subgrad, - opt_linear, - M1, - M2, - M3, + opt_linear, + prod_score_prec_unnorm, + cov_rand, selection_info=None, useC=False): @@ -905,7 +893,9 @@ def __init__(self, self.observed_subgrad = observed_subgrad self.useC = useC self.opt_linear = opt_linear - self.M1, self.M2, self.M3 = M1, M2, M3 + + self.prod_score_prec_unnorm = prod_score_prec_unnorm + self.cov_rand = cov_rand def log_cond_density(self, opt_sample, @@ -954,6 +944,7 @@ def selective_MLE(self, observed_target, cov_target, regress_target_score, + dispersion, # initial (observed) value of optimization variables -- # used as a feasible point. # precise value used only for independent estimator @@ -979,6 +970,11 @@ def selective_MLE(self, Arguments passed to solver. """ + prod_score_prec = self.prod_score_prec_unnorm * dispersion + M1 = prod_score_prec + M2 = M1.dot(self.cov_rand).dot(M1.T) + M3 = M1.dot(self.opt_linear.dot(self.covariance).dot(self.opt_linear.T)).dot(M1.T) + return selective_MLE(observed_target, cov_target, regress_target_score, @@ -988,9 +984,9 @@ def selective_MLE(self, self.affine_con.linear_part, self.affine_con.offset, self.opt_linear, - self.M1, - self.M2, - self.M3, + M1, + M2, + M3, self.observed_score_state + self.observed_subgrad, solve_args=solve_args, level=level, @@ -1410,7 +1406,6 @@ def selective_MLE(observed_target, T9 = (-T8.dot(observed_target) + M1.dot(opt_linear.dot(cond_mean))) ##flipped sign of first term here T10 = T1.T.dot(T9) C = cov_target.dot(_P - T10) ##added missing _P in computing C - conjugate_arg = prec_opt.dot(cond_mean) if useC: diff --git a/selectinf/randomized/tests/test_selective_MLE_high.py b/selectinf/randomized/tests/test_selective_MLE_high.py index b53340b9c..cd8cc81ab 100644 --- a/selectinf/randomized/tests/test_selective_MLE_high.py +++ b/selectinf/randomized/tests/test_selective_MLE_high.py @@ -315,15 +315,16 @@ def test_selected_targets_disperse(n=500, # print("coverage and lengths ", np.mean(cover), np.mean(avg_length)) -def test_selected_instance(seedn, - n=2000, - p=200, - signal_fac=1.2, - s=5, - sigma=2, - rho=0.7, - randomizer_scale=1., - full_dispersion=True): +def test_mle_inference(seedn, + n=2000, + p=200, + signal_fac=1.2, + s=5, + sigma=2, + rho=0.7, + randomizer_scale=1., + full_dispersion=True, + full=False): """ Compare to R randomized lasso """ @@ -366,17 +367,30 @@ def test_selected_instance(seedn, if full_dispersion: dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) - (observed_target, - cov_target, - cov_target_score, - alternatives) = selected_targets(conv.loglike, + if full: + (observed_target, + cov_target, + cov_target_score, + dispersion, + alternatives) = 
full_targets(conv.loglike, conv._W, nonzero, dispersion=dispersion) + else: + (observed_target, + cov_target, + cov_target_score, + dispersion, + alternatives) = selected_targets(conv.loglike, + conv._W, + nonzero, + dispersion=dispersion) + result = conv.selective_MLE(observed_target, cov_target, - cov_target_score)[0] + cov_target_score, + dispersion)[0] return result['MLE'], result['lower_confidence'], result['upper_confidence'] @@ -390,7 +404,14 @@ def main(nsim =50): n, p, s = 500, 100, 5 for i in range(nsim): full_dispersion = True - mle, lower_conf, upper_conf = test_selected_instance(seedn=i, n=n, p=p, s=s, signal_fac=1.2, full_dispersion=full_dispersion) + mle, lower_conf, upper_conf = test_mle_inference(seedn=i, + n=n, + p=p, + s=s, + signal_fac=1.2, + full_dispersion=full_dispersion, + full=True) + DF["MLE"] = pd.Series(mle) DF["Lower Conf"] = pd.Series(lower_conf) DF["Upper Conf"] = pd.Series(upper_conf) From 071a143d5f5d6a2dfa678768b3e28a123b7f7cae Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Sun, 25 Jul 2021 16:26:55 -0400 Subject: [PATCH 118/187] commit before switch --- selectinf/randomized/tests/test_selective_MLE_high.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/selectinf/randomized/tests/test_selective_MLE_high.py b/selectinf/randomized/tests/test_selective_MLE_high.py index cd8cc81ab..5589212d8 100644 --- a/selectinf/randomized/tests/test_selective_MLE_high.py +++ b/selectinf/randomized/tests/test_selective_MLE_high.py @@ -355,7 +355,7 @@ def test_mle_inference(seedn, conv = const(X, Y, W, - ridge_term=0., + #ridge_term=0., randomizer_scale=randomizer_scale * sigma_) signs = conv.fit() @@ -410,7 +410,7 @@ def main(nsim =50): s=s, signal_fac=1.2, full_dispersion=full_dispersion, - full=True) + full=False) DF["MLE"] = pd.Series(mle) DF["Lower Conf"] = pd.Series(lower_conf) From 66a388e8735a4591c76812d0d77e7a07717d790a Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Sun, 25 Jul 2021 16:45:29 -0400 Subject: [PATCH 119/187] commit before switch --- selectinf/randomized/tests/test_selective_MLE_high.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/selectinf/randomized/tests/test_selective_MLE_high.py b/selectinf/randomized/tests/test_selective_MLE_high.py index 5589212d8..e60e7feb7 100644 --- a/selectinf/randomized/tests/test_selective_MLE_high.py +++ b/selectinf/randomized/tests/test_selective_MLE_high.py @@ -345,6 +345,7 @@ def test_mle_inference(seedn, idx = np.arange(p) sigmaX = rho ** np.abs(np.subtract.outer(idx, idx)) + snr = beta.T.dot(sigmaX).dot(beta) / ((sigma ** 2.) * n) print("snr", beta.T.dot(sigmaX).dot(beta) / ((sigma ** 2.) 
* n)) n, p = X.shape @@ -392,19 +393,19 @@ def test_mle_inference(seedn, cov_target_score, dispersion)[0] - return result['MLE'], result['lower_confidence'], result['upper_confidence'] + return result['MLE'], result['lower_confidence'], result['upper_confidence'], snr def main(nsim =50): import pandas as pd - column_names = ["Experiment Replicate", "MLE", "Lower Conf", "Upper Conf"] + column_names = ["Experiment Replicate", "MLE", "Lower Conf", "Upper Conf", "SNR"] master_DF = pd.DataFrame(columns=column_names) DF = pd.DataFrame(columns=column_names) n, p, s = 500, 100, 5 for i in range(nsim): full_dispersion = True - mle, lower_conf, upper_conf = test_mle_inference(seedn=i, + mle, lower_conf, upper_conf, snr = test_mle_inference(seedn=i, n=n, p=p, s=s, @@ -416,6 +417,7 @@ def main(nsim =50): DF["Lower Conf"] = pd.Series(lower_conf) DF["Upper Conf"] = pd.Series(upper_conf) DF["Experiment Replicate"] = pd.Series((i*np.ones(len(mle),int)).tolist()) + DF["SNR"] = pd.Series((snr * np.ones(len(mle))).tolist()) master_DF = DF.append(master_DF, ignore_index=True) From 197e927202ebb72174238930793f108f545d59fc Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Mon, 26 Jul 2021 11:45:51 -0400 Subject: [PATCH 120/187] update posterior inf --- selectinf/randomized/posterior_inference.py | 73 +++++++++++--------- selectinf/randomized/tests/test_posterior.py | 7 +- 2 files changed, 45 insertions(+), 35 deletions(-) diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index 44f981561..96bb86796 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -31,39 +31,50 @@ def __init__(self, query, observed_target, cov_target, - cov_target_score, + regress_target_score, + dispersion, prior, - dispersion=1, solve_args={'tol': 1.e-12}): self.solve_args = solve_args linear_part = query.sampler.affine_con.linear_part offset = query.sampler.affine_con.offset - regress_opt = query.sampler.logdens_transform[0] - _, prec_randomizer = query.randomizer.cov_prec - score_offset = query.observed_score_state + query.sampler.logdens_transform[1] + opt_linear = query.opt_linear + + observed_score = query.observed_score_state + query.observed_subgrad + + print("dispersion ", dispersion) result, self.inverse_info, log_ref = query.selective_MLE(observed_target, cov_target, - cov_target_score) + regress_target_score, + dispersion) ### Note for an informative prior we might want to change this... 
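# Illustrative sketch (not part of the patch): the `prior` argument stored by
# this class is any callable returning (log density, gradient) at the candidate
# target value.  For example, a mean-zero Gaussian prior -- `prior_var` is a
# placeholder name, not an argument of the class -- could be written as
import numpy as np

def gaussian_prior(target_parameter, prior_var=100.):
    grad_prior = -target_parameter / prior_var
    log_prior = -np.linalg.norm(target_parameter) ** 2 / (2. * prior_var)
    return log_prior, grad_prior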
- self.ntarget = cov_target.shape[0] - self.nopt = query.cond_cov.shape[0] - self.cond_precision = np.linalg.inv(query.cond_cov) self.cov_target = cov_target self.prec_target = np.linalg.inv(cov_target) + self.ntarget = self.cov_target.shape[0] + self.nopt = self.cond_precision.shape[0] + self.observed_target = observed_target - self.cov_target_score = cov_target_score - self.regress_opt = regress_opt - self.prec_randomizer = prec_randomizer - self.score_offset = score_offset + self.regress_target_score = regress_target_score + self.opt_linear = opt_linear + self.observed_score = observed_score + + prod_score_prec = query.prod_score_prec_unnorm * dispersion + M1 = prod_score_prec + M2 = M1.dot(query.cov_rand).dot(M1.T) + M3 = M1.dot(self.opt_linear.dot(query.cond_cov).dot(self.opt_linear.T)).dot(M1.T) + + self.M1 = M1 + self.M2 = M2 + self.M3 = M3 - self.feasible_point = query.observed_opt_state + self.feasible_point = query.initial_point self.cond_mean = query.cond_mean self.linear_part = linear_part self.offset = offset @@ -131,29 +142,27 @@ def _set_marginal_parameters(self): implied mean as a function of the true parameters. """ - regress_score_target = self.cov_target_score.T.dot(self.prec_target) - resid_score_target = self.score_offset - regress_score_target.dot(self.observed_target) + T1 = self.regress_target_score.T.dot(self.prec_target) + T2 = T1.T.dot(self.M2.dot(T1)) + T3 = T1.T.dot(self.M3.dot(T1)) - regress_opt_target = self.regress_opt.dot(regress_score_target) - resid_mean_opt_target = self.cond_mean - regress_opt_target.dot(self.observed_target) + prec_target_nosel = self.prec_target + T2 - T3 + _P = -(T1.T.dot(self.M1.dot(self.observed_score)) + T2.dot(self.observed_target)) - self.linear_coef = regress_opt_target - self.offset_coef = resid_mean_opt_target + _Q = np.linalg.inv(prec_target_nosel + T3) - if np.asarray(self.prec_randomizer).shape in [(), (0,)]: - prec_target_nosel = self.prec_target + (regress_score_target.T.dot(regress_score_target) * self.prec_randomizer) \ - - regress_opt_target.T.dot(self.cond_precision).dot(regress_opt_target) - _P = regress_score_target.T.dot(resid_score_target) * self.prec_randomizer - else: - prec_target_nosel = self.prec_target + (regress_score_target.T.dot(self.prec_randomizer).dot(regress_score_target)) \ - - regress_opt_target.T.dot(self.cond_precision).dot(regress_opt_target) - _P = regress_score_target.T.dot(self.prec_randomizer).dot(resid_score_target) + T4 = self.M1.T.dot(T1) + T5 = self.opt_linear.T.dot(T4) + T6 = self.cond_cov.dot(T5) + T7 = self.opt_linear.dot(T6) + T8 = self.M1.dot(T7) + T9 = (-T8.dot(self.observed_target) + self.M1.dot(self.opt_linear.dot(self.cond_mean))) ##flipped sign of first term here + T10 = T1.T.dot(T9) - _Q = np.linalg.inv(_prec + regress_opt_target.T.dot(self.cond_precision).dot(regress_opt_target)) - self.prec_marginal = self.cond_precision - self.cond_precision.dot(regress_opt_target).dot(_Q).dot(regress_opt_target.T).dot(self.cond_precision) + self.prec_marginal = self.cond_precision - T5.dot(_Q).dot(T5) - r = np.linalg.inv(_prec).dot(regress_opt_target.T.dot(self.cond_precision).dot(resid_mean_opt_target) - _P) - S = np.linalg.inv(_prec).dot(self.prec_target) + r = np.linalg.inv(prec_target_nosel).dot(T10 - _P) + S = np.linalg.inv(prec_target_nosel).dot(self.prec_target) self.r = r self.S = S diff --git a/selectinf/randomized/tests/test_posterior.py b/selectinf/randomized/tests/test_posterior.py index 1b369c351..fd3e3a803 100644 --- a/selectinf/randomized/tests/test_posterior.py +++ 
b/selectinf/randomized/tests/test_posterior.py @@ -49,7 +49,8 @@ def test_Langevin(n=500, (observed_target, cov_target, - cov_target_score, + regress_target_score, + dispersion, alternatives) = selected_targets(conv.loglike, conv._W, nonzero, @@ -57,7 +58,7 @@ def test_Langevin(n=500, posterior_inf = conv.posterior(observed_target, cov_target, - cov_target_score, + regress_target_score, dispersion=dispersion) samples = langevin_sampler(posterior_inf, @@ -362,6 +363,6 @@ def test_hiv_data(nsample=10000, if __name__ == "__main__": # test_hiv_data(split_proportion=0.50) - test_coverage(nsim=100) + test_coverage(nsim=1) From 275c953b314c35f395ac4adc84e9e7ccf0c4da9c Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Mon, 26 Jul 2021 11:46:44 -0400 Subject: [PATCH 121/187] scaled M1, M2, M3 with dispersion --- selectinf/randomized/query.py | 74 ++++++++++--------- .../tests/test_selective_MLE_high.py | 2 +- 2 files changed, 42 insertions(+), 34 deletions(-) diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index e17be21a8..d1c0b1077 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -106,24 +106,25 @@ def _setup_sampler(self, linear_part, offset, opt_linear, - observed_subgrad): + observed_subgrad, + # optional dispersion parameter + # for covariance of randomization + dispersion=1): A, b = linear_part, offset if not np.all(A.dot(self.observed_opt_state) - b <= 0): raise ValueError('constraints not satisfied') - cov_rand, prec = self.randomizer.cov_prec - (cond_mean, cond_cov, cond_precision, regress_opt, - prod_score_prec_unnorm) = self._setup_implied_gaussian(opt_linear, - observed_subgrad, - cov_rand, - prec) + M1, + M2, + M3) = self._setup_implied_gaussian(opt_linear, + observed_subgrad) - def log_density(regress_opt, u, cond_prec, opt, score): # u == subgrad + def log_density(regress_opt, u, cond_prec, opt, score): # u == subgrad if score.ndim == 1: mean_term = regress_opt.dot(score.T + u).T else: @@ -147,26 +148,29 @@ def log_density(regress_opt, u, cond_prec, opt, score): # u == subgrad self.observed_opt_state, self.observed_score_state, log_density, - regress_opt, + regress_opt, # not needed? observed_subgrad, - opt_linear, # L - prod_score_prec_unnorm, - cov_rand, + opt_linear, # L + M1, + M2, + M3, selection_info=self.selection_variable, useC=self.useC) def _setup_implied_gaussian(self, opt_linear, - observed_subgrad, - cov_rand, - prec): + observed_subgrad): + cov_rand, prec = self.randomizer.cov_prec + prec = prec # why do we do this here -- prec is just known if np.asarray(prec).shape in [(), (0,)]: prod_score_prec_unnorm = self._hessian * prec else: prod_score_prec_unnorm = self._hessian.dot(prec) - + + prod_score_prec_unnorm + if np.asarray(prec).shape in [(), (0,)]: cond_precision = opt_linear.T.dot(opt_linear) * prec cond_cov = np.linalg.inv(cond_precision) @@ -175,16 +179,22 @@ def _setup_implied_gaussian(self, cond_precision = opt_linear.T.dot(prec.dot(opt_linear)) cond_cov = np.linalg.inv(cond_precision) regress_opt = -cond_cov.dot(opt_linear.T).dot(prec) - + # regress_opt is regression coefficient of opt onto score + u... 
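# Usage sketch pieced together from the test diff above (no new API): with
# `conv` a fitted randomized lasso and `nonzero` its selected set, the target
# helpers now also return a dispersion estimate, which is passed through to
# `posterior` and then sampled with the Langevin sampler.
(observed_target,
 cov_target,
 regress_target_score,
 dispersion,
 alternatives) = selected_targets(conv.loglike,
                                  conv._W,
                                  nonzero,
                                  dispersion=None)

posterior_inf = conv.posterior(observed_target,
                               cov_target,
                               regress_target_score,
                               dispersion=dispersion)

samples = langevin_sampler(posterior_inf,
                           nsample=1500,
                           nburnin=100)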
cond_mean = regress_opt.dot(self.observed_score_state + observed_subgrad) - + + M1 = prod_score_prec_unnorm + M2 = M1.dot(cov_rand).dot(M1.T) + M3 = M1.dot(opt_linear.dot(cond_cov).dot(opt_linear.T)).dot(M1.T) + return (cond_mean, cond_cov, cond_precision, regress_opt, - prod_score_prec_unnorm) + M1, + M2, + M3) def summary(self, observed_target, @@ -320,8 +330,8 @@ def posterior(self, observed_target, cov_target, regress_target_score, + dispersion, prior=None, - dispersion=None, solve_args={'tol': 1.e-12}): """ Parameters @@ -358,8 +368,8 @@ def prior(target_parameter): observed_target, cov_target, regress_target_score, - prior, dispersion, + prior, solve_args=solve_args) def approximate_grid_inference(self, @@ -848,8 +858,9 @@ def __init__(self, regress_opt, observed_subgrad, opt_linear, - prod_score_prec_unnorm, - cov_rand, + M1, + M2, + M3, selection_info=None, useC=False): @@ -893,9 +904,7 @@ def __init__(self, self.observed_subgrad = observed_subgrad self.useC = useC self.opt_linear = opt_linear - - self.prod_score_prec_unnorm = prod_score_prec_unnorm - self.cov_rand = cov_rand + self.M1, self.M2, self.M3 = M1, M2, M3 def log_cond_density(self, opt_sample, @@ -970,10 +979,9 @@ def selective_MLE(self, Arguments passed to solver. """ - prod_score_prec = self.prod_score_prec_unnorm * dispersion - M1 = prod_score_prec - M2 = M1.dot(self.cov_rand).dot(M1.T) - M3 = M1.dot(self.opt_linear.dot(self.covariance).dot(self.opt_linear.T)).dot(M1.T) + self.M1 = self.M1 * dispersion + self.M2 = self.M2 * (dispersion**2) + self.M3 = self.M3 * (dispersion**2) return selective_MLE(observed_target, cov_target, @@ -984,9 +992,9 @@ def selective_MLE(self, self.affine_con.linear_part, self.affine_con.offset, self.opt_linear, - M1, - M2, - M3, + self.M1, + self.M2, + self.M3, self.observed_score_state + self.observed_subgrad, solve_args=solve_args, level=level, diff --git a/selectinf/randomized/tests/test_selective_MLE_high.py b/selectinf/randomized/tests/test_selective_MLE_high.py index e60e7feb7..5753ba668 100644 --- a/selectinf/randomized/tests/test_selective_MLE_high.py +++ b/selectinf/randomized/tests/test_selective_MLE_high.py @@ -411,7 +411,7 @@ def main(nsim =50): s=s, signal_fac=1.2, full_dispersion=full_dispersion, - full=False) + full=True) DF["MLE"] = pd.Series(mle) DF["Lower Conf"] = pd.Series(lower_conf) From 56ca78f60dfb969e559ebc8b5ce13f88035683c8 Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Mon, 26 Jul 2021 13:32:17 -0400 Subject: [PATCH 122/187] created all necessary objects --- selectinf/randomized/posterior_inference.py | 24 +++++++++----------- selectinf/randomized/query.py | 4 ++++ selectinf/randomized/tests/test_posterior.py | 4 ++-- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index 96bb86796..06879fd51 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -45,7 +45,6 @@ def __init__(self, observed_score = query.observed_score_state + query.observed_subgrad - print("dispersion ", dispersion) result, self.inverse_info, log_ref = query.selective_MLE(observed_target, cov_target, regress_target_score, @@ -53,7 +52,9 @@ def __init__(self, ### Note for an informative prior we might want to change this... 
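# Aside (not part of the patch): M2 and M3 pick up dispersion**2 because both
# are quadratic in M1.  The in-place rescaling in selective_MLE above
# (self.M1 = self.M1 * dispersion, ...) would compound if the method were
# called twice on the same sampler; a non-mutating sketch uses local names:
M1 = self.M1 * dispersion
M2 = self.M2 * dispersion ** 2
M3 = self.M3 * dispersion ** 2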
- self.cond_precision = np.linalg.inv(query.cond_cov) + cond_cov = query.cond_cov + self.cond_precision = np.linalg.inv(cond_cov) + self.cond_cov = cond_cov self.cov_target = cov_target self.prec_target = np.linalg.inv(cov_target) @@ -65,16 +66,11 @@ def __init__(self, self.opt_linear = opt_linear self.observed_score = observed_score - prod_score_prec = query.prod_score_prec_unnorm * dispersion - M1 = prod_score_prec - M2 = M1.dot(query.cov_rand).dot(M1.T) - M3 = M1.dot(self.opt_linear.dot(query.cond_cov).dot(self.opt_linear.T)).dot(M1.T) + self.M1 = query.M1 * dispersion + self.M2 = query.M2 * (dispersion ** 2) + self.M3 = query.M3 * (dispersion ** 2) + self.feasible_point = query.observed_opt_state - self.M1 = M1 - self.M2 = M2 - self.M3 = M3 - - self.feasible_point = query.initial_point self.cond_mean = query.cond_mean self.linear_part = linear_part self.offset = offset @@ -159,14 +155,16 @@ def _set_marginal_parameters(self): T9 = (-T8.dot(self.observed_target) + self.M1.dot(self.opt_linear.dot(self.cond_mean))) ##flipped sign of first term here T10 = T1.T.dot(T9) - self.prec_marginal = self.cond_precision - T5.dot(_Q).dot(T5) + self.prec_marginal = self.cond_precision - T5.dot(_Q).dot(T5.T) + self.linear_coef = self.cond_cov.dot(T5) + self.offset_coef = self.cond_mean - self.linear_coef.dot(self.observed_target) r = np.linalg.inv(prec_target_nosel).dot(T10 - _P) S = np.linalg.inv(prec_target_nosel).dot(self.prec_target) self.r = r self.S = S - #print("check parameters for selected+lasso ", np.allclose(np.diag(S), np.ones(S.shape[0])), np.allclose(r, np.zeros(r.shape[0]))) + print("check parameters for selected+lasso ", np.allclose(np.diag(S), np.ones(S.shape[0])), np.allclose(r, np.zeros(r.shape[0]))) self.prec_target_nosel = prec_target_nosel diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index d1c0b1077..557cd20e4 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -188,6 +188,10 @@ def _setup_implied_gaussian(self, M2 = M1.dot(cov_rand).dot(M1.T) M3 = M1.dot(opt_linear.dot(cond_cov).dot(opt_linear.T)).dot(M1.T) + self.M1 = M1 + self.M2 = M2 + self.M3 = M3 + return (cond_mean, cond_cov, cond_precision, diff --git a/selectinf/randomized/tests/test_posterior.py b/selectinf/randomized/tests/test_posterior.py index fd3e3a803..319df1b83 100644 --- a/selectinf/randomized/tests/test_posterior.py +++ b/selectinf/randomized/tests/test_posterior.py @@ -83,7 +83,7 @@ def test_coverage(nsim=100): for i in range(nsim): cov_, len_ = test_Langevin(n=500, p=100, - signal_fac=1., + signal_fac=0.5, s=5, sigma=3., rho=0.2, @@ -363,6 +363,6 @@ def test_hiv_data(nsample=10000, if __name__ == "__main__": # test_hiv_data(split_proportion=0.50) - test_coverage(nsim=1) + test_coverage(nsim=10) From f8260984d981658de3872c89b39aa4772ae1584c Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Mon, 26 Jul 2021 23:06:19 -0400 Subject: [PATCH 123/187] cleaned up some more --- selectinf/randomized/posterior_inference.py | 30 +++++++++----------- selectinf/randomized/query.py | 24 ++++++---------- selectinf/randomized/tests/test_posterior.py | 3 +- 3 files changed, 24 insertions(+), 33 deletions(-) diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index 06879fd51..1c83153cb 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -116,12 +116,11 @@ def log_posterior(self, log_normalizer = -val - mean_marginal.T.dot(prec_marginal).dot(mean_marginal) / 
2. - _prec = self.prec_target_nosel # shorthand - log_lik = -((self.observed_target - target).T.dot(_prec).dot(self.observed_target - target)) / 2. \ + log_lik = -((self.observed_target - target).T.dot(self.prec_target_nosel).dot(self.observed_target - target)) / 2. \ - log_normalizer - grad_lik = self.S.T.dot(_prec.dot(self.observed_target) - _prec.dot(target) - self.linear_coef.T.dot( - prec_marginal.dot(soln) - conjugate_marginal)) + grad_lik = self.S.T.dot(self.prec_target_nosel.dot(self.observed_target) - self.prec_target_nosel.dot(target) + - self.linear_coef.T.dot(prec_marginal.dot(soln) - conjugate_marginal)) log_prior, grad_prior = self.prior(target_parameter) @@ -141,30 +140,27 @@ def _set_marginal_parameters(self): T1 = self.regress_target_score.T.dot(self.prec_target) T2 = T1.T.dot(self.M2.dot(T1)) T3 = T1.T.dot(self.M3.dot(T1)) + T4 = self.M1.dot(self.opt_linear).dot(self.cond_cov).dot(self.opt_linear.T.dot(self.M1.T.dot(T1))) + T5 = T1.T.dot(self.M1.dot(self.opt_linear)) prec_target_nosel = self.prec_target + T2 - T3 + _P = -(T1.T.dot(self.M1.dot(self.observed_score)) + T2.dot(self.observed_target)) - _Q = np.linalg.inv(prec_target_nosel + T3) + bias_target = self.cov_target.dot(T1.T.dot(-T4.dot(self.observed_target) + self.M1.dot(self.opt_linear.dot(self.cond_mean))) - _P) - T4 = self.M1.T.dot(T1) - T5 = self.opt_linear.T.dot(T4) - T6 = self.cond_cov.dot(T5) - T7 = self.opt_linear.dot(T6) - T8 = self.M1.dot(T7) - T9 = (-T8.dot(self.observed_target) + self.M1.dot(self.opt_linear.dot(self.cond_mean))) ##flipped sign of first term here - T10 = T1.T.dot(T9) + _Q = np.linalg.inv(prec_target_nosel + T3) - self.prec_marginal = self.cond_precision - T5.dot(_Q).dot(T5.T) - self.linear_coef = self.cond_cov.dot(T5) + self.prec_marginal = self.cond_precision - T5.T.dot(_Q).dot(T5) + self.linear_coef = self.cond_cov.dot(T5.T) self.offset_coef = self.cond_mean - self.linear_coef.dot(self.observed_target) - r = np.linalg.inv(prec_target_nosel).dot(T10 - _P) + r = np.linalg.inv(prec_target_nosel).dot(self.prec_target.dot(bias_target)) S = np.linalg.inv(prec_target_nosel).dot(self.prec_target) self.r = r self.S = S - print("check parameters for selected+lasso ", np.allclose(np.diag(S), np.ones(S.shape[0])), np.allclose(r, np.zeros(r.shape[0]))) + #print("check parameters for selected+lasso ", np.allclose(np.diag(S), np.ones(S.shape[0])), np.allclose(r, np.zeros(r.shape[0]))) self.prec_target_nosel = prec_target_nosel @@ -192,7 +188,7 @@ def langevin_sampler(selective_posterior, for i, sample in enumerate(sampler): sampler.scaling = np.sqrt(selective_posterior.dispersion) samples[i, :] = sample.copy() - #print("sample ", i, samples[i,:]) + print("sample ", i, samples[i,:]) if i == nsample - 1: break diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index 557cd20e4..6c852f8a5 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -1361,6 +1361,7 @@ def selective_MLE(observed_target, solve_args={'tol': 1.e-12}, level=0.9, useC=False): + """ Selective MLE based on approximation of CGF. 
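# Reading aid for the hunk below (not part of the patch).  With
#   T1 = regress_target_score.T @ prec_target,
#   T2 = T1.T @ M2 @ T1,   T3 = T1.T @ M3 @ T1,   T5 = T1.T @ M1 @ opt_linear,
# the cleaned-up selective_MLE computes
#   prec_target_nosel = prec_target + T2 - T3
#   MLE  = cov_target @ prec_target_nosel @ observed_target
#          + regress_target_score @ M1 @ opt_linear @ (cond_mean - soln) - bias_target
#   info = cov_target @ (prec_target_nosel + T3 - T5 @ hess @ T5.T) @ cov_target
# where soln and hess come from the affine-barrier solve over the optimization
# variables.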
@@ -1406,18 +1407,15 @@ def selective_MLE(observed_target, T1 = regress_target_score.T.dot(prec_target) T2 = T1.T.dot(M2.dot(T1)) T3 = T1.T.dot(M3.dot(T1)) + T4 = M1.dot(opt_linear).dot(cond_cov).dot(opt_linear.T.dot(M1.T.dot(T1))) + T5 = T1.T.dot(M1.dot(opt_linear)) prec_target_nosel = prec_target + T2 - T3 + _P = -(T1.T.dot(M1.dot(observed_score)) + T2.dot(observed_target)) ##flipped sign of second term here - T4 = M1.T.dot(T1) - T5 = opt_linear.T.dot(T4) - T6 = cond_cov.dot(T5) - T7 = opt_linear.dot(T6) - T8 = M1.dot(T7) - T9 = (-T8.dot(observed_target) + M1.dot(opt_linear.dot(cond_mean))) ##flipped sign of first term here - T10 = T1.T.dot(T9) - C = cov_target.dot(_P - T10) ##added missing _P in computing C + bias_target = cov_target.dot(T1.T.dot(-T4.dot(observed_target) + M1.dot(opt_linear.dot(cond_mean))) - _P) + conjugate_arg = prec_opt.dot(cond_mean) if useC: @@ -1432,16 +1430,12 @@ def selective_MLE(observed_target, offset, **solve_args) - T11 = regress_target_score.dot(M1.dot(opt_linear)) final_estimator = cov_target.dot(prec_target_nosel).dot(observed_target) \ - + T11.dot(cond_mean - soln) + C + + regress_target_score.dot(M1.dot(opt_linear)).dot(cond_mean - soln) - bias_target - T12 = prec_target.dot(T11) - T13 = T3 - unbiased_estimator = cov_target.dot(prec_target_nosel).dot(observed_target) + cov_target.dot( - _P - T12.dot(cond_mean) + T13.dot(observed_target)) + observed_info_natural = prec_target_nosel + T3 - T5.dot(hess.dot(T5.T)) - observed_info_natural = prec_target_nosel + T3 - T12.dot(hess.dot(T12.T)) + unbiased_estimator = cov_target.dot(prec_target_nosel).dot(observed_target) - bias_target observed_info_mean = cov_target.dot(observed_info_natural.dot(cov_target)) diff --git a/selectinf/randomized/tests/test_posterior.py b/selectinf/randomized/tests/test_posterior.py index 319df1b83..534c208b3 100644 --- a/selectinf/randomized/tests/test_posterior.py +++ b/selectinf/randomized/tests/test_posterior.py @@ -78,6 +78,7 @@ def test_Langevin(n=500, def test_coverage(nsim=100): + np.random.seed(0) cov, len = 0., 0. 
for i in range(nsim): @@ -363,6 +364,6 @@ def test_hiv_data(nsample=10000, if __name__ == "__main__": # test_hiv_data(split_proportion=0.50) - test_coverage(nsim=10) + test_coverage(nsim=1) From 2edd860b9c3b316aba488ded2068544d72616511 Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Tue, 27 Jul 2021 00:18:26 -0400 Subject: [PATCH 124/187] compare branches in progress --- selectinf/randomized/posterior_inference.py | 4 ++-- selectinf/randomized/tests/test_posterior.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index 1c83153cb..2fb5ea2ae 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -150,7 +150,6 @@ def _set_marginal_parameters(self): bias_target = self.cov_target.dot(T1.T.dot(-T4.dot(self.observed_target) + self.M1.dot(self.opt_linear.dot(self.cond_mean))) - _P) _Q = np.linalg.inv(prec_target_nosel + T3) - self.prec_marginal = self.cond_precision - T5.T.dot(_Q).dot(T5) self.linear_coef = self.cond_cov.dot(T5.T) self.offset_coef = self.cond_mean - self.linear_coef.dot(self.observed_target) @@ -163,6 +162,7 @@ def _set_marginal_parameters(self): #print("check parameters for selected+lasso ", np.allclose(np.diag(S), np.ones(S.shape[0])), np.allclose(r, np.zeros(r.shape[0]))) self.prec_target_nosel = prec_target_nosel + print("match parameters ", r, S, prec_target_nosel, self.prec_marginal, self.linear_coef, self.offset_coef) ### sampling methods @@ -188,7 +188,7 @@ def langevin_sampler(selective_posterior, for i, sample in enumerate(sampler): sampler.scaling = np.sqrt(selective_posterior.dispersion) samples[i, :] = sample.copy() - print("sample ", i, samples[i,:]) + #print("sample ", i, samples[i,:]) if i == nsample - 1: break diff --git a/selectinf/randomized/tests/test_posterior.py b/selectinf/randomized/tests/test_posterior.py index 534c208b3..e94994591 100644 --- a/selectinf/randomized/tests/test_posterior.py +++ b/selectinf/randomized/tests/test_posterior.py @@ -17,6 +17,8 @@ def test_Langevin(n=500, randomizer_scale=1., nsample=1500, nburnin=100): + + np.random.seed(0) inst, const = gaussian_instance, lasso.gaussian signal = np.sqrt(signal_fac * 2 * np.log(p)) @@ -78,7 +80,6 @@ def test_Langevin(n=500, def test_coverage(nsim=100): - np.random.seed(0) cov, len = 0., 0. 
for i in range(nsim): From ebbda3e8f175bbaa15a50a75195d56049d212355 Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Tue, 27 Jul 2021 07:46:38 -0400 Subject: [PATCH 125/187] compare branches in progress --- selectinf/randomized/posterior_inference.py | 6 ++++-- selectinf/randomized/tests/test_posterior.py | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index 2fb5ea2ae..80a0bc813 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -162,7 +162,8 @@ def _set_marginal_parameters(self): #print("check parameters for selected+lasso ", np.allclose(np.diag(S), np.ones(S.shape[0])), np.allclose(r, np.zeros(r.shape[0]))) self.prec_target_nosel = prec_target_nosel - print("match parameters ", r, S, prec_target_nosel, self.prec_marginal, self.linear_coef, self.offset_coef) + #print("match parameters ", r, S, prec_target_nosel, self.prec_marginal, self.linear_coef, self.offset_coef) + print("match parameters ", np.diag(self.prec_marginal), np.diag(self.linear_coef), self.offset_coef) ### sampling methods @@ -172,6 +173,7 @@ def langevin_sampler(selective_posterior, proposal_scale=None, step=1.): state = selective_posterior.initial_estimate + print("check INI ", state) stepsize = 1. / (step * selective_posterior.ntarget) if proposal_scale is None: @@ -188,7 +190,7 @@ def langevin_sampler(selective_posterior, for i, sample in enumerate(sampler): sampler.scaling = np.sqrt(selective_posterior.dispersion) samples[i, :] = sample.copy() - #print("sample ", i, samples[i,:]) + print("sample ", i, samples[i,:]) if i == nsample - 1: break diff --git a/selectinf/randomized/tests/test_posterior.py b/selectinf/randomized/tests/test_posterior.py index e94994591..49abbad38 100644 --- a/selectinf/randomized/tests/test_posterior.py +++ b/selectinf/randomized/tests/test_posterior.py @@ -90,8 +90,8 @@ def test_coverage(nsim=100): sigma=3., rho=0.2, randomizer_scale=1., - nsample=1500, - nburnin=100) + nsample=5, + nburnin=0) cov += cov_ len += len_ From 02d0d6db771b62dbfb299f7abe9e9e8c4d79c42c Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Tue, 27 Jul 2021 08:45:36 -0400 Subject: [PATCH 126/187] compare branches in progress --- selectinf/randomized/posterior_inference.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index 80a0bc813..bc01bf7c3 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -263,8 +263,10 @@ def next(self): def __next__(self): while True: self.posterior_ = self.gradient_map(self.state, self.scaling) + _proposal = self.proposal_sqrt.dot(self._noise.rvs(self._shape)) candidate = (self.state + self.stepsize * self.proposal_scale.dot(self.posterior_[1]) - + np.sqrt(2.) * (self.proposal_sqrt.dot(self._noise.rvs(self._shape))) * self._sqrt_step) + + np.sqrt(2.) 
* _proposal * self._sqrt_step) + print("check proposal ", _proposal, self.posterior_[1], np.diag(self.proposal_scale)) if not np.all(np.isfinite(self.gradient_map(candidate, self.scaling)[1])): self.stepsize *= 0.5 From d61fd1f1f6cb8926ff5577a2f09f0ff352def60a Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Tue, 27 Jul 2021 09:13:27 -0400 Subject: [PATCH 127/187] compare branches in progress --- selectinf/randomized/posterior_inference.py | 15 +++++++-------- selectinf/randomized/tests/test_posterior.py | 6 +++--- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index bc01bf7c3..45c3742fd 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -149,21 +149,20 @@ def _set_marginal_parameters(self): bias_target = self.cov_target.dot(T1.T.dot(-T4.dot(self.observed_target) + self.M1.dot(self.opt_linear.dot(self.cond_mean))) - _P) + ###set parameters for the marginal distribution of optimization variables _Q = np.linalg.inv(prec_target_nosel + T3) self.prec_marginal = self.cond_precision - T5.T.dot(_Q).dot(T5) self.linear_coef = self.cond_cov.dot(T5.T) self.offset_coef = self.cond_mean - self.linear_coef.dot(self.observed_target) + ###set parameters for the marginal distribution of target r = np.linalg.inv(prec_target_nosel).dot(self.prec_target.dot(bias_target)) S = np.linalg.inv(prec_target_nosel).dot(self.prec_target) self.r = r self.S = S - #print("check parameters for selected+lasso ", np.allclose(np.diag(S), np.ones(S.shape[0])), np.allclose(r, np.zeros(r.shape[0]))) self.prec_target_nosel = prec_target_nosel - - #print("match parameters ", r, S, prec_target_nosel, self.prec_marginal, self.linear_coef, self.offset_coef) - print("match parameters ", np.diag(self.prec_marginal), np.diag(self.linear_coef), self.offset_coef) + # print("check parameters for selected+lasso ", np.allclose(np.diag(S), np.ones(S.shape[0])), np.allclose(r, np.zeros(r.shape[0]))) ### sampling methods @@ -172,8 +171,8 @@ def langevin_sampler(selective_posterior, nburnin=100, proposal_scale=None, step=1.): + state = selective_posterior.initial_estimate - print("check INI ", state) stepsize = 1. / (step * selective_posterior.ntarget) if proposal_scale is None: @@ -248,7 +247,7 @@ def __init__(self, self.proposal_scale = proposal_scale self._shape = self.state.shape[0] self._sqrt_step = np.sqrt(self.stepsize) - self._noise = ndist(loc=0, scale=1) + #self._noise = ndist(loc=0, scale=1) self.sample = np.copy(initial_condition) self.scaling = scaling @@ -263,10 +262,10 @@ def next(self): def __next__(self): while True: self.posterior_ = self.gradient_map(self.state, self.scaling) - _proposal = self.proposal_sqrt.dot(self._noise.rvs(self._shape)) + #_proposal = self.proposal_sqrt.dot(self._noise.rvs(self._shape)) + _proposal = self.proposal_sqrt.dot(np.random.standard_normal(self._shape)) candidate = (self.state + self.stepsize * self.proposal_scale.dot(self.posterior_[1]) + np.sqrt(2.) 
* _proposal * self._sqrt_step) - print("check proposal ", _proposal, self.posterior_[1], np.diag(self.proposal_scale)) if not np.all(np.isfinite(self.gradient_map(candidate, self.scaling)[1])): self.stepsize *= 0.5 diff --git a/selectinf/randomized/tests/test_posterior.py b/selectinf/randomized/tests/test_posterior.py index 49abbad38..3aa45b86c 100644 --- a/selectinf/randomized/tests/test_posterior.py +++ b/selectinf/randomized/tests/test_posterior.py @@ -18,7 +18,6 @@ def test_Langevin(n=500, nsample=1500, nburnin=100): - np.random.seed(0) inst, const = gaussian_instance, lasso.gaussian signal = np.sqrt(signal_fac * 2 * np.log(p)) @@ -80,6 +79,7 @@ def test_Langevin(n=500, def test_coverage(nsim=100): + np.random.seed(0) cov, len = 0., 0. for i in range(nsim): @@ -90,8 +90,8 @@ def test_coverage(nsim=100): sigma=3., rho=0.2, randomizer_scale=1., - nsample=5, - nburnin=0) + nsample=1500, + nburnin=100) cov += cov_ len += len_ From 8c386d8345dae37d3585405afc4c064e59429851 Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Tue, 27 Jul 2021 09:36:00 -0400 Subject: [PATCH 128/187] some more tests --- selectinf/randomized/posterior_inference.py | 7 +++---- selectinf/randomized/tests/test_posterior.py | 6 +++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index 45c3742fd..c8a594ddf 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -189,7 +189,7 @@ def langevin_sampler(selective_posterior, for i, sample in enumerate(sampler): sampler.scaling = np.sqrt(selective_posterior.dispersion) samples[i, :] = sample.copy() - print("sample ", i, samples[i,:]) + #print("sample ", i, samples[i,:]) if i == nsample - 1: break @@ -247,7 +247,7 @@ def __init__(self, self.proposal_scale = proposal_scale self._shape = self.state.shape[0] self._sqrt_step = np.sqrt(self.stepsize) - #self._noise = ndist(loc=0, scale=1) + self._noise = ndist(loc=0, scale=1) self.sample = np.copy(initial_condition) self.scaling = scaling @@ -262,8 +262,7 @@ def next(self): def __next__(self): while True: self.posterior_ = self.gradient_map(self.state, self.scaling) - #_proposal = self.proposal_sqrt.dot(self._noise.rvs(self._shape)) - _proposal = self.proposal_sqrt.dot(np.random.standard_normal(self._shape)) + _proposal = self.proposal_sqrt.dot(self._noise.rvs(self._shape)) candidate = (self.state + self.stepsize * self.proposal_scale.dot(self.posterior_[1]) + np.sqrt(2.) * _proposal * self._sqrt_step) diff --git a/selectinf/randomized/tests/test_posterior.py b/selectinf/randomized/tests/test_posterior.py index 3aa45b86c..1d931d915 100644 --- a/selectinf/randomized/tests/test_posterior.py +++ b/selectinf/randomized/tests/test_posterior.py @@ -79,7 +79,7 @@ def test_Langevin(n=500, def test_coverage(nsim=100): - np.random.seed(0) + cov, len = 0., 0. 
for i in range(nsim): @@ -87,7 +87,7 @@ def test_coverage(nsim=100): p=100, signal_fac=0.5, s=5, - sigma=3., + sigma=2., rho=0.2, randomizer_scale=1., nsample=1500, @@ -365,6 +365,6 @@ def test_hiv_data(nsample=10000, if __name__ == "__main__": # test_hiv_data(split_proportion=0.50) - test_coverage(nsim=1) + test_coverage(nsim=20) From 9ea85fa5e8069205fff8a91fc7d13f8c9c530a0b Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Tue, 3 Aug 2021 12:36:23 -0400 Subject: [PATCH 129/187] updated exact ref --- selectinf/randomized/exact_reference.py | 145 +++++++++--------- .../randomized/tests/test_exact_reference.py | 16 +- 2 files changed, 78 insertions(+), 83 deletions(-) diff --git a/selectinf/randomized/exact_reference.py b/selectinf/randomized/exact_reference.py index 429278d1e..cf91eb800 100644 --- a/selectinf/randomized/exact_reference.py +++ b/selectinf/randomized/exact_reference.py @@ -12,9 +12,9 @@ def __init__(self, query, observed_target, cov_target, - cov_target_score, - solve_args={'tol': 1.e-12}, - useIP=False): + regress_target_score, + dispersion, + solve_args={'tol': 1.e-12}): """ Produce p-values and confidence intervals for targets @@ -34,48 +34,52 @@ def __init__(self, Arguments passed to solver. """ - result, inverse_info = query.selective_MLE(observed_target, - cov_target, - cov_target_score, - solve_args=solve_args)[:2] + self.solve_args = solve_args - self.linear_part = query.sampler.affine_con.linear_part - self.offset = query.sampler.affine_con.offset + linear_part = query.sampler.affine_con.linear_part + offset = query.sampler.affine_con.offset - self.regress_opt = query.sampler.logdens_transform[0] - self.cond_mean = query.cond_mean - self.prec_opt = np.linalg.inv(query.cond_cov) - self.cond_cov = query.cond_cov + opt_linear = query.opt_linear - self.observed_target = observed_target - self.cov_target_score = cov_target_score + observed_score = query.observed_score_state + query.observed_subgrad + + result, inverse_info, log_ref = query.selective_MLE(observed_target, + cov_target, + regress_target_score, + dispersion) + + cond_cov = query.cond_cov + self.cond_precision = np.linalg.inv(cond_cov) + self.cond_cov = cond_cov self.cov_target = cov_target + self.prec_target = np.linalg.inv(cov_target) + + self.observed_target = observed_target + self.regress_target_score = regress_target_score + self.opt_linear = opt_linear + self.observed_score = observed_score + + self.M1 = query.M1 * dispersion + self.M2 = query.M2 * (dispersion ** 2) + self.M3 = query.M3 * (dispersion ** 2) + self.feasible_point = query.observed_opt_state - self.observed_soln = query.observed_opt_state + self.cond_mean = query.cond_mean + self.linear_part = linear_part + self.offset = offset - self.prec_randomizer = query.sampler.prec_randomizer - self.score_offset = query.observed_score_state + query.sampler.logdens_transform[1] + self.feasible_point = query.observed_opt_state self.ntarget = ntarget = cov_target.shape[0] _scale = 4 * np.sqrt(np.diag(inverse_info)) - if useIP == False: - ngrid = 1000 - self.stat_grid = np.zeros((ntarget, ngrid)) - for j in range(ntarget): - self.stat_grid[j, :] = np.linspace(observed_target[j] - 1.5 * _scale[j], - observed_target[j] + 1.5 * _scale[j], - num=ngrid) - else: - ngrid = 60 - self.stat_grid = np.zeros((ntarget, ngrid)) - for j in range(ntarget): - self.stat_grid[j, :] = np.linspace(observed_target[j] - 1.5 * _scale[j], - observed_target[j] + 1.5 * _scale[j], - num=ngrid) - - self.opt_linear = query.opt_linear - self.useIP = useIP + ngrid = 1000 + 
self.stat_grid = np.zeros((ntarget, ngrid)) + for j in range(ntarget): + self.stat_grid[j, :] = np.linspace(observed_target[j] - 1.5 * _scale[j], + observed_target[j] + 1.5 * _scale[j], + num=ngrid) + self.inverse_info = inverse_info def summary(self, @@ -120,14 +124,14 @@ def summary(self, def log_reference(self, observed_target, cov_target, - cov_target_score, + regress_target_score, + linear_coef, grid): if np.asarray(observed_target).shape in [(), (0,)]: raise ValueError('no target specified') prec_target = np.linalg.inv(cov_target) - regress_opt_target = self.regress_opt.dot(cov_target_score.T.dot(prec_target)) ref_hat = [] @@ -138,15 +142,15 @@ def log_reference(self, # cond_mean is "something" times D # Gamma is cov_target_score.T.dot(prec_target) - num_opt = self.prec_opt.shape[0] + num_opt = self.cond_precision.shape[0] num_con = self.linear_part.shape[0] - cond_mean_grid = (regress_opt_target.dot(np.atleast_1d(grid[k] - observed_target)) + + cond_mean_grid = (linear_coef.dot(np.atleast_1d(grid[k] - observed_target)) + self.cond_mean) #direction for decomposing o - eta = self.prec_opt.dot(self.regress_opt.dot(cov_target_score.T)) + eta = self.cond_precision.dot(linear_coef).dot(cov_target) implied_mean = np.asscalar(eta.T.dot(cond_mean_grid)) implied_cov = np.asscalar(eta.T.dot(self.cond_cov).dot(eta)) @@ -156,7 +160,7 @@ def log_reference(self, R = np.identity(num_opt) - _A.dot(eta.T) A = self.linear_part.dot(_A).reshape((-1,)) - b = -self.linear_part.dot(R).dot(self.observed_soln) + b = -self.linear_part.dot(R).dot(self.feasible_point) trunc_ = np.true_divide((self.offset + b), A) @@ -198,37 +202,24 @@ def _construct_families(self): self._families = [] for m in range(self.ntarget): - p = self.cov_target_score.shape[1] + p = self.regress_target_score.shape[1] observed_target_uni = (self.observed_target[m]).reshape((1,)) cov_target_uni = (np.diag(self.cov_target)[m]).reshape((1, 1)) - cov_target_score_uni = self.cov_target_score[m, :].reshape((1, p)) + regress_target_score_uni = self.regress_target_score[m, :].reshape((1, p)) var_target = 1. / ((self.precs[m])[0, 0]) log_ref = self.log_reference(observed_target_uni, cov_target_uni, - cov_target_score_uni, + regress_target_score_uni, + self.T[m], self.stat_grid[m]) - if self.useIP == False: - logW = (log_ref - 0.5 * (self.stat_grid[m] - self.observed_target[m]) ** 2 / var_target) - logW -= logW.max() - self._families.append(discrete_family(self.stat_grid[m], - np.exp(logW))) - else: - approx_fn = interp1d(self.stat_grid[m], - log_ref, - kind='quadratic', - bounds_error=False, - fill_value='extrapolate') - grid = np.linspace(self.stat_grid[m].min(), self.stat_grid[m].max(), 1000) - logW = (approx_fn(grid) - - 0.5 * (grid - self.observed_target[m]) ** 2 / var_target) - - logW -= logW.max() - self._families.append(discrete_family(grid, - np.exp(logW))) + logW = (log_ref - 0.5 * (self.stat_grid[m] - self.observed_target[m]) ** 2 / var_target) + logW -= logW.max() + self._families.append(discrete_family(self.stat_grid[m], + np.exp(logW))) def _pivots(self, mean_parameter, @@ -290,36 +281,42 @@ def _construct_density(self): precs = {} S = {} r = {} + T = {} - p = self.cov_target_score.shape[1] + p = self.regress_target_score.shape[1] for m in range(self.ntarget): observed_target_uni = (self.observed_target[m]).reshape((1,)) cov_target_uni = (np.diag(self.cov_target)[m]).reshape((1, 1)) prec_target = 1. 
/ cov_target_uni - cov_target_score_uni = self.cov_target_score[m, :].reshape((1, p)) + regress_target_score_uni = self.regress_target_score[m, :].reshape((1, p)) + + T1 = regress_target_score_uni.T.dot(prec_target) + T2 = T1.T.dot(self.M2.dot(T1)) + T3 = T1.T.dot(self.M3.dot(T1)) + T4 = self.M1.dot(self.opt_linear).dot(self.cond_cov).dot(self.opt_linear.T.dot(self.M1.T.dot(T1))) + T5 = T1.T.dot(self.M1.dot(self.opt_linear)) + + _T = self.cond_cov.dot(T5.T) - regress_score_target = cov_target_score_uni.T.dot(prec_target) - resid_score_target = (self.score_offset - regress_score_target.dot(observed_target_uni)).reshape( - (regress_score_target.shape[0],)) + prec_target_nosel = prec_target + T2 - T3 - regress_opt_target = self.regress_opt.dot(regress_score_target) - resid_mean_opt_target = (self.cond_mean - regress_opt_target.dot(observed_target_uni)).reshape((regress_opt_target.shape[0],)) + _P = -(T1.T.dot(self.M1.dot(self.observed_score)) + T2.dot(observed_target_uni)) - prec_target_nosel = prec_target + (regress_score_target.T.dot(regress_score_target) * self.prec_randomizer) - regress_opt_target.T.dot( - self.prec_opt).dot(regress_opt_target) + bias_target = cov_target_uni.dot(T1.T.dot(-T4.dot(observed_target_uni) + self.M1.dot(self.opt_linear.dot(self.cond_mean))) - _P) - _P = regress_score_target.T.dot(resid_score_target) * self.prec_randomizer - _r = (1. / _prec).dot(regress_opt_target.T.dot(self.prec_opt).dot(resid_mean_opt_target) - _P) - _S = np.linalg.inv(_prec).dot(prec_target) + _r = np.linalg.inv(prec_target_nosel).dot(prec_target.dot(bias_target)) + _S = np.linalg.inv(prec_target_nosel).dot(prec_target) S[m] = _S r[m] = _r precs[m] = prec_target_nosel + T[m] = _T self.precs = precs self.S = S self.r = r + self.T = T diff --git a/selectinf/randomized/tests/test_exact_reference.py b/selectinf/randomized/tests/test_exact_reference.py index 7c1a5efb4..9b0b0476d 100644 --- a/selectinf/randomized/tests/test_exact_reference.py +++ b/selectinf/randomized/tests/test_exact_reference.py @@ -12,7 +12,6 @@ def test_inf(n=500, rho=0.4, randomizer_scale=1., equicorrelated=False, - useIP=False, CI=True): while True: @@ -55,7 +54,8 @@ def test_inf(n=500, (observed_target, cov_target, - cov_target_score, + regress_target_score, + dispersion, alternatives) = selected_targets(conv.loglike, conv._W, nonzero, @@ -64,8 +64,8 @@ def test_inf(n=500, exact_grid_inf = exact_grid_inference(conv, observed_target, cov_target, - cov_target_score, - useIP=useIP) + regress_target_score, + dispersion=dispersion) if CI is False: pivot = exact_grid_inf._pivots(beta_target) @@ -89,15 +89,14 @@ def main(nsim=300, CI = False): _pivot = [] for i in range(nsim): - _pivot.extend(test_inf(n=100, - p=400, + _pivot.extend(test_inf(n=400, + p=100, signal_fac=1., s=0, sigma=2., rho=0.30, randomizer_scale=0.7, equicorrelated=True, - useIP=False, CI=False)) print("iteration completed ", i) @@ -122,7 +121,6 @@ def main(nsim=300, CI = False): rho=0.30, randomizer_scale=0.7, equicorrelated=True, - useIP=False, CI=True) coverage_ += cov @@ -134,4 +132,4 @@ def main(nsim=300, CI = False): if __name__ == "__main__": - main(nsim=100, CI=True) \ No newline at end of file + main(nsim=100, CI=False) \ No newline at end of file From 2b72a5fdfc5a09d5543fc6466bf6fa7bdae4c71b Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Sat, 7 Aug 2021 23:47:53 -0400 Subject: [PATCH 130/187] check in progress: w master --- .../randomized/tests/test_exact_reference.py | 202 +++++++++++++----- 1 file changed, 150 insertions(+), 52 deletions(-) 
diff --git a/selectinf/randomized/tests/test_exact_reference.py b/selectinf/randomized/tests/test_exact_reference.py index 9b0b0476d..2f3e4ca66 100644 --- a/selectinf/randomized/tests/test_exact_reference.py +++ b/selectinf/randomized/tests/test_exact_reference.py @@ -43,6 +43,7 @@ def test_inf(n=500, conv = const(X, Y, W, + ridge_term=0., randomizer_scale=randomizer_scale * np.sqrt(dispersion)) signs = conv.fit() @@ -78,58 +79,155 @@ def test_inf(n=500, mle_length = 1.65*2 * np.sqrt(np.diag(exact_grid_inf.inverse_info)) return np.mean(coverage), np.mean(length), np.mean(mle_length) -def main(nsim=300, CI = False): - - if CI is False: - - import matplotlib as mpl - mpl.use('tkagg') - import matplotlib.pyplot as plt - from statsmodels.distributions.empirical_distribution import ECDF - - _pivot = [] - for i in range(nsim): - _pivot.extend(test_inf(n=400, - p=100, - signal_fac=1., - s=0, - sigma=2., - rho=0.30, - randomizer_scale=0.7, - equicorrelated=True, - CI=False)) - - print("iteration completed ", i) - - plt.clf() - ecdf_pivot = ECDF(np.asarray(_pivot)) - grid = np.linspace(0, 1, 101) - plt.plot(grid, ecdf_pivot(grid), c='blue') - plt.plot(grid, grid, 'k--') - plt.show() - - else: - coverage_ = 0. - length_ = 0. - mle_length_= 0. - for n in range(nsim): - cov, len, mle_len = test_inf(n=400, - p=100, - signal_fac=0.5, - s=5, - sigma=2., - rho=0.30, - randomizer_scale=0.7, - equicorrelated=True, - CI=True) - - coverage_ += cov - length_ += len - mle_length_ += mle_len - print("coverage so far ", coverage_ / (n + 1.)) - print("lengths so far ", length_ / (n + 1.), mle_length_/ (n + 1.)) - print("iteration completed ", n + 1) +# def main(nsim=300, CI = False): +# +# if CI is False: +# +# import matplotlib as mpl +# mpl.use('tkagg') +# import matplotlib.pyplot as plt +# from statsmodels.distributions.empirical_distribution import ECDF +# +# _pivot = [] +# for i in range(nsim): +# _pivot.extend(test_inf(n=400, +# p=100, +# signal_fac=1., +# s=0, +# sigma=2., +# rho=0.30, +# randomizer_scale=0.5, +# equicorrelated=True, +# CI=False)) +# +# print("iteration completed ", i) +# +# plt.clf() +# ecdf_pivot = ECDF(np.asarray(_pivot)) +# grid = np.linspace(0, 1, 101) +# plt.plot(grid, ecdf_pivot(grid), c='blue') +# plt.plot(grid, grid, 'k--') +# plt.show() +# +# else: +# coverage_ = 0. +# length_ = 0. +# mle_length_= 0. 
+# for n in range(nsim): +# cov, len, mle_len = test_inf(n=400, +# p=100, +# signal_fac=0.5, +# s=5, +# sigma=2., +# rho=0.30, +# randomizer_scale=0.7, +# equicorrelated=True, +# CI=True) +# +# coverage_ += cov +# length_ += len +# mle_length_ += mle_len +# print("coverage so far ", coverage_ / (n + 1.)) +# print("lengths so far ", length_ / (n + 1.), mle_length_/ (n + 1.)) +# print("iteration completed ", n + 1) + + +def test_selected_instance(seedn, + n=500, + p=100, + signal_fac=1., + s=5, + sigma=2., + rho=0.4, + randomizer_scale=1., + equicorrelated=False): + while True: + np.random.seed(seedn) + inst, const = gaussian_instance, lasso.gaussian + signal = np.sqrt(signal_fac * 2 * np.log(p)) + + X, Y, beta = inst(n=n, + p=p, + signal=signal, + s=s, + equicorrelated=equicorrelated, + rho=rho, + sigma=sigma, + random_signs=True)[:3] + + n, p = X.shape + + sigma_ = np.std(Y) + + if n > (2 * p): + dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) + else: + dispersion = sigma_ ** 2 + + eps = np.random.standard_normal((n, 2000)) * Y.std() + W = 0.7 * np.median(np.abs(X.T.dot(eps)).max(1)) + + conv = const(X, + Y, + W, + ridge_term=0., + randomizer_scale=randomizer_scale * np.sqrt(dispersion)) + + signs = conv.fit() + nonzero = signs != 0 + print("size of selected set ", nonzero.sum()) + + if nonzero.sum() > 0: + beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) + + (observed_target, + cov_target, + regress_target_score, + dispersion, + alternatives) = selected_targets(conv.loglike, + conv._W, + nonzero, + dispersion=dispersion) + + exact_grid_inf = exact_grid_inference(conv, + observed_target, + cov_target, + regress_target_score, + dispersion=dispersion) + + lci, uci = exact_grid_inf._intervals(level=0.90) + coverage = (lci < beta_target) * (uci > beta_target) + length = uci - lci + mle_length = 1.65 * 2 * np.sqrt(np.diag(exact_grid_inf.inverse_info)) + return coverage, length, mle_length + +def main(nsim =50): + + import pandas as pd + column_names = ["Experiment Replicate", "Coverage", "Length-ER", "Length-MLE"] + master_DF = pd.DataFrame(columns=column_names) + DF = pd.DataFrame(columns=column_names) + + n, p, s = 500, 100, 5 + for i in range(nsim): + full_dispersion = True + cov, len_er, len_mle = test_selected_instance(seedn=i, n=n, p=p, s=s, signal_fac=1.2) + DF["Coverage"] = pd.Series(cov) + DF["Length-ER"] = pd.Series(len_er) + DF["Length-MLE"] = pd.Series(len_mle) + DF["Experiment Replicate"] = pd.Series((i*np.ones(len(cov),int)).tolist()) + + master_DF = DF.append(master_DF, ignore_index=True) + + import os + outpath = os.path.dirname(__file__) + + outfile_mse_html = os.path.join(outpath, "compare_er.html") + outfile_mse_csv = os.path.join(outpath, "compare_er.csv") + + master_DF.to_html(outfile_mse_html, index=False) + master_DF.to_csv(outfile_mse_csv, index=False) if __name__ == "__main__": - main(nsim=100, CI=False) \ No newline at end of file + main(nsim=10) \ No newline at end of file From 5eadfe6181565d6ab4a2750ccd99d9ce1b342c39 Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Mon, 9 Aug 2021 00:31:37 -0400 Subject: [PATCH 131/187] updated approx reference --- selectinf/randomized/approx_reference.py | 101 +++--- selectinf/randomized/exact_reference.py | 14 +- .../randomized/tests/test_approx_reference.py | 27 +- .../randomized/tests/test_exact_reference.py | 302 +++++++++--------- 4 files changed, 229 insertions(+), 215 deletions(-) diff --git a/selectinf/randomized/approx_reference.py b/selectinf/randomized/approx_reference.py 
index 40e7363c4..f60f7b2a8 100644 --- a/selectinf/randomized/approx_reference.py +++ b/selectinf/randomized/approx_reference.py @@ -13,7 +13,8 @@ def __init__(self, query, observed_target, cov_target, - cov_target_score, + regress_target_score, + dispersion, solve_args={'tol': 1.e-12}, useIP=False): @@ -37,27 +38,39 @@ def __init__(self, self.solve_args = solve_args - result, inverse_info = query.selective_MLE(observed_target, - cov_target, - cov_target_score, - solve_args=solve_args)[:2] + linear_part = query.sampler.affine_con.linear_part + offset = query.sampler.affine_con.offset - self.linear_part = query.sampler.affine_con.linear_part - self.offset = query.sampler.affine_con.offset + opt_linear = query.opt_linear - self.regress_opt = query.sampler.logdens_transform[0] - self.cond_mean = query.cond_mean - self.prec_opt = np.linalg.inv(query.cond_cov) - self.cond_cov = query.cond_cov + observed_score = query.observed_score_state + query.observed_subgrad - self.observed_target = observed_target - self.cov_target_score = cov_target_score + result, inverse_info, log_ref = query.selective_MLE(observed_target, + cov_target, + regress_target_score, + dispersion) + + cond_cov = query.cond_cov + self.cond_precision = np.linalg.inv(cond_cov) + self.cond_cov = cond_cov self.cov_target = cov_target + self.prec_target = np.linalg.inv(cov_target) + + self.observed_target = observed_target + self.regress_target_score = regress_target_score + self.opt_linear = opt_linear + self.observed_score = observed_score + + self.M1 = query.M1 * dispersion + self.M2 = query.M2 * (dispersion ** 2) + self.M3 = query.M3 * (dispersion ** 2) + self.feasible_point = query.observed_opt_state - self.observed_soln = query.observed_opt_state + self.cond_mean = query.cond_mean + self.linear_part = linear_part + self.offset = offset - self.prec_randomizer = query.sampler.prec_randomizer - self.score_offset = query.observed_score_state + query.sampler.logdens_transform[1] + self.feasible_point = query.observed_opt_state self.ntarget = ntarget = cov_target.shape[0] _scale = 4 * np.sqrt(np.diag(inverse_info)) @@ -79,6 +92,7 @@ def __init__(self, self.opt_linear = query.opt_linear self.useIP = useIP + self.inverse_info = inverse_info def summary(self, alternatives=None, @@ -122,7 +136,7 @@ def summary(self, def _approx_log_reference(self, observed_target, cov_target, - cov_target_score, + linear_coef, grid): """ @@ -131,11 +145,9 @@ def _approx_log_reference(self, if np.asarray(observed_target).shape in [(), (0,)]: raise ValueError('no target specified') - prec_target = np.linalg.inv(cov_target) - regress_opt_target = self.regress_opt.dot(cov_target_score.T.dot(prec_target)) - ref_hat = [] solver = solve_barrier_affine_py + for k in range(grid.shape[0]): # in the usual D = N + Gamma theta.hat, # regress_opt_target is "something" times Gamma, @@ -143,13 +155,12 @@ def _approx_log_reference(self, # cond_mean is "something" times D # Gamma is cov_target_score.T.dot(prec_target) - cond_mean_grid = (regress_opt_target.dot(np.atleast_1d(grid[k] - observed_target)) + - self.cond_mean) - conjugate_arg = self.prec_opt.dot(cond_mean_grid) + cond_mean_grid = (linear_coef.dot(np.atleast_1d(grid[k] - observed_target)) + self.cond_mean) + conjugate_arg = self.cond_precision.dot(cond_mean_grid) val, _, _ = solver(conjugate_arg, - self.prec_opt, - self.observed_soln, + self.cond_precision, + self.feasible_point, self.linear_part, self.offset, **self.solve_args) @@ -165,26 +176,25 @@ def _construct_families(self): self._families = [] for m in 
range(self.ntarget): - p = self.cov_target_score.shape[1] - observed_target_uni = (self.observed_target[m]).reshape((1,)) + observed_target_uni = (self.observed_target[m]).reshape((1,)) cov_target_uni = (np.diag(self.cov_target)[m]).reshape((1, 1)) - cov_target_score_uni = self.cov_target_score[m, :].reshape((1, p)) var_target = 1. / ((self.precs[m])[0, 0]) approx_log_ref = self._approx_log_reference(observed_target_uni, cov_target_uni, - cov_target_score_uni, + self.T[m], self.stat_grid[m]) - if self.useIP == False: + logW = (approx_log_ref - 0.5 * (self.stat_grid[m] - self.observed_target[m]) ** 2 / var_target) logW -= logW.max() self._families.append(discrete_family(self.stat_grid[m], np.exp(logW))) else: + approx_fn = interp1d(self.stat_grid[m], approx_log_ref, kind='quadratic', @@ -277,33 +287,40 @@ def _construct_density(self): precs = {} S = {} r = {} + T = {} - p = self.cov_target_score.shape[1] + p = self.regress_target_score.shape[1] for m in range(self.ntarget): observed_target_uni = (self.observed_target[m]).reshape((1,)) cov_target_uni = (np.diag(self.cov_target)[m]).reshape((1, 1)) prec_target = 1. / cov_target_uni - cov_target_score_uni = self.cov_target_score[m, :].reshape((1, p)) + regress_target_score_uni = self.regress_target_score[m, :].reshape((1, p)) + + T1 = regress_target_score_uni.T.dot(prec_target) + T2 = T1.T.dot(self.M2.dot(T1)) + T3 = T1.T.dot(self.M3.dot(T1)) + T4 = self.M1.dot(self.opt_linear).dot(self.cond_cov).dot(self.opt_linear.T.dot(self.M1.T.dot(T1))) + T5 = T1.T.dot(self.M1.dot(self.opt_linear)) + + _T = self.cond_cov.dot(T5.T) - regress_score_target = cov_target_score_uni.T.dot(prec_target) - resid_score_target = (self.score_offset - regress_score_target.dot(observed_target_uni)).reshape( - (regress_score_target.shape[0],)) + prec_target_nosel = prec_target + T2 - T3 - regress_opt_target = self.regress_opt.dot(regress_score_target) - resid_mean_opt_target = (self.cond_mean - regress_opt_target.dot(observed_target_uni)).reshape((regress_opt_target.shape[0],)) + _P = -(T1.T.dot(self.M1.dot(self.observed_score)) + T2.dot(observed_target_uni)) - prec_target_nosel = prec_target + (regress_score_target.T.dot(regress_score_target) * self.prec_randomizer) - regress_opt_target.T.dot( - self.prec_opt).dot(regress_opt_target) + bias_target = cov_target_uni.dot( + T1.T.dot(-T4.dot(observed_target_uni) + self.M1.dot(self.opt_linear.dot(self.cond_mean))) - _P) - _P = regress_score_target.T.dot(resid_score_target) * self.prec_randomizer - _r = (1. 
/ _prec).dot(regress_opt_target.T.dot(self.prec_opt).dot(resid_mean_opt_target) - _P) - _S = np.linalg.inv(_prec).dot(prec_target) + _r = np.linalg.inv(prec_target_nosel).dot(prec_target.dot(bias_target)) + _S = np.linalg.inv(prec_target_nosel).dot(prec_target) S[m] = _S r[m] = _r precs[m] = prec_target_nosel + T[m] = _T self.precs = precs self.S = S self.r = r + self.T = T \ No newline at end of file diff --git a/selectinf/randomized/exact_reference.py b/selectinf/randomized/exact_reference.py index cf91eb800..df90eacec 100644 --- a/selectinf/randomized/exact_reference.py +++ b/selectinf/randomized/exact_reference.py @@ -44,9 +44,9 @@ def __init__(self, observed_score = query.observed_score_state + query.observed_subgrad result, inverse_info, log_ref = query.selective_MLE(observed_target, - cov_target, - regress_target_score, - dispersion) + cov_target, + regress_target_score, + dispersion) cond_cov = query.cond_cov self.cond_precision = np.linalg.inv(cond_cov) @@ -124,15 +124,12 @@ def summary(self, def log_reference(self, observed_target, cov_target, - regress_target_score, linear_coef, grid): if np.asarray(observed_target).shape in [(), (0,)]: raise ValueError('no target specified') - prec_target = np.linalg.inv(cov_target) - ref_hat = [] for k in range(grid.shape[0]): @@ -202,17 +199,14 @@ def _construct_families(self): self._families = [] for m in range(self.ntarget): - p = self.regress_target_score.shape[1] - observed_target_uni = (self.observed_target[m]).reshape((1,)) + observed_target_uni = (self.observed_target[m]).reshape((1,)) cov_target_uni = (np.diag(self.cov_target)[m]).reshape((1, 1)) - regress_target_score_uni = self.regress_target_score[m, :].reshape((1, p)) var_target = 1. / ((self.precs[m])[0, 0]) log_ref = self.log_reference(observed_target_uni, cov_target_uni, - regress_target_score_uni, self.T[m], self.stat_grid[m]) diff --git a/selectinf/randomized/tests/test_approx_reference.py b/selectinf/randomized/tests/test_approx_reference.py index bbfe4b719..5d9458809 100644 --- a/selectinf/randomized/tests/test_approx_reference.py +++ b/selectinf/randomized/tests/test_approx_reference.py @@ -109,7 +109,8 @@ def test_approx_pivot(n=500, (observed_target, cov_target, - cov_target_score, + regress_target_score, + dispersion, alternatives) = selected_targets(conv.loglike, conv._W, nonzero, @@ -118,10 +119,10 @@ def test_approx_pivot(n=500, approximate_grid_inf = approximate_grid_inference(conv, observed_target, cov_target, - cov_target_score, + regress_target_score, + dispersion=dispersion, useIP=useIP) - pivot = approximate_grid_inf._approx_pivots(beta_target) return pivot @@ -134,7 +135,8 @@ def test_approx_ci(n=500, sigma=2., rho=0.4, randomizer_scale=1., - level=0.9): + level=0.9, + useIP=False): inst, const = gaussian_instance, lasso.gaussian signal = np.sqrt(signal_fac * 2 * np.log(p)) @@ -167,7 +169,8 @@ def test_approx_ci(n=500, (observed_target, cov_target, - cov_target_score, + regress_target_score, + dispersion, alternatives) = selected_targets(conv.loglike, conv._W, nonzero, @@ -175,7 +178,8 @@ def test_approx_ci(n=500, result, inverse_info = conv.selective_MLE(observed_target, cov_target, - cov_target_score)[:2] + regress_target_score, + dispersion)[:2] _scale = 4 * np.sqrt(np.diag(inverse_info)) scale_ = np.max(_scale) @@ -184,15 +188,12 @@ def test_approx_ci(n=500, approximate_grid_inf = approximate_grid_inference(conv, observed_target, cov_target, - cov_target_score, - useIP=False) + regress_target_score, + dispersion=dispersion, + useIP=useIP) lci, uci = 
approximate_grid_inf._approx_intervals(level) - S = conv.approximate_grid_inference(observed_target, - cov_target, - cov_target_score) - beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) coverage = (lci < beta_target) * (uci > beta_target) length = uci - lci @@ -214,7 +215,7 @@ def main(nsim=300, CI = False): _pivot.extend(test_approx_pivot(n=100, p=400, signal_fac=0.5, - s=0, + s=5, sigma=2., rho=0.30, randomizer_scale=1., diff --git a/selectinf/randomized/tests/test_exact_reference.py b/selectinf/randomized/tests/test_exact_reference.py index 2f3e4ca66..ef66cc963 100644 --- a/selectinf/randomized/tests/test_exact_reference.py +++ b/selectinf/randomized/tests/test_exact_reference.py @@ -79,155 +79,157 @@ def test_inf(n=500, mle_length = 1.65*2 * np.sqrt(np.diag(exact_grid_inf.inverse_info)) return np.mean(coverage), np.mean(length), np.mean(mle_length) -# def main(nsim=300, CI = False): -# -# if CI is False: -# -# import matplotlib as mpl -# mpl.use('tkagg') -# import matplotlib.pyplot as plt -# from statsmodels.distributions.empirical_distribution import ECDF -# -# _pivot = [] -# for i in range(nsim): -# _pivot.extend(test_inf(n=400, -# p=100, -# signal_fac=1., -# s=0, -# sigma=2., -# rho=0.30, -# randomizer_scale=0.5, -# equicorrelated=True, -# CI=False)) -# -# print("iteration completed ", i) -# -# plt.clf() -# ecdf_pivot = ECDF(np.asarray(_pivot)) -# grid = np.linspace(0, 1, 101) -# plt.plot(grid, ecdf_pivot(grid), c='blue') -# plt.plot(grid, grid, 'k--') -# plt.show() -# -# else: -# coverage_ = 0. -# length_ = 0. -# mle_length_= 0. -# for n in range(nsim): -# cov, len, mle_len = test_inf(n=400, -# p=100, -# signal_fac=0.5, -# s=5, -# sigma=2., -# rho=0.30, -# randomizer_scale=0.7, -# equicorrelated=True, -# CI=True) -# -# coverage_ += cov -# length_ += len -# mle_length_ += mle_len -# print("coverage so far ", coverage_ / (n + 1.)) -# print("lengths so far ", length_ / (n + 1.), mle_length_/ (n + 1.)) -# print("iteration completed ", n + 1) - - -def test_selected_instance(seedn, - n=500, - p=100, - signal_fac=1., - s=5, - sigma=2., - rho=0.4, - randomizer_scale=1., - equicorrelated=False): - - while True: - np.random.seed(seedn) - inst, const = gaussian_instance, lasso.gaussian - signal = np.sqrt(signal_fac * 2 * np.log(p)) - - X, Y, beta = inst(n=n, - p=p, - signal=signal, - s=s, - equicorrelated=equicorrelated, - rho=rho, - sigma=sigma, - random_signs=True)[:3] - - n, p = X.shape - - sigma_ = np.std(Y) - - if n > (2 * p): - dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) - else: - dispersion = sigma_ ** 2 - - eps = np.random.standard_normal((n, 2000)) * Y.std() - W = 0.7 * np.median(np.abs(X.T.dot(eps)).max(1)) - - conv = const(X, - Y, - W, - ridge_term=0., - randomizer_scale=randomizer_scale * np.sqrt(dispersion)) - - signs = conv.fit() - nonzero = signs != 0 - print("size of selected set ", nonzero.sum()) - - if nonzero.sum() > 0: - beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) - - (observed_target, - cov_target, - regress_target_score, - dispersion, - alternatives) = selected_targets(conv.loglike, - conv._W, - nonzero, - dispersion=dispersion) - - exact_grid_inf = exact_grid_inference(conv, - observed_target, - cov_target, - regress_target_score, - dispersion=dispersion) - - lci, uci = exact_grid_inf._intervals(level=0.90) - coverage = (lci < beta_target) * (uci > beta_target) - length = uci - lci - mle_length = 1.65 * 2 * np.sqrt(np.diag(exact_grid_inf.inverse_info)) - return coverage, length, mle_length - -def main(nsim 
=50): - - import pandas as pd - column_names = ["Experiment Replicate", "Coverage", "Length-ER", "Length-MLE"] - master_DF = pd.DataFrame(columns=column_names) - DF = pd.DataFrame(columns=column_names) - - n, p, s = 500, 100, 5 - for i in range(nsim): - full_dispersion = True - cov, len_er, len_mle = test_selected_instance(seedn=i, n=n, p=p, s=s, signal_fac=1.2) - DF["Coverage"] = pd.Series(cov) - DF["Length-ER"] = pd.Series(len_er) - DF["Length-MLE"] = pd.Series(len_mle) - DF["Experiment Replicate"] = pd.Series((i*np.ones(len(cov),int)).tolist()) - - master_DF = DF.append(master_DF, ignore_index=True) - - import os - outpath = os.path.dirname(__file__) - - outfile_mse_html = os.path.join(outpath, "compare_er.html") - outfile_mse_csv = os.path.join(outpath, "compare_er.csv") - - master_DF.to_html(outfile_mse_html, index=False) - master_DF.to_csv(outfile_mse_csv, index=False) +def main(nsim=300, CI = False): + + if CI is False: + + import matplotlib as mpl + mpl.use('tkagg') + import matplotlib.pyplot as plt + from statsmodels.distributions.empirical_distribution import ECDF + + _pivot = [] + for i in range(nsim): + _pivot.extend(test_inf(n=400, + p=100, + signal_fac=1., + s=0, + sigma=2., + rho=0.30, + randomizer_scale=0.5, + equicorrelated=True, + CI=False)) + + print("iteration completed ", i) + + plt.clf() + ecdf_pivot = ECDF(np.asarray(_pivot)) + grid = np.linspace(0, 1, 101) + plt.plot(grid, ecdf_pivot(grid), c='blue') + plt.plot(grid, grid, 'k--') + plt.show() + + else: + coverage_ = 0. + length_ = 0. + mle_length_= 0. + for n in range(nsim): + cov, len, mle_len = test_inf(n=400, + p=100, + signal_fac=1, + s=0, + sigma=2., + rho=0.30, + randomizer_scale=0.5, + equicorrelated=True, + CI=True) + + coverage_ += cov + length_ += len + mle_length_ += mle_len + print("coverage so far ", coverage_ / (n + 1.)) + print("lengths so far ", length_ / (n + 1.), mle_length_/ (n + 1.)) + print("iteration completed ", n + 1) if __name__ == "__main__": - main(nsim=10) \ No newline at end of file + main(nsim=500, CI=False) + +# def test_selected_instance(seedn, +# n=500, +# p=100, +# signal_fac=1., +# s=5, +# sigma=2., +# rho=0.4, +# randomizer_scale=1., +# equicorrelated=False): +# +# while True: +# np.random.seed(seedn) +# inst, const = gaussian_instance, lasso.gaussian +# signal = np.sqrt(signal_fac * 2 * np.log(p)) +# +# X, Y, beta = inst(n=n, +# p=p, +# signal=signal, +# s=s, +# equicorrelated=equicorrelated, +# rho=rho, +# sigma=sigma, +# random_signs=True)[:3] +# +# n, p = X.shape +# +# sigma_ = np.std(Y) +# +# if n > (2 * p): +# dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) +# else: +# dispersion = sigma_ ** 2 +# +# eps = np.random.standard_normal((n, 2000)) * Y.std() +# W = 0.7 * np.median(np.abs(X.T.dot(eps)).max(1)) +# +# conv = const(X, +# Y, +# W, +# ridge_term=0., +# randomizer_scale=randomizer_scale * np.sqrt(dispersion)) +# +# signs = conv.fit() +# nonzero = signs != 0 +# print("size of selected set ", nonzero.sum()) +# +# if nonzero.sum() > 0: +# beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) +# +# (observed_target, +# cov_target, +# regress_target_score, +# dispersion, +# alternatives) = selected_targets(conv.loglike, +# conv._W, +# nonzero, +# dispersion=dispersion) +# +# exact_grid_inf = exact_grid_inference(conv, +# observed_target, +# cov_target, +# regress_target_score, +# dispersion=dispersion) +# +# lci, uci = exact_grid_inf._intervals(level=0.90) +# coverage = (lci < beta_target) * (uci > beta_target) +# length = uci - lci +# 
mle_length = 1.65 * 2 * np.sqrt(np.diag(exact_grid_inf.inverse_info)) +# return coverage, length, mle_length +# +# def main(nsim =50): +# +# import pandas as pd +# column_names = ["Experiment Replicate", "Coverage", "Length-ER", "Length-MLE"] +# master_DF = pd.DataFrame(columns=column_names) +# DF = pd.DataFrame(columns=column_names) +# +# n, p, s = 500, 100, 5 +# for i in range(nsim): +# full_dispersion = True +# cov, len_er, len_mle = test_selected_instance(seedn=i, n=n, p=p, s=s, signal_fac=1.2) +# DF["Coverage"] = pd.Series(cov) +# DF["Length-ER"] = pd.Series(len_er) +# DF["Length-MLE"] = pd.Series(len_mle) +# DF["Experiment Replicate"] = pd.Series((i*np.ones(len(cov),int)).tolist()) +# +# master_DF = DF.append(master_DF, ignore_index=True) +# +# import os +# outpath = os.path.dirname(__file__) +# +# outfile_mse_html = os.path.join(outpath, "compare_er.html") +# outfile_mse_csv = os.path.join(outpath, "compare_er.csv") +# +# master_DF.to_html(outfile_mse_html, index=False) +# master_DF.to_csv(outfile_mse_csv, index=False) +# +# if __name__ == "__main__": +# main(nsim=10) \ No newline at end of file From bb2802caee33cd17550de47adbd289904a1d139f Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Tue, 10 Aug 2021 10:23:50 -0400 Subject: [PATCH 132/187] clean up for all the tests --- selectinf/randomized/approx_reference.py | 1 - selectinf/randomized/posterior_inference.py | 1 - .../randomized/tests/test_approx_reference.py | 199 ++++------------- .../randomized/tests/test_exact_reference.py | 104 +-------- selectinf/randomized/tests/test_posterior.py | 49 +++-- .../tests/test_selective_MLE_high.py | 206 ++++-------------- 6 files changed, 118 insertions(+), 442 deletions(-) diff --git a/selectinf/randomized/approx_reference.py b/selectinf/randomized/approx_reference.py index f60f7b2a8..ac9868136 100644 --- a/selectinf/randomized/approx_reference.py +++ b/selectinf/randomized/approx_reference.py @@ -243,7 +243,6 @@ def _approx_pivots(self, var_target = 1. 
/ ((self.precs[m])[0, 0]) mean = self.S[m].dot(mean_parameter[m].reshape((1,))) + self.r[m] - #print("mean ", np.allclose(mean[0], mean_parameter[m]), self.r[m], self.S[m]) # construction of pivot from families follows `selectinf.learning.core` _cdf = family.cdf((mean[0] - self.observed_target[m]) / var_target, x=self.observed_target[m]) diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index c8a594ddf..4395e4499 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -162,7 +162,6 @@ def _set_marginal_parameters(self): self.r = r self.S = S self.prec_target_nosel = prec_target_nosel - # print("check parameters for selected+lasso ", np.allclose(np.diag(S), np.ones(S.shape[0])), np.allclose(r, np.zeros(r.shape[0]))) ### sampling methods diff --git a/selectinf/randomized/tests/test_approx_reference.py b/selectinf/randomized/tests/test_approx_reference.py index 5d9458809..839cee8ca 100644 --- a/selectinf/randomized/tests/test_approx_reference.py +++ b/selectinf/randomized/tests/test_approx_reference.py @@ -4,70 +4,16 @@ from ..lasso import lasso, selected_targets from ..approx_reference import approximate_grid_inference -def test_summary(n=500, - p=100, - signal_fac=1., - s=5, - sigma=2., - rho=0.4, - randomizer_scale=1.): - - inst, const = gaussian_instance, lasso.gaussian - signal = np.sqrt(signal_fac * 2 * np.log(p)) - - X, Y, beta = inst(n=n, - p=p, - signal=signal, - s=s, - equicorrelated=False, - rho=rho, - sigma=sigma, - random_signs=True)[:3] - - n, p = X.shape - - sigma_ = np.std(Y) - dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) - - W = 1 * np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ - - conv = const(X, - Y, - W, - randomizer_scale=randomizer_scale * dispersion) - - signs = conv.fit() - nonzero = signs != 0 - - if nonzero.sum()>0: - beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) - - (observed_target, - cov_target, - cov_target_score, - alternatives) = selected_targets(conv.loglike, - conv._W, - nonzero, - dispersion=dispersion) - - inverse_info = conv.selective_MLE(observed_target, - cov_target, - cov_target_score)[1] - - S = conv.approximate_grid_inference(observed_target, - cov_target, - cov_target_score, - alternatives=alternatives) - -def test_approx_pivot(n=500, - p=100, - signal_fac=1., - s=5, - sigma=2., - rho=0.4, - randomizer_scale=1., - equicorrelated=False, - useIP=False): +def test_inf(n=500, + p=100, + signal_fac=1., + s=5, + sigma=2., + rho=0.4, + randomizer_scale=1., + equicorrelated=False, + useIP=False, + CI=False): inst, const = gaussian_instance, lasso.gaussian signal = np.sqrt(signal_fac * 2 * np.log(p)) @@ -97,8 +43,8 @@ def test_approx_pivot(n=500, conv = const(X, Y, W, - ridge_term=0.) 
- #randomizer_scale=randomizer_scale * sigma_) + ridge_term=0., + randomizer_scale=randomizer_scale * sigma_) signs = conv.fit() nonzero = signs != 0 @@ -123,84 +69,17 @@ def test_approx_pivot(n=500, dispersion=dispersion, useIP=useIP) - pivot = approximate_grid_inf._approx_pivots(beta_target) - - return pivot - - -def test_approx_ci(n=500, - p=100, - signal_fac=1., - s=5, - sigma=2., - rho=0.4, - randomizer_scale=1., - level=0.9, - useIP=False): - - inst, const = gaussian_instance, lasso.gaussian - signal = np.sqrt(signal_fac * 2 * np.log(p)) - - X, Y, beta = inst(n=n, - p=p, - signal=signal, - s=s, - equicorrelated=False, - rho=rho, - sigma=sigma, - random_signs=True)[:3] - - n, p = X.shape - - sigma_ = np.std(Y) - dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) - - W = 1 * np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ - - conv = const(X, - Y, - W, - randomizer_scale=randomizer_scale * sigma_) - - signs = conv.fit() - nonzero = signs != 0 - - if nonzero.sum()>0: - - (observed_target, - cov_target, - regress_target_score, - dispersion, - alternatives) = selected_targets(conv.loglike, - conv._W, - nonzero, - dispersion=dispersion) - - result, inverse_info = conv.selective_MLE(observed_target, - cov_target, - regress_target_score, - dispersion)[:2] - - _scale = 4 * np.sqrt(np.diag(inverse_info)) - scale_ = np.max(_scale) - ngrid = int(2 * scale_/0.1) - - approximate_grid_inf = approximate_grid_inference(conv, - observed_target, - cov_target, - regress_target_score, - dispersion=dispersion, - useIP=useIP) - - lci, uci = approximate_grid_inf._approx_intervals(level) - - beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) - coverage = (lci < beta_target) * (uci > beta_target) - length = uci - lci - - return np.mean(coverage), np.mean(length) + if CI is False: + pivot = approximate_grid_inf._approx_pivots(beta_target) + return pivot + else: + lci, uci = approximate_grid_inf._approx_intervals(level=0.90) + beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) + coverage = (lci < beta_target) * (uci > beta_target) + length = uci - lci + return np.mean(coverage), np.mean(length) def main(nsim=300, CI = False): @@ -212,15 +91,16 @@ def main(nsim=300, CI = False): if CI is False: _pivot = [] for i in range(nsim): - _pivot.extend(test_approx_pivot(n=100, - p=400, - signal_fac=0.5, - s=5, - sigma=2., - rho=0.30, - randomizer_scale=1., - equicorrelated=True, - useIP=True)) + _pivot.extend(test_inf(n=100, + p=400, + signal_fac=0.5, + s=5, + sigma=2., + rho=0.30, + randomizer_scale=1., + equicorrelated=True, + useIP=True, + CI=False)) print("iteration completed ", i) @@ -235,13 +115,16 @@ def main(nsim=300, CI = False): coverage_ = 0. length_ = 0. for n in range(nsim): - cov, len = test_approx_ci(n=500, - p=100, - signal_fac=1., - s=5, - sigma=3., - rho=0.4, - randomizer_scale=1.) 
+ cov, len = test_inf(n=100, + p=400, + signal_fac=0.5, + s=5, + sigma=2., + rho=0.30, + randomizer_scale=1., + equicorrelated=True, + useIP=True, + CI=True) coverage_ += cov length_ += len diff --git a/selectinf/randomized/tests/test_exact_reference.py b/selectinf/randomized/tests/test_exact_reference.py index ef66cc963..74e2b272e 100644 --- a/selectinf/randomized/tests/test_exact_reference.py +++ b/selectinf/randomized/tests/test_exact_reference.py @@ -90,8 +90,8 @@ def main(nsim=300, CI = False): _pivot = [] for i in range(nsim): - _pivot.extend(test_inf(n=400, - p=100, + _pivot.extend(test_inf(n=100, + p=400, signal_fac=1., s=0, sigma=2., @@ -133,103 +133,3 @@ def main(nsim=300, CI = False): if __name__ == "__main__": main(nsim=500, CI=False) - -# def test_selected_instance(seedn, -# n=500, -# p=100, -# signal_fac=1., -# s=5, -# sigma=2., -# rho=0.4, -# randomizer_scale=1., -# equicorrelated=False): -# -# while True: -# np.random.seed(seedn) -# inst, const = gaussian_instance, lasso.gaussian -# signal = np.sqrt(signal_fac * 2 * np.log(p)) -# -# X, Y, beta = inst(n=n, -# p=p, -# signal=signal, -# s=s, -# equicorrelated=equicorrelated, -# rho=rho, -# sigma=sigma, -# random_signs=True)[:3] -# -# n, p = X.shape -# -# sigma_ = np.std(Y) -# -# if n > (2 * p): -# dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) -# else: -# dispersion = sigma_ ** 2 -# -# eps = np.random.standard_normal((n, 2000)) * Y.std() -# W = 0.7 * np.median(np.abs(X.T.dot(eps)).max(1)) -# -# conv = const(X, -# Y, -# W, -# ridge_term=0., -# randomizer_scale=randomizer_scale * np.sqrt(dispersion)) -# -# signs = conv.fit() -# nonzero = signs != 0 -# print("size of selected set ", nonzero.sum()) -# -# if nonzero.sum() > 0: -# beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) -# -# (observed_target, -# cov_target, -# regress_target_score, -# dispersion, -# alternatives) = selected_targets(conv.loglike, -# conv._W, -# nonzero, -# dispersion=dispersion) -# -# exact_grid_inf = exact_grid_inference(conv, -# observed_target, -# cov_target, -# regress_target_score, -# dispersion=dispersion) -# -# lci, uci = exact_grid_inf._intervals(level=0.90) -# coverage = (lci < beta_target) * (uci > beta_target) -# length = uci - lci -# mle_length = 1.65 * 2 * np.sqrt(np.diag(exact_grid_inf.inverse_info)) -# return coverage, length, mle_length -# -# def main(nsim =50): -# -# import pandas as pd -# column_names = ["Experiment Replicate", "Coverage", "Length-ER", "Length-MLE"] -# master_DF = pd.DataFrame(columns=column_names) -# DF = pd.DataFrame(columns=column_names) -# -# n, p, s = 500, 100, 5 -# for i in range(nsim): -# full_dispersion = True -# cov, len_er, len_mle = test_selected_instance(seedn=i, n=n, p=p, s=s, signal_fac=1.2) -# DF["Coverage"] = pd.Series(cov) -# DF["Length-ER"] = pd.Series(len_er) -# DF["Length-MLE"] = pd.Series(len_mle) -# DF["Experiment Replicate"] = pd.Series((i*np.ones(len(cov),int)).tolist()) -# -# master_DF = DF.append(master_DF, ignore_index=True) -# -# import os -# outpath = os.path.dirname(__file__) -# -# outfile_mse_html = os.path.join(outpath, "compare_er.html") -# outfile_mse_csv = os.path.join(outpath, "compare_er.csv") -# -# master_DF.to_html(outfile_mse_html, index=False) -# master_DF.to_csv(outfile_mse_csv, index=False) -# -# if __name__ == "__main__": -# main(nsim=10) \ No newline at end of file diff --git a/selectinf/randomized/tests/test_posterior.py b/selectinf/randomized/tests/test_posterior.py index 1d931d915..e7e410512 100644 --- 
a/selectinf/randomized/tests/test_posterior.py +++ b/selectinf/randomized/tests/test_posterior.py @@ -115,17 +115,19 @@ def test_instance(nsample=100, nburnin=50): M = E.copy() M[-3:] = 1 dispersion = np.linalg.norm(Y - X[:, M].dot(np.linalg.pinv(X[:, M]).dot(Y))) ** 2 / (n - M.sum()) + (observed_target, cov_target, - cov_target_score, - alternatives) = selected_targets(L.loglike, + regress_target_score, + dispersion, + alternatives)= selected_targets(L.loglike, L._W, M, dispersion=dispersion) posterior_inf = L.posterior(observed_target, cov_target, - cov_target_score, + regress_target_score, dispersion=dispersion) samples = langevin_sampler(posterior_inf, @@ -163,9 +165,11 @@ def test_flexible_prior1(nsample=100, nburnin=50): M = E.copy() M[-3:] = 1 dispersion = np.linalg.norm(Y - X[:, M].dot(np.linalg.pinv(X[:, M]).dot(Y))) ** 2 / (n - M.sum()) + (observed_target, cov_target, - cov_target_score, + regress_target_score, + dispersion, alternatives) = selected_targets(L.loglike, L._W, M, @@ -181,11 +185,13 @@ def prior(target_parameter): seed_state = np.random.get_state() np.random.set_state(seed_state) Z1 = np.random.standard_normal() + posterior_inf1 = L.posterior(observed_target, - cov_target, - cov_target_score, - dispersion=dispersion, - prior=prior) + cov_target, + regress_target_score, + dispersion=dispersion, + prior=prior) + W1 = np.random.standard_normal() samples1 = langevin_sampler(posterior_inf1, nsample=nsample, @@ -195,8 +201,9 @@ def prior(target_parameter): Z2 = np.random.standard_normal() posterior_inf2 = L.posterior(observed_target, cov_target, - cov_target_score, + regress_target_score, dispersion=dispersion) + W2 = np.random.standard_normal() samples2 = langevin_sampler(posterior_inf2, nsample=nsample, @@ -222,9 +229,11 @@ def test_flexible_prior2(nsample=1000, nburnin=50): M = E.copy() M[-3:] = 1 dispersion = np.linalg.norm(Y - X[:, M].dot(np.linalg.pinv(X[:, M]).dot(Y))) ** 2 / (n - M.sum()) + (observed_target, cov_target, - cov_target_score, + regress_target_score, + dispersion, alternatives) = selected_targets(L.loglike, L._W, M, @@ -238,10 +247,11 @@ def prior(target_parameter): return log_prior, grad_prior posterior_inf = L.posterior(observed_target, - cov_target, - cov_target_score, - dispersion=dispersion, - prior=prior) + cov_target, + regress_target_score, + dispersion=dispersion, + prior=prior) + adaptive_proposal = np.linalg.inv(np.linalg.inv(posterior_inf.inverse_info) + np.identity(posterior_inf.inverse_info.shape[0]) / 0.05 ** 2) samples = langevin_sampler(posterior_inf, @@ -285,7 +295,8 @@ def test_hiv_data(nsample=10000, (observed_target, cov_target, - cov_target_score, + regress_target_score, + dispersion, alternatives) = selected_targets(conv.loglike, conv._W, nonzero, @@ -293,17 +304,19 @@ def test_hiv_data(nsample=10000, mle, inverse_info = conv.selective_MLE(observed_target, cov_target, - cov_target_score, + regress_target_score, + dispersion, level=level, solve_args={'tol': 1.e-12})[:2] approx_inf = conv.approximate_grid_inference(observed_target, cov_target, - cov_target_score) + regress_target_score, + dispersion) posterior_inf = conv.posterior(observed_target, cov_target, - cov_target_score, + regress_target_score, dispersion=dispersion) samples_langevin = langevin_sampler(posterior_inf, diff --git a/selectinf/randomized/tests/test_selective_MLE_high.py b/selectinf/randomized/tests/test_selective_MLE_high.py index 5753ba668..0d9ea51ac 100644 --- a/selectinf/randomized/tests/test_selective_MLE_high.py +++ 
b/selectinf/randomized/tests/test_selective_MLE_high.py @@ -66,7 +66,8 @@ def test_full_targets(n=200, if n > p: (observed_target, cov_target, - cov_target_score, + regress_target_score, + dispersion, alternatives) = full_targets(conv.loglike, conv._W, nonzero, @@ -74,7 +75,8 @@ def test_full_targets(n=200, else: (observed_target, cov_target, - cov_target_score, + regress_target_score, + dispersion, alternatives) = debiased_targets(conv.loglike, conv._W, nonzero, @@ -83,7 +85,9 @@ def test_full_targets(n=200, result = conv.selective_MLE(observed_target, cov_target, - cov_target_score)[0] + regress_target_score, + dispersion)[0] + pval = result['pvalue'] estimate = result['MLE'] intervals = np.asarray(result[['lower_confidence', 'upper_confidence']]) @@ -144,7 +148,8 @@ def test_selected_targets(n=2000, (observed_target, cov_target, - cov_target_score, + regress_target_score, + dispersion, alternatives) = selected_targets(conv.loglike, conv._W, nonzero, @@ -152,8 +157,9 @@ def test_selected_targets(n=2000, result = conv.selective_MLE(observed_target, cov_target, - cov_target_score)[0] - estimate = result['MLE'] + regress_target_score, + dispersion)[0] + pval = result['pvalue'] intervals = np.asarray(result[['lower_confidence', 'upper_confidence']]) @@ -183,7 +189,8 @@ def test_instance(): dispersion = np.linalg.norm(Y - X[:, M].dot(np.linalg.pinv(X[:, M]).dot(Y))) ** 2 / (n - M.sum()) (observed_target, cov_target, - cov_target_score, + regress_target_score, + dispersion, alternatives) = selected_targets(L.loglike, L._W, M, @@ -193,16 +200,14 @@ def test_instance(): result = L.selective_MLE(observed_target, cov_target, - cov_target_score)[0] - estimate = result['MLE'] - pval = result['pvalue'] + regress_target_score, + dispersion)[0] + intervals = np.asarray(result[['lower_confidence', 'upper_confidence']]) beta_target = np.linalg.pinv(X[:, M]).dot(X.dot(beta)) coverage = (beta_target > intervals[:, 0]) * (beta_target < intervals[:, 1]) - print("observed_opt_state ", L.observed_opt_state) - # print("check ", np.asarray(result['MLE']), np.asarray(result['unbiased'])) return coverage @@ -219,7 +224,6 @@ def test_instance(): def test_selected_targets_disperse(n=500, p=100, - signal_fac=1., s=5, sigma=1., rho=0.4, @@ -242,10 +246,6 @@ def test_selected_targets_disperse(n=500, sigma=sigma, random_signs=True)[:3] - idx = np.arange(p) - sigmaX = rho ** np.abs(np.subtract.outer(idx, idx)) - print("snr", beta.T.dot(sigmaX).dot(beta) / ((sigma ** 2.) 
* n)) - n, p = X.shape sigma_ = np.std(Y) @@ -267,7 +267,8 @@ def test_selected_targets_disperse(n=500, (observed_target, cov_target, - cov_target_score, + regress_target_score, + dispersion, alternatives) = selected_targets(conv.loglike, conv._W, nonzero, @@ -275,7 +276,8 @@ def test_selected_targets_disperse(n=500, result = conv.selective_MLE(observed_target, cov_target, - cov_target_score)[0] + regress_target_score, + dispersion)[0] pval = result['pvalue'] intervals = np.asarray(result[['lower_confidence', 'upper_confidence']]) @@ -287,150 +289,30 @@ def test_selected_targets_disperse(n=500, return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0], coverage, intervals -# def main(nsim=500, full=False): -# P0, PA, cover, length_int = [], [], [], [] -# from statsmodels.distributions import ECDF -# -# n, p, s = 500, 100, 0 -# -# for i in range(nsim): -# if full: -# if n > p: -# full_dispersion = True -# else: -# full_dispersion = False -# p0, pA, cover_, intervals = test_full_targets(n=n, p=p, s=s, full_dispersion=full_dispersion) -# avg_length = intervals[:, 1] - intervals[:, 0] -# else: -# full_dispersion = True -# p0, pA, cover_, intervals = test_selected_targets(n=n, p=p, s=s, full_dispersion=full_dispersion) -# avg_length = intervals[:, 1] - intervals[:, 0] -# -# cover.extend(cover_) -# P0.extend(p0) -# PA.extend(pA) -# # print( -# # np.array(PA) < 0.1, np.mean(P0), np.std(P0), np.mean(np.array(P0) < 0.1), np.mean(np.array(PA) < 0.1), np.mean(cover), -# # np.mean(avg_length), 'null pvalue + power + length') -# print("coverage and lengths ", np.mean(cover), np.mean(avg_length)) - - -def test_mle_inference(seedn, - n=2000, - p=200, - signal_fac=1.2, - s=5, - sigma=2, - rho=0.7, - randomizer_scale=1., - full_dispersion=True, - full=False): - """ - Compare to R randomized lasso - """ +def main(nsim=500, full=False): + P0, PA, cover, length_int = [], [], [], [] + from statsmodels.distributions import ECDF - inst, const = gaussian_instance, lasso.gaussian - signal = np.sqrt(signal_fac * 2 * np.log(p)) + n, p, s = 500, 100, 0 - while True: - np.random.seed(seed=seedn) - X, Y, beta = inst(n=n, - p=p, - signal=signal, - s=s, - equicorrelated=True, - rho=rho, - sigma=sigma, - random_signs=True)[:3] - - idx = np.arange(p) - sigmaX = rho ** np.abs(np.subtract.outer(idx, idx)) - snr = beta.T.dot(sigmaX).dot(beta) / ((sigma ** 2.) * n) - print("snr", beta.T.dot(sigmaX).dot(beta) / ((sigma ** 2.) 
* n)) - - n, p = X.shape - - sigma_ = np.std(Y) - W = 0.8 * np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ - - conv = const(X, - Y, - W, - #ridge_term=0., - randomizer_scale=randomizer_scale * sigma_) - - signs = conv.fit() - nonzero = signs != 0 - print("dimensions", n, p, nonzero.sum()) - - if nonzero.sum() > 0: - dispersion = None - if full_dispersion: - dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) - - if full: - (observed_target, - cov_target, - cov_target_score, - dispersion, - alternatives) = full_targets(conv.loglike, - conv._W, - nonzero, - dispersion=dispersion) - - else: - (observed_target, - cov_target, - cov_target_score, - dispersion, - alternatives) = selected_targets(conv.loglike, - conv._W, - nonzero, - dispersion=dispersion) - - result = conv.selective_MLE(observed_target, - cov_target, - cov_target_score, - dispersion)[0] - - return result['MLE'], result['lower_confidence'], result['upper_confidence'], snr - -def main(nsim =50): - - import pandas as pd - column_names = ["Experiment Replicate", "MLE", "Lower Conf", "Upper Conf", "SNR"] - master_DF = pd.DataFrame(columns=column_names) - DF = pd.DataFrame(columns=column_names) - - n, p, s = 500, 100, 5 for i in range(nsim): - full_dispersion = True - mle, lower_conf, upper_conf, snr = test_mle_inference(seedn=i, - n=n, - p=p, - s=s, - signal_fac=1.2, - full_dispersion=full_dispersion, - full=True) - - DF["MLE"] = pd.Series(mle) - DF["Lower Conf"] = pd.Series(lower_conf) - DF["Upper Conf"] = pd.Series(upper_conf) - DF["Experiment Replicate"] = pd.Series((i*np.ones(len(mle),int)).tolist()) - DF["SNR"] = pd.Series((snr * np.ones(len(mle))).tolist()) - - master_DF = DF.append(master_DF, ignore_index=True) - - import os - outpath = os.path.dirname(__file__) - - outfile_mse_html = os.path.join(outpath, "compare_mle.html") - outfile_mse_csv = os.path.join(outpath, "compare_mle.csv") - - master_DF.to_html(outfile_mse_html, index=False) - master_DF.to_csv(outfile_mse_csv, index=False) - -if __name__ == "__main__": - main(nsim=50) - + if full: + if n > p: + full_dispersion = True + else: + full_dispersion = False + p0, pA, cover_, intervals = test_full_targets(n=n, p=p, s=s, full_dispersion=full_dispersion) + avg_length = intervals[:, 1] - intervals[:, 0] + else: + full_dispersion = True + p0, pA, cover_, intervals = test_selected_targets(n=n, p=p, s=s, full_dispersion=full_dispersion) + avg_length = intervals[:, 1] - intervals[:, 0] + + cover.extend(cover_) + P0.extend(p0) + PA.extend(pA) + # print( + # np.array(PA) < 0.1, np.mean(P0), np.std(P0), np.mean(np.array(P0) < 0.1), np.mean(np.array(PA) < 0.1), np.mean(cover), + # np.mean(avg_length), 'null pvalue + power + length') + print("coverage and lengths ", np.mean(cover), np.mean(avg_length)) From 40ac8ab9ba4db0a7a87e0c9fe7b278816ec73533 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 18 Aug 2021 12:01:43 -0700 Subject: [PATCH 133/187] delete some comments --- selectinf/randomized/query.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index 31909ac00..14740cb79 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -107,8 +107,6 @@ def _setup_sampler(self, offset, opt_linear, observed_subgrad, - # optional dispersion parameter - # for covariance of randomization dispersion=1): A, b = linear_part, offset @@ -161,12 +159,9 @@ def log_density(regress_opt, u, cond_prec, opt, score): # u == subgrad def _setup_implied_gaussian(self, 
opt_linear, observed_subgrad, - # optional dispersion parameter - # for covariance of randomization dispersion=1): cov_rand, prec = self.randomizer.cov_prec - prec = prec / dispersion # why do we do this here -- prec is just known if np.asarray(prec).shape in [(), (0,)]: _prod_score_prec_unnorm = self._hessian * prec From b57198500607d4bceba0fe5fcc8425b3e04e5da6 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 18 Aug 2021 13:45:37 -0700 Subject: [PATCH 134/187] suppressing dispersion to _setup_implied_gaussian; putting back in tests for logistic, poisson, cox --- selectinf/randomized/approx_reference.py | 6 +- selectinf/randomized/exact_reference.py | 6 +- selectinf/randomized/posterior_inference.py | 6 +- selectinf/randomized/query.py | 18 +- .../tests/test_selective_MLE_high.py | 591 +++++++++++++++++- 5 files changed, 576 insertions(+), 51 deletions(-) diff --git a/selectinf/randomized/approx_reference.py b/selectinf/randomized/approx_reference.py index 552fa177c..8e2b3009e 100644 --- a/selectinf/randomized/approx_reference.py +++ b/selectinf/randomized/approx_reference.py @@ -61,9 +61,9 @@ def __init__(self, self.opt_linear = opt_linear self.observed_score = observed_score - self.M1 = query.M1 * dispersion - self.M2 = query.M2 * (dispersion ** 2) - self.M3 = query.M3 * (dispersion ** 2) + self.M1 = query.M1 + self.M2 = query.M2 + self.M3 = query.M3 self.feasible_point = query.observed_opt_state self.cond_mean = query.cond_mean diff --git a/selectinf/randomized/exact_reference.py b/selectinf/randomized/exact_reference.py index df90eacec..a81894d8b 100644 --- a/selectinf/randomized/exact_reference.py +++ b/selectinf/randomized/exact_reference.py @@ -59,9 +59,9 @@ def __init__(self, self.opt_linear = opt_linear self.observed_score = observed_score - self.M1 = query.M1 * dispersion - self.M2 = query.M2 * (dispersion ** 2) - self.M3 = query.M3 * (dispersion ** 2) + self.M1 = query.M1 + self.M2 = query.M2 + self.M3 = query.M3 self.feasible_point = query.observed_opt_state self.cond_mean = query.cond_mean diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index 4395e4499..e1faacc54 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -66,9 +66,9 @@ def __init__(self, self.opt_linear = opt_linear self.observed_score = observed_score - self.M1 = query.M1 * dispersion - self.M2 = query.M2 * (dispersion ** 2) - self.M3 = query.M3 * (dispersion ** 2) + self.M1 = query.M1 + self.M2 = query.M2 + self.M3 = query.M3 self.feasible_point = query.observed_opt_state self.cond_mean = query.cond_mean diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index 86d6cdca6..26e0a297f 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -120,7 +120,8 @@ def _setup_sampler(self, M1, M2, M3) = self._setup_implied_gaussian(opt_linear, - observed_subgrad) + observed_subgrad, + dispersion=dispersion) def log_density(regress_opt, u, cond_prec, opt, score): # u == subgrad if score.ndim == 1: @@ -157,7 +158,8 @@ def log_density(regress_opt, u, cond_prec, opt, score): # u == subgrad def _setup_implied_gaussian(self, opt_linear, - observed_subgrad): + observed_subgrad, + dispersion=1): cov_rand, prec = self.randomizer.cov_prec @@ -179,9 +181,9 @@ def _setup_implied_gaussian(self, cond_mean = regress_opt.dot(self.observed_score_state + observed_subgrad) - M1 = prod_score_prec_unnorm - M2 = M1.dot(cov_rand).dot(M1.T) - M3 = 
M1.dot(opt_linear.dot(cond_cov).dot(opt_linear.T)).dot(M1.T) + M1 = prod_score_prec_unnorm * dispersion + M2 = M1.dot(cov_rand).dot(M1.T) * (dispersion**2) + M3 = M1.dot(opt_linear.dot(cond_cov).dot(opt_linear.T)).dot(M1.T) * (dispersion**2) self.M1 = M1 self.M2 = M2 @@ -978,9 +980,9 @@ def selective_MLE(self, Arguments passed to solver. """ - self.M1 = self.M1 * dispersion - self.M2 = self.M2 * (dispersion**2) - self.M3 = self.M3 * (dispersion**2) + # self.M1 = self.M1 * dispersion + # self.M2 = self.M2 * (dispersion**2) + # self.M3 = self.M3 * (dispersion**2) return selective_MLE(observed_target, cov_target, diff --git a/selectinf/randomized/tests/test_selective_MLE_high.py b/selectinf/randomized/tests/test_selective_MLE_high.py index 0d9ea51ac..3ece533c2 100644 --- a/selectinf/randomized/tests/test_selective_MLE_high.py +++ b/selectinf/randomized/tests/test_selective_MLE_high.py @@ -187,6 +187,7 @@ def test_instance(): M[-3:] = 1 print("check ", M) dispersion = np.linalg.norm(Y - X[:, M].dot(np.linalg.pinv(X[:, M]).dot(Y))) ** 2 / (n - M.sum()) + (observed_target, cov_target, regress_target_score, @@ -212,16 +213,6 @@ def test_instance(): return coverage -# def main(nsim=500): -# -# cover = [] -# for i in range(nsim): -# -# cover_ = test_instance() -# cover.extend(cover_) -# print(np.mean(cover), 'coverage so far ') - - def test_selected_targets_disperse(n=500, p=100, s=5, @@ -289,30 +280,562 @@ def test_selected_targets_disperse(n=500, return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0], coverage, intervals -def main(nsim=500, full=False): - P0, PA, cover, length_int = [], [], [], [] - from statsmodels.distributions import ECDF +def test_logistic(n=2000, + p=200, + signal_fac=10., + s=5, + rho=0.4, + randomizer_scale=1): + """ + Run approx MLE with selected targets on binomial data + """ - n, p, s = 500, 100, 0 + inst, const = logistic_instance, lasso.logistic + signal = np.sqrt(signal_fac * 2 * np.log(p)) - for i in range(nsim): - if full: - if n > p: - full_dispersion = True - else: - full_dispersion = False - p0, pA, cover_, intervals = test_full_targets(n=n, p=p, s=s, full_dispersion=full_dispersion) - avg_length = intervals[:, 1] - intervals[:, 0] - else: - full_dispersion = True - p0, pA, cover_, intervals = test_selected_targets(n=n, p=p, s=s, full_dispersion=full_dispersion) - avg_length = intervals[:, 1] - intervals[:, 0] - - cover.extend(cover_) - P0.extend(p0) - PA.extend(pA) - # print( - # np.array(PA) < 0.1, np.mean(P0), np.std(P0), np.mean(np.array(P0) < 0.1), np.mean(np.array(PA) < 0.1), np.mean(cover), - # np.mean(avg_length), 'null pvalue + power + length') - print("coverage and lengths ", np.mean(cover), np.mean(avg_length)) + while True: + X, Y, beta = inst(n=n, + p=p, + signal=signal, + s=s, + equicorrelated=False, + rho=rho, + random_signs=True)[:3] + + n, p = X.shape + + sigma_ = np.std(Y) + W = np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ + + conv = const(X, + Y, + W, + randomizer_scale=randomizer_scale * sigma_) + + signs = conv.fit() + nonzero = signs != 0 + print("dimensions", n, p, nonzero.sum()) + + if nonzero.sum() > 0: + + (observed_target, + cov_target, + cov_target_score, + dispersion, + alternatives) = selected_targets(conv.loglike, + conv._W, + nonzero, + dispersion=1) + + result = conv.selective_MLE(observed_target, + cov_target, + cov_target_score)[0] + estimate = result['MLE'] + pval = result['pvalue'] + intervals = np.asarray(result[['lower_confidence', + 'upper_confidence']]) + + return pval[beta[nonzero] == 0], pval[beta[nonzero] 
!= 0], intervals + +def test_logistic_split(n=2000, + p=200, + signal_fac=10., + s=5, + rho=0.4, + randomizer_scale=1): + """ + Run approx MLE with selected targets on binomial data with data splitting + """ + + inst, const = logistic_instance, split_lasso.logistic + signal = np.sqrt(signal_fac * 2 * np.log(p)) + + while True: + X, Y, beta = inst(n=n, + p=p, + signal=signal, + s=s, + equicorrelated=False, + rho=rho, + random_signs=True)[:3] + + n, p = X.shape + + sigma_ = np.std(Y) + W = np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ + + conv = const(X, + Y, + W, + proportion=0.7) + + signs = conv.fit() + nonzero = signs != 0 + print("dimensions", n, p, nonzero.sum()) + + if nonzero.sum() > 0: + + (observed_target, + cov_target, + cov_target_score, + dispersion, + alternatives) = selected_targets(conv.loglike, + conv._W, + nonzero, + dispersion=1) + + result = conv.selective_MLE(observed_target, + cov_target, + cov_target_score)[0] + estimate = result['MLE'] + pval = result['pvalue'] + intervals = np.asarray(result[['lower_confidence', + 'upper_confidence']]) + + return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0], intervals + +def test_poisson(n=2000, + p=200, + signal_fac=10., + s=5, + rho=0.4, + randomizer_scale=1): + """ + Run approx MLE with selected targets on Poisson data + """ + + inst, const = poisson_instance, lasso.poisson + signal = np.sqrt(signal_fac * 2 * np.log(p)) + + while True: + X, Y, beta = inst(n=n, + p=p, + signal=signal, + s=s, + equicorrelated=False, + rho=rho, + random_signs=True)[:3] + + n, p = X.shape + + sigma_ = np.std(Y) + W = np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ + + conv = const(X, + Y, + W, + randomizer_scale=randomizer_scale * sigma_) + + signs = conv.fit() + nonzero = signs != 0 + print("dimensions", n, p, nonzero.sum()) + + if nonzero.sum() > 0: + + (observed_target, + cov_target, + cov_target_score, + dispersion, + alternatives) = selected_targets(conv.loglike, + conv._W, + nonzero, + dispersion=1) + + result = conv.selective_MLE(observed_target, + cov_target, + cov_target_score)[0] + estimate = result['MLE'] + pval = result['pvalue'] + intervals = np.asarray(result[['lower_confidence', + 'upper_confidence']]) + + return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0], intervals + +def test_poisson_split(n=2000, + p=200, + signal_fac=10., + s=5, + rho=0.4, + randomizer_scale=1): + """ + Run approx MLE with selected targets on Poisson data with data splitting + """ + + inst, const = poisson_instance, split_lasso.poisson + signal = np.sqrt(signal_fac * 2 * np.log(p)) + + while True: + X, Y, beta = inst(n=n, + p=p, + signal=signal, + s=s, + equicorrelated=False, + rho=rho, + random_signs=True)[:3] + + n, p = X.shape + + sigma_ = np.std(Y) + W = np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ + + conv = const(X, + Y, + W, + proportion=0.7) + + signs = conv.fit() + nonzero = signs != 0 + print("dimensions", n, p, nonzero.sum()) + + if nonzero.sum() > 0: + + (observed_target, + cov_target, + cov_target_score, + dispersion, + alternatives) = selected_targets(conv.loglike, + conv._W, + nonzero, + dispersion=1) + + result = conv.selective_MLE(observed_target, + cov_target, + cov_target_score)[0] + estimate = result['MLE'] + pval = result['pvalue'] + intervals = np.asarray(result[['lower_confidence', + 'upper_confidence']]) + + return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0], intervals + +def test_cox(n=2000, + p=200, + signal_fac=10., + s=5, + rho=0.4, + randomizer_scale=1): + """ + Run approx MLE with selected 
targets on survival data + """ + + inst, const = cox_instance, lasso.coxph + signal = np.sqrt(signal_fac * 2 * np.log(p)) + + while True: + X, T, S, beta = inst(n=n, + p=p, + signal=signal, + s=s, + equicorrelated=False, + rho=rho, + random_signs=True)[:4] + + n, p = X.shape + + W = np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) + + conv = const(X, + T, + S, + W, + randomizer_scale=randomizer_scale) + + signs = conv.fit() + nonzero = signs != 0 + print("dimensions", n, p, nonzero.sum()) + + if nonzero.sum() > 0: + + cox_full = rr.glm.cox(X, T, S) + full_hess = cox_full.hessian(conv.observed_soln) + + (observed_target, + cov_target, + cov_target_score, + dispersion, + alternatives) = selected_targets(conv.loglike, + None, + nonzero, + hessian=full_hess, + dispersion=1) + + result = conv.selective_MLE(observed_target, + cov_target, + cov_target_score)[0] + estimate = result['MLE'] + pval = result['pvalue'] + intervals = np.asarray(result[['lower_confidence', + 'upper_confidence']]) + + return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0], intervals + +def test_cox_split(n=2000, + p=200, + signal_fac=10., + s=5, + rho=0.4, + randomizer_scale=1): + """ + Run approx MLE with selected targets on survival data with data splitting + """ + + inst, const = cox_instance, split_lasso.coxph + signal = np.sqrt(signal_fac * 2 * np.log(p)) + + while True: + X, T, S, beta = inst(n=n, + p=p, + signal=signal, + s=s, + equicorrelated=False, + rho=rho, + random_signs=True)[:4] + + n, p = X.shape + + W = np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) + + conv = const(X, + T, + S, + W, + proportion=0.7) + + signs = conv.fit() + nonzero = signs != 0 + print("dimensions", n, p, nonzero.sum()) + + if nonzero.sum() > 0: + + cox_full = rr.glm.cox(X, T, S) + full_hess = cox_full.hessian(conv.observed_soln) + + (observed_target, + cov_target, + cov_target_score, + dispersion, + alternatives) = selected_targets(conv.loglike, + None, + nonzero, + hessian=full_hess, + dispersion=1) + + result = conv.selective_MLE(observed_target, + cov_target, + cov_target_score)[0] + estimate = result['MLE'] + pval = result['pvalue'] + intervals = np.asarray(result[['lower_confidence', + 'upper_confidence']]) + + return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0], intervals + +def test_scale_invariant_split(n=200, + p=20, + signal_fac=10., + s=5, + sigma=3, + rho=0.4, + randomizer_scale=1, + full_dispersion=True, + seed=2): + """ + Confirm Gaussian version is appropriately scale invariant with data splitting + """ + + inst, const = gaussian_instance, split_lasso.gaussian + signal = np.sqrt(signal_fac * 2 * np.log(p)) + + results = [] + + scales = [1, 5] + for scale in scales: + + np.random.seed(seed) + X, Y, beta = inst(n=n, + p=p, + signal=signal, + s=s, + equicorrelated=False, + rho=rho, + sigma=sigma, + random_signs=True)[:3] + + Y *= scale; beta *= scale + n, p = X.shape + + sigma_ = np.std(Y) + W = np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ + print('W', W[0]/scale) + conv = const(X, + Y, + W, + proportion=0.7) + + signs = conv.fit() + nonzero = signs != 0 + print('nonzero', np.where(nonzero)[0]) + print('feature_weights', conv.feature_weights[0] / scale) + dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) + + (observed_target, + cov_target, + cov_target_score, + dispersion, + alternatives) = selected_targets(conv.loglike, + conv._W, + nonzero, + dispersion=dispersion) + + print('dispersion', dispersion/scale**2) + print('target', observed_target[0]/scale) + print('cov_target', 
cov_target[0,0]/scale**2) + print('cov_target_score', cov_target_score[0,0]/scale**2) + + result = conv.selective_MLE(observed_target, + cov_target, + cov_target_score)[0] + + print(result['MLE'] / scale) + results.append(result) + + assert np.allclose(results[0]['MLE'] / scales[0], + results[1]['MLE'] / scales[1]) + assert np.allclose(results[0]['SE'] / scales[0], + results[1]['SE'] / scales[1]) + assert np.allclose(results[0]['upper_confidence'] / scales[0], + results[1]['upper_confidence'] / scales[1]) + assert np.allclose(results[0]['lower_confidence'] / scales[0], + results[1]['lower_confidence'] / scales[1]) + assert np.allclose(results[0]['Zvalue'], + results[1]['Zvalue']) + assert np.allclose(results[0]['pvalue'], + results[1]['pvalue']) + +def test_scale_invariant(n=200, + p=20, + signal_fac=10., + s=5, + sigma=3, + rho=0.4, + randomizer_scale=1, + full_dispersion=True, + seed=2): + """ + Confirm Gaussian version is appropriately scale invariant + """ + + inst, const = gaussian_instance, lasso.gaussian + signal = np.sqrt(signal_fac * 2 * np.log(p)) + + results = [] + + scales = [1, 5] + for scale in scales: + + np.random.seed(seed) + X, Y, beta = inst(n=n, + p=p, + signal=signal, + s=s, + equicorrelated=False, + rho=rho, + sigma=sigma, + random_signs=True)[:3] + + Y *= scale; beta *= scale + n, p = X.shape + + sigma_ = np.std(Y) + W = np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_ + print('W', W[0]/scale) + conv = const(X, + Y, + W, + randomizer_scale=randomizer_scale * sigma_) + signs = conv.fit() + nonzero = signs != 0 + print('nonzero', np.where(nonzero)[0]) + print('feature_weights', conv.feature_weights[0] / scale) + print('perturb', conv._initial_omega[0] / scale) + dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) + + (observed_target, + cov_target, + cov_target_score, + dispersion, + alternatives) = selected_targets(conv.loglike, + conv._W, + nonzero, + dispersion=dispersion) + + print('dispersion', dispersion/scale**2) + print('target', observed_target[0]/scale) + print('cov_target', cov_target[0,0]/scale**2) + print('cov_target_score', cov_target_score[0,0]/scale**2) + + result = conv.selective_MLE(observed_target, + cov_target, + cov_target_score)[0] + + print(result['MLE'] / scale) + results.append(result) + + assert np.allclose(results[0]['MLE'] / scales[0], + results[1]['MLE'] / scales[1]) + assert np.allclose(results[0]['SE'] / scales[0], + results[1]['SE'] / scales[1]) + assert np.allclose(results[0]['upper_confidence'] / scales[0], + results[1]['upper_confidence'] / scales[1]) + assert np.allclose(results[0]['lower_confidence'] / scales[0], + results[1]['lower_confidence'] / scales[1]) + assert np.allclose(results[0]['Zvalue'], + results[1]['Zvalue']) + assert np.allclose(results[0]['pvalue'], + results[1]['pvalue']) + + +def test_instance(): + n, p, s = 500, 100, 5 + X = np.random.standard_normal((n, p)) + beta = np.zeros(p) + beta[:s] = np.sqrt(2 * np.log(p) / n) + Y = X.dot(beta) + np.random.standard_normal(n) + + scale_ = np.std(Y) + # uses noise of variance n * scale_ / 4 by default + L = lasso.gaussian(X, Y, 3 * scale_ * np.sqrt(2 * np.log(p) * np.sqrt(n))) + signs = L.fit() + E = (signs != 0) + + M = E.copy() + M[-3:] = 1 + print("check ", M) + dispersion = np.linalg.norm(Y - X[:, M].dot(np.linalg.pinv(X[:, M]).dot(Y))) ** 2 / (n - M.sum()) + (observed_target, + cov_target, + cov_target_score, + dispersion, + alternatives) = selected_targets(L.loglike, + L._W, + M, + dispersion=dispersion) + + print("check shapes", 
observed_target.shape, E.sum()) + + result = L.selective_MLE(observed_target, + cov_target, + cov_target_score)[0] + estimate = result['MLE'] + pval = result['pvalue'] + intervals = np.asarray(result[['lower_confidence', + 'upper_confidence']]) + + beta_target = np.linalg.pinv(X[:, M]).dot(X.dot(beta)) + + coverage = (beta_target > intervals[:, 0]) * (beta_target < intervals[:, 1]) + print("observed_opt_state ", L.observed_opt_state) + # print("check ", np.asarray(result['MLE']), np.asarray(result['unbiased'])) + + return coverage From f580e93dca6a0bdc62d61f93e58fbfc33704bb91 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 18 Aug 2021 13:47:29 -0700 Subject: [PATCH 135/187] removing dispersion from selective mle --- selectinf/randomized/query.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index 26e0a297f..500e64d48 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -301,7 +301,6 @@ def selective_MLE(self, observed_target, cov_target, regress_target_score, - dispersion=1, level=0.9, solve_args={'tol': 1.e-12}): """ @@ -323,7 +322,7 @@ def selective_MLE(self, cov_target, regress_target_score, self.observed_opt_state, - dispersion=dispersion, +# dispersion=dispersion, level=level, solve_args=solve_args) @@ -958,7 +957,6 @@ def selective_MLE(self, # used as a feasible point. # precise value used only for independent estimator observed_soln, - dispersion=1, solve_args={'tol': 1.e-12}, level=0.9): """ From e578753dcbdd5f53ee40a8b7a9d099cb75f92208 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 18 Aug 2021 13:53:41 -0700 Subject: [PATCH 136/187] removing dispersion where possible --- selectinf/randomized/approx_reference.py | 4 +- selectinf/randomized/exact_reference.py | 4 +- selectinf/randomized/posterior_inference.py | 5 +- .../randomized/tests/test_approx_reference.py | 51 ------------------- .../randomized/tests/test_exact_reference.py | 3 +- 5 files changed, 5 insertions(+), 62 deletions(-) diff --git a/selectinf/randomized/approx_reference.py b/selectinf/randomized/approx_reference.py index 8e2b3009e..5b1e43c19 100644 --- a/selectinf/randomized/approx_reference.py +++ b/selectinf/randomized/approx_reference.py @@ -14,7 +14,6 @@ def __init__(self, observed_target, cov_target, regress_target_score, - dispersion=1, solve_args={'tol': 1.e-12}, useIP=False): @@ -47,8 +46,7 @@ def __init__(self, result, inverse_info, log_ref = query.selective_MLE(observed_target, cov_target, - regress_target_score, - dispersion) + regress_target_score) cond_cov = query.cond_cov self.cond_precision = np.linalg.inv(cond_cov) diff --git a/selectinf/randomized/exact_reference.py b/selectinf/randomized/exact_reference.py index a81894d8b..9facaa7fe 100644 --- a/selectinf/randomized/exact_reference.py +++ b/selectinf/randomized/exact_reference.py @@ -13,7 +13,6 @@ def __init__(self, observed_target, cov_target, regress_target_score, - dispersion, solve_args={'tol': 1.e-12}): """ @@ -45,8 +44,7 @@ def __init__(self, result, inverse_info, log_ref = query.selective_MLE(observed_target, cov_target, - regress_target_score, - dispersion) + regress_target_score) cond_cov = query.cond_cov self.cond_precision = np.linalg.inv(cond_cov) diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index e1faacc54..bbab9bd5d 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -47,8 +47,7 @@ def 
__init__(self, result, self.inverse_info, log_ref = query.selective_MLE(observed_target, cov_target, - regress_target_score, - dispersion) + regress_target_score) ### Note for an informative prior we might want to change this... @@ -76,7 +75,7 @@ def __init__(self, self.offset = offset self.initial_estimate = np.asarray(result['MLE']) - self.dispersion = dispersion + self.dispersion = dispersion # why is this needed? self.log_ref = log_ref self._set_marginal_parameters() diff --git a/selectinf/randomized/tests/test_approx_reference.py b/selectinf/randomized/tests/test_approx_reference.py index 80c6c3776..1b08b2235 100644 --- a/selectinf/randomized/tests/test_approx_reference.py +++ b/selectinf/randomized/tests/test_approx_reference.py @@ -66,7 +66,6 @@ def test_inf(n=500, observed_target, cov_target, regress_target_score, - dispersion=dispersion, useIP=useIP) if CI is False: @@ -81,54 +80,4 @@ def test_inf(n=500, return np.mean(coverage), np.mean(length) -def main(nsim=300, CI = False): - - import matplotlib as mpl - mpl.use('tkagg') - import matplotlib.pyplot as plt - from statsmodels.distributions.empirical_distribution import ECDF - - if CI is False: - _pivot = [] - for i in range(nsim): - _pivot.extend(test_inf(n=100, - p=400, - signal_fac=0.5, - s=5, - sigma=2., - rho=0.30, - randomizer_scale=1., - equicorrelated=True, - useIP=True, - CI=False)) - - print("iteration completed ", i) - - plt.clf() - ecdf_MLE = ECDF(np.asarray(_pivot)) - grid = np.linspace(0, 1, 101) - plt.plot(grid, ecdf_MLE(grid), c='blue', marker='^') - plt.plot(grid, grid, 'k--') - plt.show() - - if CI is True: - coverage_ = 0. - length_ = 0. - for n in range(nsim): - cov, len = test_inf(n=100, - p=400, - signal_fac=0.5, - s=5, - sigma=2., - rho=0.30, - randomizer_scale=1., - equicorrelated=True, - useIP=True, - CI=True) - - coverage_ += cov - length_ += len - print("coverage so far ", coverage_ / (n + 1.)) - print("lengths so far ", length_ / (n + 1.)) - print("iteration completed ", n + 1) diff --git a/selectinf/randomized/tests/test_exact_reference.py b/selectinf/randomized/tests/test_exact_reference.py index ce4a41c2d..d8a1e180e 100644 --- a/selectinf/randomized/tests/test_exact_reference.py +++ b/selectinf/randomized/tests/test_exact_reference.py @@ -65,8 +65,7 @@ def test_inf(n=500, exact_grid_inf = exact_grid_inference(conv, observed_target, cov_target, - regress_target_score, - dispersion=dispersion) + regress_target_score) if CI is False: pivot = exact_grid_inf._pivots(beta_target) From 51918b2d7f09dbb06134de972e914b7e52021314 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 18 Aug 2021 16:17:48 -0700 Subject: [PATCH 137/187] WIP: moving targets to base module; fixing slope and screening --- selectinf/base.py | 213 ++++++++++++++++++ selectinf/randomized/lasso.py | 209 ++--------------- selectinf/randomized/modelQ.py | 2 + selectinf/randomized/query.py | 18 +- selectinf/randomized/screening.py | 18 +- selectinf/randomized/slope.py | 11 +- .../randomized/tests/test_exact_reference.py | 6 +- selectinf/randomized/tests/test_lasso.py | 119 +++++----- .../tests/test_marginal_screening.py | 3 + selectinf/randomized/tests/test_modelQ.py | 2 +- selectinf/randomized/tests/test_posterior.py | 25 +- .../tests/test_selective_MLE_high.py | 88 ++------ selectinf/randomized/tests/test_slope.py | 23 +- selectinf/randomized/tests/test_topK.py | 4 + 14 files changed, 364 insertions(+), 377 deletions(-) diff --git a/selectinf/base.py b/selectinf/base.py index dc6db4230..c6ee4ac46 100644 --- a/selectinf/base.py +++ 
b/selectinf/base.py @@ -1,6 +1,11 @@ +import numpy as np + import regreg.api as rr import regreg.affine as ra +from .algorithms.debiased_lasso import (debiasing_matrix, + pseudoinverse_debiasing_matrix) + def restricted_estimator(loss, active, solve_args={'min_its':50, 'tol':1.e-10}): """ Fit a restricted model using only columns `active`. @@ -35,3 +40,211 @@ def restricted_estimator(loss, active, solve_args={'min_its':50, 'tol':1.e-10}): beta_E = loss_restricted.solve(**solve_args) return beta_E + + +# functions construct targets of inference +# and covariance with score representation + +def selected_targets(loglike, + solution, + features=None, + sign_info={}, + dispersion=None, + solve_args={'tol': 1.e-12, 'min_its': 100}, + hessian=None): + + if features is None: + features = solution != 0 + + X, y = loglike.data + n, p = X.shape + + observed_target = restricted_estimator(loglike, features, solve_args=solve_args) + linpred = X[:, features].dot(observed_target) + + Hfeat = _compute_hessian(loglike, + solution, + features)[1] + Qfeat = Hfeat[features] + _score_linear = -Hfeat + + cov_target = np.linalg.inv(Qfeat) + crosscov_target_score = _score_linear.dot(cov_target) + alternatives = ['twosided'] * features.sum() + features_idx = np.arange(p)[features] + + for i in range(len(alternatives)): + if features_idx[i] in sign_info.keys(): + alternatives[i] = sign_info[features_idx[i]] + + if dispersion is None: # use Pearson's X^2 + dispersion = _pearsonX2(y, + linpred, + loglike, + observed_target.shape[0]) + + regress_target_score = np.zeros((cov_target.shape[0], p)) + regress_target_score[:,features] = cov_target + return observed_target, cov_target * dispersion, regress_target_score, dispersion, alternatives + +def full_targets(loglike, + solution, + features=None, + dispersion=None, + solve_args={'tol': 1.e-12, 'min_its': 50}, + hessian=None): + + if features is None: + features = solution != 0 + + X, y = loglike.data + n, p = X.shape + features_bool = np.zeros(p, np.bool) + features_bool[features] = True + features = features_bool + + # target is one-step estimator + + full_estimator = loglike.solve(**solve_args) + linpred = X.dot(full_estimator) + Qfull = _compute_hessian(loglike, + full_estimator) + + Qfull_inv = np.linalg.inv(Qfull) + cov_target = Qfull_inv[features][:, features] + observed_target = full_estimator[features] + crosscov_target_score = np.zeros((p, cov_target.shape[0])) + crosscov_target_score[features] = -np.identity(cov_target.shape[0]) + + if dispersion is None: # use Pearson's X^2 + dispersion = _pearsonX2(y, + linpred, + loglike, + p) + + alternatives = ['twosided'] * features.sum() + regress_target_score = Qfull_inv[features] # weights missing? 
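# A brief aside on the Pearson X^2 dispersion used a few lines above: for a
# Gaussian log-likelihood the hessian weights are identically one and the mean
# function is the identity, so the estimate reduces to the ordinary residual
# variance. A minimal sketch, assuming a design X, response y and the full
# least-squares fit (hypothetical names, illustrative only):
#
#     resid = y - X.dot(np.linalg.pinv(X).dot(y))
#     dispersion = (resid ** 2).sum() / (n - p)
#
# which agrees with the full_dispersion computation used in the tests of this
# series.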
+ return observed_target, cov_target * dispersion, regress_target_score, dispersion, alternatives + +def debiased_targets(loglike, + solution, + features=None, + sign_info={}, + penalty=None, #required kwarg + dispersion=None, + approximate_inverse='JM', + debiasing_args={}): + + if features is None: + features = solution != 0 + + if penalty is None: + raise ValueError('require penalty for consistent estimator') + + X, y = loglike.data + n, p = X.shape + features_bool = np.zeros(p, np.bool) + features_bool[features] = True + features = features_bool + + # relevant rows of approximate inverse + + linpred = X.dot(solution) + W = loglike.saturated_loss.hessian(linpred) + if approximate_inverse == 'JM': + Qinv_hat = np.atleast_2d(debiasing_matrix(X * np.sqrt(W)[:, None], + np.nonzero(features)[0], + **debiasing_args)) / n + else: + Qinv_hat = np.atleast_2d(pseudoinverse_debiasing_matrix(X * np.sqrt(W)[:, None], + np.nonzero(features)[0], + **debiasing_args)) + + problem = rr.simple_problem(loglike, penalty) + nonrand_soln = problem.solve() + G_nonrand = loglike.smooth_objective(nonrand_soln, 'grad') + + observed_target = nonrand_soln[features] - Qinv_hat.dot(G_nonrand) + + Qfull, Qrelax = _compute_hessian(loglike, + solution, + features) + + if p > n: + M1 = Qinv_hat.dot(X.T) + cov_target = (M1 * W[None, :]).dot(M1.T) + crosscov_target_score = -(M1 * W[None, :]).dot(X).T + else: + Qfull = X.T.dot(W[:, None] * X) + cov_target = Qinv_hat.dot(Qfull.dot(Qinv_hat.T)) + crosscov_target_score = -Qinv_hat.dot(Qfull).T + + if dispersion is None: # use Pearson's X^2 + relaxed_soln = nonrand_soln[features] - np.linalg.inv(Qrelax[features]).dot(G_nonrand[features]) + Xfeat = X[:, features] + linpred = Xfeat.dot(relaxed_soln) + dispersion = _pearsonX2(y, + linpred, + loglike, + features.sum()) + + alternatives = ['twosided'] * features.sum() + return observed_target, cov_target * dispersion, Qinv_hat, dispersion, alternatives + +def form_targets(target, + loglike, + solution, + features, + **kwargs): + _target = {'full':full_targets, + 'selected':selected_targets, + 'debiased':debiased_targets}[target] + return _target(loglike, + solution, + features, + **kwargs) + +def _compute_hessian(loglike, + beta_bar, + *bool_indices): + + X, y = loglike.data + linpred = X.dot(beta_bar) + n = linpred.shape[0] + + if hasattr(loglike.saturated_loss, "hessian"): # a GLM -- all we need is W + W = loglike.saturated_loss.hessian(linpred) + parts = [np.dot(X.T, X[:, bool_idx] * W[:, None]) for bool_idx in bool_indices] + _hessian = np.dot(X.T, X * W[:, None]) # CAREFUL -- this will be big + elif hasattr(loglike.saturated_loss, "hessian_mult"): + parts = [] + for bool_idx in bool_indices: + _right = np.zeros((n, bool_idx.sum())) + for i, j in enumerate(np.nonzero(bool_idx)[0]): + _right[:,i] = loglike.saturated_loss.hessian_mult(linpred, + X[:,j], + case_weights=loglike.saturated_loss.case_weights) + parts.append(X.T.dot(_right)) + _hessian = np.zeros_like(X) + for i in range(X.shape[1]): + _hessian[:,i] = loglike.saturated_loss.hessian_mult(linpred, + X[:,i], + case_weights=loglike.saturated_loss.case_weights) + _hessian = X.T.dot(_hessian) + else: + raise ValueError('saturated_loss has no hessian or hessian_mult method') + + if bool_indices: + return (_hessian,) + tuple(parts) + else: + return _hessian + +def _pearsonX2(y, + linpred, + loglike, + df_fit): + + W = loglike.saturated_loss.hessian(linpred) + n = y.shape[0] + resid = y - loglike.saturated_loss.mean_function(linpred) + return (resid ** 2 / W).sum() / (n - 
df_fit) diff --git a/selectinf/randomized/lasso.py b/selectinf/randomized/lasso.py index 674f85a62..ce4062033 100644 --- a/selectinf/randomized/lasso.py +++ b/selectinf/randomized/lasso.py @@ -12,9 +12,9 @@ from .query import gaussian_query from .randomization import randomization -from ..base import restricted_estimator -from ..algorithms.debiased_lasso import (debiasing_matrix, - pseudoinverse_debiasing_matrix) +from ..base import (restricted_estimator, + _compute_hessian, + _pearsonX2) #### High dimensional version #### - parametric covariance @@ -143,6 +143,8 @@ def fit(self, self._overall, solve_args=solve_args) + # \bar{\beta}_{E \cup U} piece -- the unpenalized M estimator + beta_bar = np.zeros(p) beta_bar[overall] = _beta_unpenalized self._beta_full = beta_bar @@ -156,42 +158,18 @@ def fit(self, # U for unpenalized # -E for inactive + # compute part of hessian + + _hessian, _hessian_active, _hessian_unpen = _compute_hessian(self.loglike, + beta_bar, + active, + unpenalized) + + # fill in pieces of query + opt_linear = np.zeros((p, num_opt_var)) _score_linear_term = np.zeros((p, num_opt_var)) - # \bar{\beta}_{E \cup U} piece -- the unpenalized M estimator - - X, y = self.loglike.data - linpred = X.dot(beta_bar) - n = linpred.shape[0] - - if hasattr(self.loglike.saturated_loss, "hessian"): # a GLM -- all we need is W - W = self._W = self.loglike.saturated_loss.hessian(linpred) - _hessian_active = np.dot(X.T, X[:, active] * W[:, None]) - _hessian_unpen = np.dot(X.T, X[:, unpenalized] * W[:, None]) - _hessian = np.dot(X.T, X * W[:, None]) # CAREFUL -- this will be big - elif hasattr(self.loglike.saturated_loss, "hessian_mult"): - active_right = np.zeros((n, active.sum())) - for i, j in enumerate(np.nonzero(active)[0]): - active_right[:,i] = self.loglike.saturated_loss.hessian_mult(linpred, - X[:,j], - case_weights=self.loglike.saturated_loss.case_weights) - unpen_right = np.zeros((n, unpenalized.sum())) - for i, j in enumerate(np.nonzero(unpenalized)[0]): - unpen_right[:,i] = self.loglike.saturated_loss.hessian_mult(linpred, - X[:,j], - case_weights=self.loglike.saturated_loss.case_weights) - _hessian_active = X.T.dot(active_right) - _hessian_unpen = X.T.dot(unpen_right) - _hessian = [] - for i in range(p): - _hessian.append(self.loglike.saturated_loss.hessian_mult(linpred, - X[:,i], - case_weights=self.loglike.saturated_loss.case_weights)) - _hessian = X.T.dot(np.array(_hessian).T) - else: - raise ValueError('saturated_loss has no hessian or hessian_mult method') - _score_linear_term = -np.hstack([_hessian_active, _hessian_unpen]) # set the observed score (data dependent) state @@ -249,7 +227,7 @@ def signed_basis_vector(p, j, s): #### to be fixed -- set the cov_score here without dispersion - self._hessian = _hessian + self._unscaled_cov_score = _hessian ##### @@ -699,151 +677,6 @@ def sqrt_lasso(X, return obj -# private functions - -# functions construct targets of inference -# and covariance with score representation - -def selected_targets(loglike, - W, - features, - sign_info={}, - dispersion=None, - solve_args={'tol': 1.e-12, 'min_its': 100}, - hessian=None): - - X, y = loglike.data - n, p = X.shape - - Xfeat = X[:, features] - if hessian is None: - Qfeat = Xfeat.T.dot(W[:, None] * Xfeat) - _score_linear = -Xfeat.T.dot(W[:, None] * X).T - else: - Qfeat = hessian[features][:,features] - _score_linear = -hessian[features].T - observed_target = restricted_estimator(loglike, features, solve_args=solve_args) - cov_target = np.linalg.inv(Qfeat) - crosscov_target_score = 
_score_linear.dot(cov_target) - alternatives = ['twosided'] * features.sum() - features_idx = np.arange(p)[features] - - for i in range(len(alternatives)): - if features_idx[i] in sign_info.keys(): - alternatives[i] = sign_info[features_idx[i]] - - if dispersion is None: # use Pearson's X^2 - dispersion = ((y - loglike.saturated_loss.mean_function( - Xfeat.dot(observed_target))) ** 2 / W).sum() / (n - Xfeat.shape[1]) - - regress_target_score = np.zeros((cov_target.shape[0], p)) - regress_target_score[:,features] = cov_target - return observed_target, cov_target * dispersion, regress_target_score, dispersion, alternatives - -def full_targets(loglike, - W, - features, - dispersion=None, - solve_args={'tol': 1.e-12, 'min_its': 50}, - hessian=None): - - X, y = loglike.data - n, p = X.shape - features_bool = np.zeros(p, np.bool) - features_bool[features] = True - features = features_bool - - # target is one-step estimator - - Qfull = X.T.dot(W[:, None] * X) - if hessian is None: - Qfull = X.T.dot(W[:, None] * X) - else: - Qfull = hessian - - Qfull_inv = np.linalg.inv(Qfull) - full_estimator = loglike.solve(**solve_args) - cov_target = Qfull_inv[features][:, features] - observed_target = full_estimator[features] - crosscov_target_score = np.zeros((p, cov_target.shape[0])) - crosscov_target_score[features] = -np.identity(cov_target.shape[0]) - - if dispersion is None: # use Pearson's X^2 - dispersion = (((y - loglike.saturated_loss.mean_function(X.dot(full_estimator))) ** 2 / W).sum() / - (n - p)) - - alternatives = ['twosided'] * features.sum() - regress_target_score = Qfull_inv[features] # weights missing? - return observed_target, cov_target * dispersion, regress_target_score, dispersion, alternatives - -def debiased_targets(loglike, - W, - features, - sign_info={}, - penalty=None, #required kwarg - dispersion=None, - approximate_inverse='JM', - debiasing_args={}): - - if penalty is None: - raise ValueError('require penalty for consistent estimator') - - X, y = loglike.data - n, p = X.shape - features_bool = np.zeros(p, np.bool) - features_bool[features] = True - features = features_bool - - # relevant rows of approximate inverse - - - if approximate_inverse == 'JM': - Qinv_hat = np.atleast_2d(debiasing_matrix(X * np.sqrt(W)[:, None], - np.nonzero(features)[0], - **debiasing_args)) / n - else: - Qinv_hat = np.atleast_2d(pseudoinverse_debiasing_matrix(X * np.sqrt(W)[:, None], - np.nonzero(features)[0], - **debiasing_args)) - - problem = rr.simple_problem(loglike, penalty) - nonrand_soln = problem.solve() - G_nonrand = loglike.smooth_objective(nonrand_soln, 'grad') - - observed_target = nonrand_soln[features] - Qinv_hat.dot(G_nonrand) - - if p > n: - M1 = Qinv_hat.dot(X.T) - cov_target = (M1 * W[None, :]).dot(M1.T) - crosscov_target_score = -(M1 * W[None, :]).dot(X).T - else: - Qfull = X.T.dot(W[:, None] * X) - cov_target = Qinv_hat.dot(Qfull.dot(Qinv_hat.T)) - crosscov_target_score = -Qinv_hat.dot(Qfull).T - - if dispersion is None: # use Pearson's X^2 - Xfeat = X[:, features] - Qrelax = Xfeat.T.dot(W[:, None] * Xfeat) - relaxed_soln = nonrand_soln[features] - np.linalg.inv(Qrelax).dot(G_nonrand[features]) - dispersion = (((y - loglike.saturated_loss.mean_function(Xfeat.dot(relaxed_soln)))**2 / W).sum() / - (n - features.sum())) - - alternatives = ['twosided'] * features.sum() - return observed_target, cov_target * dispersion, Qinv_hat, dispersion, alternatives - -def form_targets(target, - loglike, - W, - features, - **kwargs): - _target = {'full':full_targets, - 'selected':selected_targets, - 
'debiased':debiased_targets}[target] - return _target(loglike, - W, - features, - **kwargs) - class split_lasso(lasso): """ @@ -940,15 +773,15 @@ def _setup_implied_gaussian(self, prod_score_prec = np.identity(self.nfeature) / ratio - cov_rand = self._hessian * dispersion + cov_rand = self._unscaled_cov_score * dispersion - M1 = prod_score_prec - M2 = M1.dot(cov_rand).dot(M1.T) - M3 = M1.dot(opt_linear.dot(cond_cov).dot(opt_linear.T)).dot(M1.T) + M1 = prod_score_prec * dispersion + M2 = M1.dot(cov_rand).dot(M1.T) * (dispersion**2) + M3 = M1.dot(opt_linear.dot(cond_cov).dot(opt_linear.T)).dot(M1.T) * (dispersion**2) # would be nice to not store these? - self.M1 = M1 + self.M1 = M1 self.M2 = M2 self.M3 = M3 @@ -1235,3 +1068,5 @@ def poisson(X, return split_lasso(loglike, np.asarray(feature_weights), proportion) + + diff --git a/selectinf/randomized/modelQ.py b/selectinf/randomized/modelQ.py index 62aa37b47..c239f8821 100644 --- a/selectinf/randomized/modelQ.py +++ b/selectinf/randomized/modelQ.py @@ -177,6 +177,8 @@ def fit(self, _hessian_active = self.Q[:, active] _hessian_unpen = self.Q[:, unpenalized] + self._unscaled_cov_score = self.Q + _score_linear_term = -np.hstack([_hessian_active, _hessian_unpen]) # set the observed score (data dependent) state diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index 500e64d48..e7a401c01 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -12,10 +12,14 @@ from ..distributions.api import discrete_family from ..constraints.affine import (sample_from_constraints, constraints) +from ..algorithms.barrier_affine import solve_barrier_affine_py +from ..base import (selected_targets, + full_targets, + debiased_targets) + from .posterior_inference import posterior from .selective_MLE_utils import solve_barrier_affine as solve_barrier_affine_C from .approx_reference import approximate_grid_inference -from ..algorithms.barrier_affine import solve_barrier_affine_py class query(object): r""" @@ -164,9 +168,9 @@ def _setup_implied_gaussian(self, cov_rand, prec = self.randomizer.cov_prec if np.asarray(prec).shape in [(), (0,)]: - prod_score_prec_unnorm = self._hessian * prec + prod_score_prec_unnorm = self._unscaled_cov_score * prec else: - prod_score_prec_unnorm = self._hessian.dot(prec) + prod_score_prec_unnorm = self._unscaled_cov_score.dot(prec) if np.asarray(prec).shape in [(), (0,)]: cond_precision = opt_linear.T.dot(opt_linear) * prec @@ -201,7 +205,6 @@ def summary(self, observed_target, cov_target, regress_target_score, - dispersion, alternatives, opt_sample=None, target_sample=None, @@ -234,8 +237,6 @@ def summary(self, Defaults to 1000. compute_intervals : bool Compute confidence intervals? - dispersion : float (optional) - Use a known value for dispersion, or Pearson's X^2? 
""" if parameter is None: @@ -322,7 +323,6 @@ def selective_MLE(self, cov_target, regress_target_score, self.observed_opt_state, -# dispersion=dispersion, level=level, solve_args=solve_args) @@ -373,7 +373,6 @@ def approximate_grid_inference(self, cov_target, regress_target_score, alternatives=None, - dispersion=1, solve_args={'tol': 1.e-12}, useIP=False): @@ -398,7 +397,6 @@ def approximate_grid_inference(self, cov_target, regress_target_score, solve_args=solve_args, - dispersion=dispersion, useIP=useIP) return G.summary(alternatives=alternatives) @@ -1460,5 +1458,3 @@ def selective_MLE(observed_target, return result, observed_info_mean, log_ref - - diff --git a/selectinf/randomized/screening.py b/selectinf/randomized/screening.py index db6602cc4..3c5df5cd6 100644 --- a/selectinf/randomized/screening.py +++ b/selectinf/randomized/screening.py @@ -21,6 +21,7 @@ def __init__(self, self.covariance = covariance self.randomizer = randomizer self._initial_omega = perturb + self._unscaled_cov_score = covariance def fit(self, perturb=None): @@ -28,7 +29,9 @@ def fit(self, perturb=None): self._randomized_score = self.observed_score_state - self._initial_omega return self._randomized_score, self._randomized_score.shape[0] - def multivariate_targets(self, features, dispersion=1.): + def multivariate_targets(self, + features, + dispersion=1): """ Entries of the mean of \Sigma[E,E]^{-1}Z_E """ @@ -42,9 +45,12 @@ def multivariate_targets(self, features, dispersion=1.): return (observed_target, cov_target * dispersion, crosscov_target_score.T * dispersion, + dispersion, alternatives) - def full_targets(self, features, dispersion=1.): + def full_targets(self, + features, + dispersion=1): """ Entries of the mean of \Sigma[E,E]^{-1}Z_E """ @@ -55,9 +61,11 @@ def full_targets(self, features, dispersion=1.): crosscov_target_score = -np.identity(Q.shape[0])[:, features] alternatives = ['twosided'] * features.sum() - return observed_target, cov_target * dispersion, crosscov_target_score.T * dispersion, alternatives + return observed_target, cov_target * dispersion, crosscov_target_score.T * dispersion, dispersion, alternatives - def marginal_targets(self, features): + def marginal_targets(self, + features, + dispersion=1): """ Entries of the mean of Z_E """ @@ -68,7 +76,7 @@ def marginal_targets(self, features): crosscov_target_score = -score_linear alternatives = ['twosided'] * features.sum() - return observed_target, cov_target, crosscov_target_score.T, alternatives + return observed_target, cov_target, crosscov_target_score.T, dispersion, alternatives class marginal_screening(screening): diff --git a/selectinf/randomized/slope.py b/selectinf/randomized/slope.py index 5f88676e8..b7ede0954 100644 --- a/selectinf/randomized/slope.py +++ b/selectinf/randomized/slope.py @@ -20,7 +20,7 @@ from ..constraints.affine import constraints from .randomization import randomization -from ..base import restricted_estimator +from ..base import restricted_estimator, _compute_hessian from .query import gaussian_query from .lasso import lasso @@ -121,9 +121,11 @@ def fit(self, self.num_opt_var = self.observed_opt_state.shape[0] - X, y = self.loglike.data - W = self._W = self.loglike.saturated_loss.hessian(X.dot(beta_bar)) - _hessian_active = np.dot(X.T, X[:, active] * W[:, None]) + self._unscaled_cov_score, _hessian_active = _compute_hessian(self.loglike, + beta_bar, + active) + + _score_linear_term = -_hessian_active self.score_transform = (_score_linear_term, np.zeros(_score_linear_term.shape[0])) @@ -152,6 +154,7 @@ def 
fit(self, if signs_cluster.size == 0: return active_signs else: + X, y = self.loglike.data X_clustered = X[:, indices].dot(signs_cluster) _opt_linear_term = X.T.dot(X_clustered) diff --git a/selectinf/randomized/tests/test_exact_reference.py b/selectinf/randomized/tests/test_exact_reference.py index d8a1e180e..6e7e73f6d 100644 --- a/selectinf/randomized/tests/test_exact_reference.py +++ b/selectinf/randomized/tests/test_exact_reference.py @@ -1,7 +1,8 @@ import numpy as np from ...tests.instance import gaussian_instance -from ..lasso import lasso, selected_targets +from ..lasso import lasso +from ...base import selected_targets from ..exact_reference import exact_grid_inference def test_inf(n=500, @@ -58,8 +59,7 @@ def test_inf(n=500, regress_target_score, dispersion, alternatives) = selected_targets(conv.loglike, - conv._W, - nonzero, + conv.observed_soln, dispersion=dispersion) exact_grid_inf = exact_grid_inference(conv, diff --git a/selectinf/randomized/tests/test_lasso.py b/selectinf/randomized/tests/test_lasso.py index 3a16411ec..1ba443c29 100644 --- a/selectinf/randomized/tests/test_lasso.py +++ b/selectinf/randomized/tests/test_lasso.py @@ -5,14 +5,18 @@ import regreg.api as rr -from ..lasso import lasso, selected_targets, full_targets, debiased_targets +from ..lasso import lasso +from ...base import selected_targets, full_targets, debiased_targets from ...tests.instance import gaussian_instance, logistic_instance -from ...tests.flags import SET_SEED +from ...tests.flags import SET_SEED, SMALL_SAMPLES from ...tests.decorators import set_sampling_params_iftrue, set_seed_iftrue from ...algorithms.sqrt_lasso import choose_lambda, solve_sqrt_lasso from ..randomization import randomization from ...tests.decorators import rpy_test_safe + +@set_seed_iftrue(SET_SEED) +@set_sampling_params_iftrue(SMALL_SAMPLES, ndraw=50, burnin=10) def test_highdim_lasso(n=500, p=200, signal_fac=1.5, @@ -59,23 +63,23 @@ def test_highdim_lasso(n=500, (observed_target, cov_target, cov_target_score, + dispersion, alternatives) = full_targets(conv.loglike, - conv._W, - nonzero) + conv.observed_soln) elif target == 'selected': (observed_target, cov_target, cov_target_score, + dispersion, alternatives) = selected_targets(conv.loglike, - conv._W, - nonzero) + conv.observed_soln) elif target == 'debiased': (observed_target, cov_target, cov_target_score, + dispersion, alternatives) = debiased_targets(conv.loglike, - conv._W, - nonzero, + conv.observed_soln, penalty=conv.penalty) result = conv.summary(observed_target, @@ -89,6 +93,8 @@ def test_highdim_lasso(n=500, return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0] +@set_seed_iftrue(SET_SEED) +@set_sampling_params_iftrue(SMALL_SAMPLES, ndraw=50, burnin=10) def test_AR_randomization(n=300, p=100, signal=4.5, @@ -147,23 +153,23 @@ def test_AR_randomization(n=300, (observed_target, cov_target, cov_target_score, + dispersion, alternatives) = full_targets(conv.loglike, - conv._W, - nonzero) + conv.observed_soln) elif target == 'selected': (observed_target, cov_target, cov_target_score, + dispersion, alternatives) = selected_targets(conv.loglike, - conv._W, - nonzero) + conv.observed_soln) elif target == 'debiased': (observed_target, cov_target, cov_target_score, + dispersion, alternatives) = debiased_targets(conv.loglike, - conv._W, - nonzero, + conv.observed_soln, penalty=conv.penalty) result = conv.summary(observed_target, @@ -177,10 +183,29 @@ def test_AR_randomization(n=300, return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0] -def test_all_targets(n=100, 
p=20, signal_fac=1.5, s=5, sigma=3, rho=0.4): +@set_seed_iftrue(SET_SEED) +@set_sampling_params_iftrue(SMALL_SAMPLES, ndraw=50, burnin=10) +def test_all_targets(n=100, + p=20, + signal_fac=1.5, + s=5, + sigma=3, + rho=0.4, + ndraw=5000, + burnin=1000): for target in ['full', 'selected', 'debiased']: - test_highdim_lasso(n=n, p=p, signal_fac=signal_fac, s=s, sigma=sigma, rho=rho, target=target) + test_highdim_lasso(n=n, + p=p, + signal_fac=signal_fac, + s=s, + sigma=sigma, + rho=rho, + target=target, + ndraw=ndraw, + burnin=burnin) +@set_seed_iftrue(SET_SEED) +@set_sampling_params_iftrue(SMALL_SAMPLES, ndraw=50, burnin=10) def test_sqrt_highdim_lasso(n=500, p=200, signal_fac=1.5, @@ -231,7 +256,7 @@ def test_sqrt_highdim_lasso(n=500, q_term = rr.identity_quadratic(ridge_term, 0, -perturb, 0) soln2, sqrt_loss = solve_sqrt_lasso(X, Y, W, solve_args={'min_its':1000}, quadratic=q_term, force_fat=True) - soln = conv.initial_soln + soln = conv.observed_soln denom = np.linalg.norm(Y - X.dot(soln)) new_weights = W * denom @@ -253,16 +278,16 @@ def test_sqrt_highdim_lasso(n=500, (observed_target, cov_target, cov_target_score, + dispersion, alternatives) = full_targets(conv.loglike, - conv._W, - nonzero) + conv.observed_soln) else: (observed_target, cov_target, cov_target_score, + dispersion, alternatives) = selected_targets(conv.loglike, - conv._W, - nonzero) + conv.observed_soln) result = conv.summary(observed_target, cov_target, @@ -275,6 +300,7 @@ def test_sqrt_highdim_lasso(n=500, return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0] +@np.testing.dec.skipif(True, "comparison to R is broken") @set_seed_iftrue(SET_SEED) @rpy_test_safe(libraries=['selectiveInference']) def test_compareR(n=200, @@ -348,13 +374,15 @@ def Rpval(X, Y, W, noise_scale=None): assert np.fabs(conv.ridge_term - ridge_term) / ridge_term < 1.e-4 - assert np.fabs(soln - conv.initial_soln).max() / np.fabs(soln).max() < 1.e-3 + assert np.fabs(soln - conv.observed_soln).max() / np.fabs(soln).max() < 1.e-3 nonzero = signs != 0 assert np.linalg.norm(conv.sampler.affine_con.covariance - cond_cov) / np.linalg.norm(cond_cov) < 1.e-3 assert np.linalg.norm(conv.sampler.affine_con.mean - cond_mean[:,0]) / np.linalg.norm(cond_mean[:,0]) < 1.e-3 +@set_seed_iftrue(SET_SEED) +@set_sampling_params_iftrue(SMALL_SAMPLES, ndraw=50, burnin=10) def test_logistic_lasso(n=500, p=200, signal_fac=1.5, @@ -402,17 +430,16 @@ def test_logistic_lasso(n=500, (observed_target, cov_target, cov_target_score, + dispersion, alternatives) = full_targets(conv.loglike, - conv._W, - nonzero) + conv.observed_soln) else: (observed_target, cov_target, cov_target_score, + dispersion, alternatives) = selected_targets(conv.loglike, - conv._W, - nonzero) - + conv.observed_soln) result = conv.summary(observed_target, cov_target, cov_target_score, @@ -425,41 +452,3 @@ def test_logistic_lasso(n=500, return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0] -def main(nsim=500, n=500, p=200, sqrt=False, target='full', sigma=3, AR=True): - - import matplotlib.pyplot as plt - P0, PA = [], [] - from statsmodels.distributions import ECDF - - for i in range(nsim): - if True: - if not sqrt: - if AR: - p0, pA = test_AR_randomization(n=n, p=p, target=target, sigma=sigma) - else: - p0, pA = test_highdim_lasso(n=n, p=p, target=target, sigma=sigma) - else: - p0, pA = test_sqrt_highdim_lasso(n=n, p=p, target=target, compare_to_lasso=False) - else: - p0, pA = [], [] - print(len(p0), len(pA)) - P0.extend(p0) - PA.extend(pA) - - P0_clean = np.array(P0) - - P0_clean = P0_clean[P0_clean > 
1.e-5] # - print(np.mean(P0_clean), np.std(P0_clean), np.mean(np.array(PA) < 0.05), np.sum(np.array(PA) < 0.05) / (i+1), np.mean(np.array(P0) < 0.05), np.mean(P0_clean < 0.05), np.mean(np.array(P0) < 1e-5), 'null pvalue + power + failure') - - if i % 3 == 0 and i > 0: - U = np.linspace(0, 1, 101) - plt.clf() - if len(P0_clean) > 0: - plt.plot(U, ECDF(P0_clean)(U)) - if len(PA) > 0: - plt.plot(U, ECDF(PA)(U), 'r') - plt.plot([0, 1], [0, 1], 'k--') - plt.savefig("plot.pdf") - plt.show() - - diff --git a/selectinf/randomized/tests/test_marginal_screening.py b/selectinf/randomized/tests/test_marginal_screening.py index 6db0fbdf2..e8dac39aa 100644 --- a/selectinf/randomized/tests/test_marginal_screening.py +++ b/selectinf/randomized/tests/test_marginal_screening.py @@ -49,11 +49,13 @@ def test_marginal(n=500, (observed_target, cov_target, crosscov_target_score, + dispersion, alternatives) = marginal_select.marginal_targets(nonzero) else: (observed_target, cov_target, crosscov_target_score, + dispersion, alternatives) = marginal_select.multivariate_targets(nonzero, dispersion=sigma**2) if use_MLE: @@ -137,6 +139,7 @@ def test_simple(n=100, (observed_target, cov_target, crosscov_target_score, + dispersion, alternatives) = marginal_select.marginal_targets(nonzero) if use_MLE: diff --git a/selectinf/randomized/tests/test_modelQ.py b/selectinf/randomized/tests/test_modelQ.py index e88522423..09d70d29c 100644 --- a/selectinf/randomized/tests/test_modelQ.py +++ b/selectinf/randomized/tests/test_modelQ.py @@ -29,7 +29,7 @@ def test_modelQ(): conH = LH.sampler.affine_con conQ = LQ.sampler.affine_con - np.testing.assert_allclose(LH.initial_soln, LQ.initial_soln) + np.testing.assert_allclose(LH.observed_soln, LQ.observed_soln) np.testing.assert_allclose(LH.initial_subgrad, LQ.initial_subgrad) np.testing.assert_allclose(conH.linear_part, conQ.linear_part) diff --git a/selectinf/randomized/tests/test_posterior.py b/selectinf/randomized/tests/test_posterior.py index fbfbbb5ce..66780a5b4 100644 --- a/selectinf/randomized/tests/test_posterior.py +++ b/selectinf/randomized/tests/test_posterior.py @@ -2,10 +2,11 @@ import pandas as pd from scipy.stats import norm as ndist -from ..lasso import lasso, selected_targets, split_lasso +from ..lasso import lasso, split_lasso from ..posterior_inference import (langevin_sampler, gibbs_sampler) +from ...base import selected_targets from ...tests.instance import gaussian_instance, HIV_NRTI from ...tests.flags import SET_SEED, SMALL_SAMPLES from ...tests.decorators import set_sampling_params_iftrue, set_seed_iftrue @@ -57,8 +58,7 @@ def test_Langevin(n=500, regress_target_score, dispersion, alternatives) = selected_targets(conv.loglike, - conv._W, - nonzero, + conv.observed_soln, dispersion=dispersion) posterior_inf = conv.posterior(observed_target, @@ -127,9 +127,9 @@ def test_instance(nsample=100, nburnin=50): regress_target_score, dispersion, alternatives)= selected_targets(L.loglike, - L._W, - M, - dispersion=dispersion) + L.observed_soln, + features=M, + dispersion=dispersion) posterior_inf = L.posterior(observed_target, cov_target, @@ -183,8 +183,8 @@ def test_flexible_prior1(nsample=100, regress_target_score, dispersion, alternatives) = selected_targets(L.loglike, - L._W, - M, + L.observed_soln, + features=M, dispersion=dispersion) # default prior @@ -253,8 +253,8 @@ def test_flexible_prior2(nsample=1000, nburnin=50): regress_target_score, dispersion, alternatives) = selected_targets(L.loglike, - L._W, - M, + L.observed_soln, + features=M, dispersion=dispersion) 
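# The calling convention these hunks migrate to, summarized in one place: the
# target constructors now live in selectinf.base, take the fitted solution
# rather than the GLM weight vector, and return the dispersion as part of the
# tuple. A minimal sketch of the pattern, assuming a fitted randomized lasso
# `conv` and a dispersion estimate `dispersion` (illustrative only):
#
#     from selectinf.base import selected_targets
#     (observed_target,
#      cov_target,
#      regress_target_score,
#      dispersion,
#      alternatives) = selected_targets(conv.loglike,
#                                       conv.observed_soln,
#                                       dispersion=dispersion)
#
#     result = conv.selective_MLE(observed_target,
#                                 cov_target,
#                                 regress_target_score)[0]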
prior_var = 0.05 ** 2 @@ -318,21 +318,18 @@ def test_hiv_data(nsample=10000, regress_target_score, dispersion, alternatives) = selected_targets(conv.loglike, - conv._W, - nonzero, + conv.observed_soln, dispersion=dispersion) mle, inverse_info = conv.selective_MLE(observed_target, cov_target, regress_target_score, - dispersion, level=level, solve_args={'tol': 1.e-12})[:2] approx_inf = conv.approximate_grid_inference(observed_target, cov_target, regress_target_score, - dispersion=dispersion, useIP=False) posterior_inf = conv.posterior(observed_target, diff --git a/selectinf/randomized/tests/test_selective_MLE_high.py b/selectinf/randomized/tests/test_selective_MLE_high.py index 3ece533c2..818cdc012 100644 --- a/selectinf/randomized/tests/test_selective_MLE_high.py +++ b/selectinf/randomized/tests/test_selective_MLE_high.py @@ -5,8 +5,9 @@ from ..lasso import (lasso, - split_lasso, - full_targets, + split_lasso) + +from ...base import (full_targets, selected_targets, debiased_targets) from ...tests.instance import (gaussian_instance, @@ -69,7 +70,7 @@ def test_full_targets(n=200, regress_target_score, dispersion, alternatives) = full_targets(conv.loglike, - conv._W, + conv.observed_soln, nonzero, dispersion=dispersion) else: @@ -78,7 +79,7 @@ def test_full_targets(n=200, regress_target_score, dispersion, alternatives) = debiased_targets(conv.loglike, - conv._W, + conv.observed_soln, nonzero, penalty=conv.penalty, dispersion=dispersion) @@ -151,8 +152,7 @@ def test_selected_targets(n=2000, regress_target_score, dispersion, alternatives) = selected_targets(conv.loglike, - conv._W, - nonzero, + conv.observed_soln, dispersion=dispersion) result = conv.selective_MLE(observed_target, @@ -193,8 +193,8 @@ def test_instance(): regress_target_score, dispersion, alternatives) = selected_targets(L.loglike, - L._W, - M, + L.observed_soln, + features=M, dispersion=dispersion) print("check shapes", observed_target.shape, E.sum()) @@ -261,8 +261,7 @@ def test_selected_targets_disperse(n=500, regress_target_score, dispersion, alternatives) = selected_targets(conv.loglike, - conv._W, - nonzero, + conv.observed_soln, dispersion=dispersion) result = conv.selective_MLE(observed_target, @@ -323,8 +322,7 @@ def test_logistic(n=2000, cov_target_score, dispersion, alternatives) = selected_targets(conv.loglike, - conv._W, - nonzero, + conv.observed_soln, dispersion=1) result = conv.selective_MLE(observed_target, @@ -380,8 +378,7 @@ def test_logistic_split(n=2000, cov_target_score, dispersion, alternatives) = selected_targets(conv.loglike, - conv._W, - nonzero, + conv.observed_soln, dispersion=1) result = conv.selective_MLE(observed_target, @@ -437,8 +434,7 @@ def test_poisson(n=2000, cov_target_score, dispersion, alternatives) = selected_targets(conv.loglike, - conv._W, - nonzero, + conv.observed_soln, dispersion=1) result = conv.selective_MLE(observed_target, @@ -494,8 +490,7 @@ def test_poisson_split(n=2000, cov_target_score, dispersion, alternatives) = selected_targets(conv.loglike, - conv._W, - nonzero, + conv.observed_soln, dispersion=1) result = conv.selective_MLE(observed_target, @@ -554,9 +549,7 @@ def test_cox(n=2000, cov_target_score, dispersion, alternatives) = selected_targets(conv.loglike, - None, - nonzero, - hessian=full_hess, + conv.observed_soln, dispersion=1) result = conv.selective_MLE(observed_target, @@ -615,9 +608,7 @@ def test_cox_split(n=2000, cov_target_score, dispersion, alternatives) = selected_targets(conv.loglike, - None, - nonzero, - hessian=full_hess, + conv.observed_soln, dispersion=1) 
result = conv.selective_MLE(observed_target, @@ -683,8 +674,7 @@ def test_scale_invariant_split(n=200, cov_target_score, dispersion, alternatives) = selected_targets(conv.loglike, - conv._W, - nonzero, + conv.observed_soln, dispersion=dispersion) print('dispersion', dispersion/scale**2) @@ -766,8 +756,7 @@ def test_scale_invariant(n=200, cov_target_score, dispersion, alternatives) = selected_targets(conv.loglike, - conv._W, - nonzero, + conv.observed_soln, dispersion=dispersion) print('dispersion', dispersion/scale**2) @@ -796,46 +785,3 @@ def test_scale_invariant(n=200, results[1]['pvalue']) -def test_instance(): - n, p, s = 500, 100, 5 - X = np.random.standard_normal((n, p)) - beta = np.zeros(p) - beta[:s] = np.sqrt(2 * np.log(p) / n) - Y = X.dot(beta) + np.random.standard_normal(n) - - scale_ = np.std(Y) - # uses noise of variance n * scale_ / 4 by default - L = lasso.gaussian(X, Y, 3 * scale_ * np.sqrt(2 * np.log(p) * np.sqrt(n))) - signs = L.fit() - E = (signs != 0) - - M = E.copy() - M[-3:] = 1 - print("check ", M) - dispersion = np.linalg.norm(Y - X[:, M].dot(np.linalg.pinv(X[:, M]).dot(Y))) ** 2 / (n - M.sum()) - (observed_target, - cov_target, - cov_target_score, - dispersion, - alternatives) = selected_targets(L.loglike, - L._W, - M, - dispersion=dispersion) - - print("check shapes", observed_target.shape, E.sum()) - - result = L.selective_MLE(observed_target, - cov_target, - cov_target_score)[0] - estimate = result['MLE'] - pval = result['pvalue'] - intervals = np.asarray(result[['lower_confidence', - 'upper_confidence']]) - - beta_target = np.linalg.pinv(X[:, M]).dot(X.dot(beta)) - - coverage = (beta_target > intervals[:, 0]) * (beta_target < intervals[:, 1]) - print("observed_opt_state ", L.observed_opt_state) - # print("check ", np.asarray(result['MLE']), np.asarray(result['unbiased'])) - - return coverage diff --git a/selectinf/randomized/tests/test_slope.py b/selectinf/randomized/tests/test_slope.py index 66a89ac19..65cc553c7 100644 --- a/selectinf/randomized/tests/test_slope.py +++ b/selectinf/randomized/tests/test_slope.py @@ -6,7 +6,7 @@ import regreg.api as rr from ..slope import slope -from ..lasso import full_targets, selected_targets +from ...base import full_targets, selected_targets from ...tests.decorators import rpy_test_safe try: @@ -155,16 +155,18 @@ def test_randomized_slope(n=2000, (observed_target, cov_target, cov_target_score, + dispersion, alternatives) = full_targets(conv.loglike, - conv._W, - nonzero, dispersion=sigma_) + conv.observed_soln, + dispersion=sigma_) elif target == 'selected': (observed_target, cov_target, cov_target_score, + dispersion, alternatives) = selected_targets(conv.loglike, - conv._W, - nonzero, dispersion=sigma_) + conv.observed_soln, + dispersion=sigma_) if target == "selected": beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) @@ -196,17 +198,6 @@ def test_randomized_slope(n=2000, if True: return pval[beta_target == 0], pval[beta_target != 0], coverage, lower, upper -def main(nsim=100, use_MLE=True): - - P0, PA, cover, length_int = [], [], [], [] - - for i in range(nsim): - p0, pA, cover_, _, _ = test_randomized_slope(use_MLE=use_MLE) - - cover.extend(cover_) - P0.extend(p0) - PA.extend(pA) - print('coverage', np.mean(cover)) diff --git a/selectinf/randomized/tests/test_topK.py b/selectinf/randomized/tests/test_topK.py index 8091f8ac3..45dbb54b9 100644 --- a/selectinf/randomized/tests/test_topK.py +++ b/selectinf/randomized/tests/test_topK.py @@ -49,11 +49,13 @@ def test_topK(n=500, (observed_target, cov_target, 
crosscov_target_score, + dispersion, alternatives) = topK_select.marginal_targets(nonzero) else: (observed_target, cov_target, crosscov_target_score, + dispersion, alternatives) = topK_select.multivariate_targets(nonzero, dispersion=sigma**2) if use_MLE: @@ -129,12 +131,14 @@ def test_bias_topK(n=500, (observed_target, cov_target, crosscov_target_score, + dispersion, alternatives) = topK_select.marginal_targets(nonzero) else: beta_target = beta[nonzero] (observed_target, cov_target, crosscov_target_score, + dispersion, alternatives) = topK_select.multivariate_targets(nonzero, dispersion=sigma**2) result = topK_select.selective_MLE(observed_target, From 048474b5e155409794f4c1808d6c643e0c37b0c0 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 18 Aug 2021 16:24:43 -0700 Subject: [PATCH 138/187] fix target calls in multiple queries --- .../randomized/tests/test_multiple_queries.py | 49 +++++-------------- 1 file changed, 13 insertions(+), 36 deletions(-) diff --git a/selectinf/randomized/tests/test_multiple_queries.py b/selectinf/randomized/tests/test_multiple_queries.py index 38c069f9e..267b7e53b 100644 --- a/selectinf/randomized/tests/test_multiple_queries.py +++ b/selectinf/randomized/tests/test_multiple_queries.py @@ -5,12 +5,14 @@ import regreg.api as rr -from ..lasso import lasso, selected_targets, full_targets, debiased_targets -from ..screening import marginal_screening -from ..query import multiple_queries +from ...base import selected_targets from ...tests.instance import gaussian_instance from ...algorithms.sqrt_lasso import choose_lambda, solve_sqrt_lasso +from ..lasso import lasso +from ..screening import marginal_screening +from ..query import multiple_queries + # the test here is marginal_screening + lasso def test_multiple_queries(n=500, p=100, @@ -60,14 +62,19 @@ def test_multiple_queries(n=500, if nonzero.sum() == 0: return [], [] - observed_target1, cov_target1, cov_target_score1, alternatives1 = conv1.multivariate_targets(nonzero, sigma**2) + (observed_target1, + cov_target1, + cov_target_score1, + dispersion1, + alternatives1) = conv1.multivariate_targets(nonzero, sigma**2) (observed_target2, cov_target2, cov_target_score2, + dispersion2, alternatives2) = selected_targets(conv2.loglike, - conv2._W, - nonzero) + conv2.observed_soln, + features=nonzero) mq = multiple_queries([conv1, conv2]) @@ -79,33 +86,3 @@ def test_multiple_queries(n=500, return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0] -def main(nsim=500, n=500, p=100, sigma=3): - - P0, PA = [], [] - from statsmodels.distributions import ECDF - import matplotlib.pyplot as plt - - for i in range(nsim): - if True: - p0, pA = test_multiple_queries(n=n, p=p, sigma=sigma) - else: - p0, pA = [], [] - P0.extend(p0) - PA.extend(pA) - - P0_clean = np.array(P0) - - P0_clean = P0_clean[P0_clean > 1.e-5] # - print(np.mean(P0_clean), np.std(P0_clean), np.mean(np.array(PA) < 0.05), np.mean(np.array(P0) < 0.05), np.mean(P0_clean < 0.05), np.mean(np.array(P0) < 1e-5)) - - if i % 3 == 0 and i > 0: - U = np.linspace(0, 1, 101) - plt.clf() - if len(P0_clean) > 0: - plt.plot(U, ECDF(P0_clean)(U)) - if len(PA) > 0: - plt.plot(U, ECDF(PA)(U), 'r') - plt.plot([0, 1], [0, 1], 'k--') - plt.savefig("plot.pdf") - plt.show() - From 3d3f7784342394962632ad7d3ec1ae7739f642f6 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 18 Aug 2021 16:57:36 -0700 Subject: [PATCH 139/187] BF: fixing handling of dispersion --- selectinf/randomized/lasso.py | 4 ++-- selectinf/randomized/query.py | 4 ++-- 2 files changed, 4 insertions(+), 4
deletions(-) diff --git a/selectinf/randomized/lasso.py b/selectinf/randomized/lasso.py index ce4062033..6beb26dc0 100644 --- a/selectinf/randomized/lasso.py +++ b/selectinf/randomized/lasso.py @@ -776,8 +776,8 @@ def _setup_implied_gaussian(self, cov_rand = self._unscaled_cov_score * dispersion M1 = prod_score_prec * dispersion - M2 = M1.dot(cov_rand).dot(M1.T) * (dispersion**2) - M3 = M1.dot(opt_linear.dot(cond_cov).dot(opt_linear.T)).dot(M1.T) * (dispersion**2) + M2 = M1.dot(cov_rand).dot(M1.T) + M3 = M1.dot(opt_linear.dot(cond_cov).dot(opt_linear.T)).dot(M1.T) # would be nice to not store these? diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index e7a401c01..a20e0240c 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -186,8 +186,8 @@ def _setup_implied_gaussian(self, cond_mean = regress_opt.dot(self.observed_score_state + observed_subgrad) M1 = prod_score_prec_unnorm * dispersion - M2 = M1.dot(cov_rand).dot(M1.T) * (dispersion**2) - M3 = M1.dot(opt_linear.dot(cond_cov).dot(opt_linear.T)).dot(M1.T) * (dispersion**2) + M2 = M1.dot(cov_rand).dot(M1.T) + M3 = M1.dot(opt_linear.dot(cond_cov).dot(opt_linear.T)).dot(M1.T) self.M1 = M1 self.M2 = M2 From c8eca0a2f98b5a8c8db9d68944c25d8e9e9f2edf Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Tue, 24 Aug 2021 10:28:40 -0700 Subject: [PATCH 140/187] using NamedTuple for target specification as this arg appears over and over --- selectinf/base.py | 31 ++- selectinf/randomized/approx_reference.py | 12 +- selectinf/randomized/drop_losers.py | 41 ++-- selectinf/randomized/exact_reference.py | 12 +- selectinf/randomized/posterior_inference.py | 22 +- selectinf/randomized/query.py | 105 ++++---- selectinf/randomized/screening.py | 51 ++-- selectinf/randomized/tests/test_BH.py | 59 +---- .../randomized/tests/test_approx_reference.py | 22 +- .../randomized/tests/test_drop_losers.py | 49 ++-- .../randomized/tests/test_exact_reference.py | 14 +- selectinf/randomized/tests/test_hiv_data.py | 117 +++++++++ selectinf/randomized/tests/test_lasso.py | 106 +++----- .../tests/test_marginal_screening.py | 60 +---- .../randomized/tests/test_multiple_queries.py | 22 +- selectinf/randomized/tests/test_posterior.py | 193 ++------------- .../tests/test_selective_MLE_high.py | 227 ++++++------------ .../tests/test_selective_MLE_onedim.py | 30 +-- selectinf/randomized/tests/test_slope.py | 29 +-- .../randomized/tests/test_split_lasso.py | 79 ++---- .../tests/test_standalone_lasso_mle.py | 49 ++-- selectinf/randomized/tests/test_topK.py | 79 +----- .../tests/test_unbiased_estimates.py | 30 +-- 23 files changed, 521 insertions(+), 918 deletions(-) create mode 100644 selectinf/randomized/tests/test_hiv_data.py diff --git a/selectinf/base.py b/selectinf/base.py index c6ee4ac46..b6fbc182a 100644 --- a/selectinf/base.py +++ b/selectinf/base.py @@ -1,3 +1,5 @@ +import typing + import numpy as np import regreg.api as rr @@ -45,6 +47,14 @@ def restricted_estimator(loss, active, solve_args={'min_its':50, 'tol':1.e-10}): # functions construct targets of inference # and covariance with score representation +class TargetSpec(typing.NamedTuple): + + observed_target : np.ndarray + cov_target : np.ndarray + regress_target_score : np.ndarray + alternatives : list + dispersion : float = 1 + def selected_targets(loglike, solution, features=None, @@ -85,7 +95,12 @@ def selected_targets(loglike, regress_target_score = np.zeros((cov_target.shape[0], p)) regress_target_score[:,features] = cov_target - return observed_target, 
cov_target * dispersion, regress_target_score, dispersion, alternatives + + return TargetSpec(observed_target, + cov_target * dispersion, + regress_target_score, + alternatives, + dispersion) def full_targets(loglike, solution, @@ -124,7 +139,12 @@ def full_targets(loglike, alternatives = ['twosided'] * features.sum() regress_target_score = Qfull_inv[features] # weights missing? - return observed_target, cov_target * dispersion, regress_target_score, dispersion, alternatives + + return TargetSpec(observed_target, + cov_target * dispersion, + regress_target_score, + alternatives, + dispersion) def debiased_targets(loglike, solution, @@ -189,7 +209,12 @@ def debiased_targets(loglike, features.sum()) alternatives = ['twosided'] * features.sum() - return observed_target, cov_target * dispersion, Qinv_hat, dispersion, alternatives + + return TargetSpec(observed_target, + cov_target * dispersion, + Qinv_hat, + alternatives, + dispersion) def form_targets(target, loglike, diff --git a/selectinf/randomized/approx_reference.py b/selectinf/randomized/approx_reference.py index 5b1e43c19..7d10c4ef1 100644 --- a/selectinf/randomized/approx_reference.py +++ b/selectinf/randomized/approx_reference.py @@ -11,9 +11,7 @@ class approximate_grid_inference(object): def __init__(self, query, - observed_target, - cov_target, - regress_target_score, + target_spec, solve_args={'tol': 1.e-12}, useIP=False): @@ -35,6 +33,10 @@ def __init__(self, Arguments passed to solver. """ + (observed_target, + cov_target, + regress_target_score) = target_spec[:3] + self.solve_args = solve_args linear_part = query.sampler.affine_con.linear_part @@ -44,9 +46,7 @@ def __init__(self, observed_score = query.observed_score_state + query.observed_subgrad - result, inverse_info, log_ref = query.selective_MLE(observed_target, - cov_target, - regress_target_score) + result, inverse_info, log_ref = query.selective_MLE(target_spec) cond_cov = query.cond_cov self.cond_precision = np.linalg.inv(cond_cov) diff --git a/selectinf/randomized/drop_losers.py b/selectinf/randomized/drop_losers.py index ac3134144..6c5d45cb3 100644 --- a/selectinf/randomized/drop_losers.py +++ b/selectinf/randomized/drop_losers.py @@ -6,6 +6,7 @@ from .query import gaussian_query from .randomization import randomization +from ..base import TargetSpec class drop_losers(gaussian_query): @@ -41,7 +42,7 @@ def __init__(self, A = -np.identity(K) b = -np.ones(K) * best_loser linear = np.identity(K) - offset = np.zeros(K) + observed_subgrad = np.zeros(K) # Work out the implied randomization variance # Let X1=X[stage1].mean(), X2=X[stage2].mean() and Xf = X.mean() @@ -60,11 +61,12 @@ def __init__(self, # needed for gaussian_query api self.randomizer = randomization.gaussian(np.diag(std_win**2) * mult) - self.observed_opt_state = stage1_means['data'].iloc[:K] - self.observed_score_state = -self.means[self._winners] # problem is a minimization + self.observed_opt_state = np.asarray(stage1_means['data'].iloc[:K]) + self.observed_score_state = -np.asarray(self.means[self._winners]) # problem is a minimization self.selection_variable = {'winners':self._winners} - self._setup_sampler(A, b, linear, offset) + self._unscaled_cov_score = np.diag(std_win**2) * (1/n1_win + 1/n2_win) + self._setup_sampler(A, b, linear, observed_subgrad) def MLE_inference(self, level=0.9, @@ -82,15 +84,19 @@ def MLE_inference(self, """ - observed_target = self.means[self._winners] - std_win = self.std.loc[self._winners] + observed_target = np.asarray(self.means[self._winners]) + std_win = 
np.asarray(self.std.loc[self._winners]) cov_target = np.diag(std_win**2 / (self._n1_win + self._n2_win)) - cov_target_score = -cov_target + regress_target_score = -np.identity(observed_target.shape[0]) + + target_spec = TargetSpec(observed_target, + cov_target, + regress_target_score, + dispersion=1, + alternatives=['greater']*observed_target.shape[0]) result = gaussian_query.selective_MLE(self, - observed_target, - cov_target, - cov_target_score, + target_spec, level=level, solve_args=solve_args) result[0].insert(0, 'arm', self._winners) @@ -118,16 +124,19 @@ def summary(self, Defaults to 1000. """ - observed_target = self.means[self._winners] + observed_target = np.asarray(self.means[self._winners]) std_win = self.std.loc[self._winners] cov_target = np.diag(std_win**2 / (self._n1_win + self._n2_win)) - cov_target_score = -cov_target + regress_target_score = -np.identity(observed_target.shape[0]) + + target_spec = TargetSpec(observed_target, + cov_target, + regress_target_score, + dispersion=1, + alternatives=['greater']*observed_target.shape[0]) result = gaussian_query.summary(self, - observed_target, - cov_target, - cov_target_score, - alternatives=['twosided']*self.K, + target_spec, ndraw=ndraw, level=level, burnin=burnin, diff --git a/selectinf/randomized/exact_reference.py b/selectinf/randomized/exact_reference.py index 9facaa7fe..13fdbd4a6 100644 --- a/selectinf/randomized/exact_reference.py +++ b/selectinf/randomized/exact_reference.py @@ -10,9 +10,7 @@ class exact_grid_inference(object): def __init__(self, query, - observed_target, - cov_target, - regress_target_score, + target_spec, solve_args={'tol': 1.e-12}): """ @@ -33,6 +31,10 @@ def __init__(self, Arguments passed to solver. """ + (observed_target, + cov_target, + regress_target_score) = target_spec[:3] + self.solve_args = solve_args linear_part = query.sampler.affine_con.linear_part @@ -42,9 +44,7 @@ def __init__(self, observed_score = query.observed_score_state + query.observed_subgrad - result, inverse_info, log_ref = query.selective_MLE(observed_target, - cov_target, - regress_target_score) + result, inverse_info, log_ref = query.selective_MLE(target_spec) cond_cov = query.cond_cov self.cond_precision = np.linalg.inv(cond_cov) diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index bbab9bd5d..4284f5211 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -29,15 +29,18 @@ class posterior(object): def __init__(self, query, - observed_target, - cov_target, - regress_target_score, - dispersion, + target_spec, prior, solve_args={'tol': 1.e-12}): self.solve_args = solve_args + (observed_target, + cov_target, + regress_target_score, + _, + dispersion) = target_spec + linear_part = query.sampler.affine_con.linear_part offset = query.sampler.affine_con.offset @@ -45,9 +48,7 @@ def __init__(self, observed_score = query.observed_score_state + query.observed_subgrad - result, self.inverse_info, log_ref = query.selective_MLE(observed_target, - cov_target, - regress_target_score) + result, self.inverse_info, log_ref = query.selective_MLE(target_spec) ### Note for an informative prior we might want to change this... 
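This commit threads a single TargetSpec named tuple through selective_MLE, posterior and the grid-inference helpers in place of the separate observed_target / cov_target / regress_target_score arguments. Below is a small, hedged sketch of that container, matching the TargetSpec definition added to selectinf/base.py in this patch; the numbers are invented purely for illustration.

    import numpy as np
    from selectinf.base import TargetSpec

    # a toy 2-dimensional target specification; values are illustrative only
    spec = TargetSpec(observed_target=np.array([1.2, -0.4]),
                      cov_target=0.25 * np.identity(2),
                      regress_target_score=np.zeros((2, 10)),
                      alternatives=['twosided', 'twosided'],
                      dispersion=1.)

    # consumers may unpack the first three fields positionally, as in
    # approx_reference.py and exact_reference.py above ...
    observed_target, cov_target, regress_target_score = spec[:3]

    # ... or read fields by name, as query.py does
    print(spec.alternatives, spec.dispersion)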
@@ -217,6 +218,13 @@ def gibbs_sampler(selective_posterior, sample = sampler.__next__() samples[i, :] = sample + import sys + sys.stderr.write('a: ' + str(0.1 + + selective_posterior.ntarget + + selective_posterior.ntarget / 2)+'\n') + sys.stderr.write('scale: ' + str(0.1 - ((scale_update ** 2) * sampler.posterior_[0])) + '\n') + sys.stderr.write('scale_update: ' + str(scale_update) + '\n') + sys.stderr.write('initpoint: ' + str(sampler.posterior_[0]) + '\n') scale_update_sq = invgamma.rvs(a=(0.1 + selective_posterior.ntarget + selective_posterior.ntarget / 2), diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index a20e0240c..b2cd82373 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -202,10 +202,7 @@ def _setup_implied_gaussian(self, M3) def summary(self, - observed_target, - cov_target, - regress_target_score, - alternatives, + target_spec, opt_sample=None, target_sample=None, parameter=None, @@ -240,7 +237,7 @@ def summary(self, """ if parameter is None: - parameter = np.zeros_like(observed_target) + parameter = np.zeros_like(target_spec.observed_target) if opt_sample is None: opt_sample, logW = self.sampler.sample(ndraw, burnin) @@ -252,38 +249,36 @@ def summary(self, opt_sample, logW = opt_sample ndraw = opt_sample.shape[0] - pivots = self.sampler.coefficient_pvalues(observed_target, - cov_target, - regress_target_score, + pivots = self.sampler.coefficient_pvalues(target_spec.observed_target, + target_spec.cov_target, + target_spec.regress_target_score, parameter=parameter, sample=(opt_sample, logW), normal_sample=target_sample, - alternatives=alternatives) + alternatives=target_spec.alternatives) if not np.all(parameter == 0): - pvalues = self.sampler.coefficient_pvalues(observed_target, - cov_target, - regress_target_score, + pvalues = self.sampler.coefficient_pvalues(target_spec.observed_target, + target_spec.cov_target, + target_spec.regress_target_score, parameter=np.zeros_like(parameter), sample=(opt_sample, logW), normal_sample=target_sample, - alternatives=alternatives) + alternatives=target_spec.alternatives) else: pvalues = pivots - result = pd.DataFrame({'target': observed_target, + result = pd.DataFrame({'target': target_spec.observed_target, 'pvalue': pvalues}) if compute_intervals: - MLE = self.selective_MLE(observed_target, - cov_target, - regress_target_score)[0] + MLE = self.selective_MLE(target_spec)[0] MLE_intervals = np.asarray(MLE[['lower_confidence', 'upper_confidence']]) intervals = self.sampler.confidence_intervals( - observed_target, - cov_target, - regress_target_score, + target_spec.observed_target, + target_spec.cov_target, + target_spec.regress_target_score, sample=(opt_sample, logW), normal_sample=target_sample, initial_guess=MLE_intervals, @@ -299,9 +294,7 @@ def summary(self, return result def selective_MLE(self, - observed_target, - cov_target, - regress_target_score, + target_spec, level=0.9, solve_args={'tol': 1.e-12}): """ @@ -319,18 +312,13 @@ def selective_MLE(self, Arguments passed to solver. """ - return self.sampler.selective_MLE(observed_target, - cov_target, - regress_target_score, + return self.sampler.selective_MLE(target_spec, self.observed_opt_state, level=level, solve_args=solve_args) def posterior(self, - observed_target, - cov_target, - regress_target_score, - dispersion=1, + target_spec, prior=None, solve_args={'tol': 1.e-12}): """ @@ -353,7 +341,7 @@ def posterior(self, """ if prior is None: - Di = 1. / (200 * np.diag(cov_target)) + Di = 1. 
/ (200 * np.diag(target_spec.cov_target)) def prior(target_parameter): grad_prior = -target_parameter * Di @@ -361,18 +349,12 @@ def prior(target_parameter): return log_prior, grad_prior return posterior(self, - observed_target, - cov_target, - regress_target_score, - dispersion, + target_spec, prior, solve_args=solve_args) def approximate_grid_inference(self, - observed_target, - cov_target, - regress_target_score, - alternatives=None, + target_spec, solve_args={'tol': 1.e-12}, useIP=False): @@ -393,12 +375,10 @@ def approximate_grid_inference(self, """ G = approximate_grid_inference(self, - observed_target, - cov_target, - regress_target_score, + target_spec, solve_args=solve_args, useIP=useIP) - return G.summary(alternatives=alternatives) + return G.summary(alternatives=target_spec.alternatives) class multiple_queries(object): @@ -438,10 +418,10 @@ def fit(self): objective.fit() def summary(self, - observed_target, - opt_sampling_info, # a sequence of (cov_target, score_cov) + target_specs, + # a sequence of target_specs # objects in theory all cov_target - # should be about the same... + # should be about the same. as should the observed_target alternatives=None, parameter=None, level=0.9, @@ -471,25 +451,28 @@ def summary(self, Compute confidence intervals? """ + observed_target = target_specs[0].observed_target + alternatives = target_specs[0].alternatives + if parameter is None: parameter = np.zeros_like(observed_target) if alternatives is None: alternatives = ['twosided'] * observed_target.shape[0] - if len(self.objectives) != len(opt_sampling_info): + if len(self.objectives) != len(target_specs): raise ValueError("number of objectives and sampling cov infos do not match") self.opt_sampling_info = [] for i in range(len(self.objectives)): - if opt_sampling_info[i][0] is None or opt_sampling_info[i][1] is None: + if target_specs[i].cov_target is None or target_specs[i].regress_target_score is None: raise ValueError("did not input target and score covariance info") opt_sample, opt_logW = self.objectives[i].sampler.sample(ndraw, burnin) self.opt_sampling_info.append((self.objectives[i].sampler, opt_sample, opt_logW, - opt_sampling_info[i][0], - opt_sampling_info[i][1])) + target_specs[i].cov_target, + target_specs[i].regress_target_score)) pivots = self.coefficient_pvalues(observed_target, parameter=parameter, @@ -568,7 +551,7 @@ def coefficient_pvalues(self, return np.array(pvals) def confidence_intervals(self, - observed_target, + target_specs, sample_args=(), level=0.9): @@ -948,9 +931,7 @@ def sample(self, ndraw, burnin): return _sample, np.zeros(_sample.shape[0]) def selective_MLE(self, - observed_target, - cov_target, - regress_target_score, + target_spec, # initial (observed) value of optimization variables -- # used as a feasible point. # precise value used only for independent estimator @@ -976,13 +957,7 @@ def selective_MLE(self, Arguments passed to solver. """ - # self.M1 = self.M1 * dispersion - # self.M2 = self.M2 * (dispersion**2) - # self.M3 = self.M3 * (dispersion**2) - - return selective_MLE(observed_target, - cov_target, - regress_target_score, + return selective_MLE(target_spec, observed_soln, self.mean, self.covariance, @@ -1335,9 +1310,7 @@ def naive_pvalues(diag_cov, observed, parameter): pvalues[j] = 2 * min(pval, 1 - pval) return pvalues -def selective_MLE(observed_target, - cov_target, - regress_target_score, +def selective_MLE(target_spec, observed_soln, # initial (observed) value of # optimization variables -- used as a # feasible point. 
precise value used @@ -1387,6 +1360,10 @@ def selective_MLE(observed_target, Use python or C solver. """ + (observed_target, + cov_target, + regress_target_score) = target_spec[:3] + if np.asarray(observed_target).shape in [(), (0,)]: raise ValueError('no target specified') diff --git a/selectinf/randomized/screening.py b/selectinf/randomized/screening.py index 3c5df5cd6..0b61626b0 100644 --- a/selectinf/randomized/screening.py +++ b/selectinf/randomized/screening.py @@ -7,12 +7,13 @@ from .query import gaussian_query from .randomization import randomization +from ..base import TargetSpec class screening(gaussian_query): def __init__(self, observed_data, - covariance, + covariance, # unscaled randomizer, perturb=None): @@ -35,33 +36,38 @@ def multivariate_targets(self, """ Entries of the mean of \Sigma[E,E]^{-1}Z_E """ - score_linear = self.covariance[:, features].copy() / dispersion - Q = score_linear[features] - cov_target = np.linalg.inv(Q) + Q = self.covariance[features][:,features] + Qinv = np.linalg.inv(Q) + cov_target = np.linalg.inv(Q) * dispersion observed_target = -np.linalg.inv(Q).dot(self.observed_score_state[features]) - crosscov_target_score = -score_linear.dot(cov_target) + regress_target_score = -Qinv.dot(np.identity(self.covariance.shape[0])[features]) alternatives = ['twosided'] * features.sum() - return (observed_target, - cov_target * dispersion, - crosscov_target_score.T * dispersion, - dispersion, - alternatives) + return TargetSpec(observed_target, + cov_target, + regress_target_score, + alternatives, + dispersion) def full_targets(self, features, dispersion=1): """ - Entries of the mean of \Sigma[E,E]^{-1}Z_E + Entries of the mean of (\Sigma^{-1}Z)[E] """ - score_linear = self.covariance[:, features].copy() / dispersion - Q = self.covariance / dispersion - cov_target = (np.linalg.inv(Q)[features])[:, features] + + Q = self.covariance + Qinv = np.linalg.inv(Q) + cov_target = Qinv[features][:, features] * dispersion observed_target = -np.linalg.inv(Q).dot(self.observed_score_state)[features] - crosscov_target_score = -np.identity(Q.shape[0])[:, features] + regress_target_score = -Qinv[:, features] alternatives = ['twosided'] * features.sum() - return observed_target, cov_target * dispersion, crosscov_target_score.T * dispersion, dispersion, alternatives + return TargetSpec(observed_target, + cov_target, + regress_target_score.T, + alternatives, + dispersion) def marginal_targets(self, features, @@ -69,14 +75,17 @@ def marginal_targets(self, """ Entries of the mean of Z_E """ - score_linear = self.covariance[:, features] - Q = score_linear[features] - cov_target = Q + Q = self.covariance[features][:,features] + cov_target = Q * dispersion observed_target = -self.observed_score_state[features] - crosscov_target_score = -score_linear + regress_target_score = -np.identity(self.covariance.shape[0])[:,features] alternatives = ['twosided'] * features.sum() - return observed_target, cov_target, crosscov_target_score.T, dispersion, alternatives + return TargetSpec(observed_target, + cov_target, + regress_target_score.T, + alternatives, + dispersion) class marginal_screening(screening): diff --git a/selectinf/randomized/tests/test_BH.py b/selectinf/randomized/tests/test_BH.py index 34c26ac5f..59927c56a 100644 --- a/selectinf/randomized/tests/test_BH.py +++ b/selectinf/randomized/tests/test_BH.py @@ -61,16 +61,13 @@ def test_independent_estimator(n=100, n1=50, q=0.2, signal=3, p=100): cov_target = np.identity(selected.sum()) / n cross_cov = -np.identity(p)[selected] / n - 
(observed_target1, - cov_target1, - cross_cov1, - _) = BH_select.marginal_targets(selected) + target_spec = BH_select.marginal_targets(selected) - assert(np.linalg.norm(observed_target - observed_target1) / + assert(np.linalg.norm(observed_target - target_spec.observed_target) / np.linalg.norm(observed_target) < 1.e-7) - assert(np.linalg.norm(cov_target - cov_target1) / + assert(np.linalg.norm(cov_target - target_spec.cov_target) / np.linalg.norm(cov_target) < 1.e-7) - assert(np.linalg.norm(cross_cov - cross_cov1) / np.linalg.norm(cross_cov) + assert(np.linalg.norm(regress_target_score - target_spec.regress_target_score) / np.linalg.norm(regress_target_score) < 1.e-7) result = BH_select.selective_MLE(observed_target, cov_target, cross_cov)[0] @@ -121,15 +118,9 @@ def test_BH(n=500, if nonzero is not None: if marginal: - (observed_target, - cov_target, - crosscov_target_score, - alternatives) = BH_select.marginal_targets(nonzero) + target_spec = BH_select.marginal_targets(nonzero) else: - (observed_target, - cov_target, - crosscov_target_score, - alternatives) = BH_select.full_targets(nonzero, dispersion=sigma**2) + target_spec = BH_select.full_targets(nonzero, dispersion=sigma**2) if marginal: beta_target = true_mean[nonzero] @@ -137,20 +128,14 @@ def test_BH(n=500, beta_target = beta[nonzero] if use_MLE: - print('huh') - result = BH_select.selective_MLE(observed_target, - cov_target, - crosscov_target_score, + result = BH_select.selective_MLE(target_spec, level=level)[0] estimate = result['MLE'] pivots = ndist.cdf((estimate - beta_target) / result['SE']) pivots = 2 * np.minimum(pivots, 1 - pivots) # run summary else: - result = BH_select.summary(observed_target, - cov_target, - crosscov_target_score, - alternatives, + result = BH_select.summary(target_spec, compute_intervals=True, level=level, ndraw=20000, @@ -174,33 +159,5 @@ def test_both(): test_BH(marginal=True) test_BH(marginal=False) -def main(nsim=500, use_MLE=True, marginal=False): - - import matplotlib.pyplot as plt - import statsmodels.api as sm - U = np.linspace(0, 1, 101) - P0, PA, cover, length_int = [], [], [], [] - Ps = [] - for i in range(nsim): - p0, pA, cover_, intervals, pivots = test_BH(use_MLE=use_MLE, - marginal=marginal) - Ps.extend(pivots) - cover.extend(cover_) - P0.extend(p0) - PA.extend(pA) - print(np.mean(cover),'coverage so far') - - period = 10 - if use_MLE: - period = 50 - if i % period == 0 and i > 0: - plt.clf() - if len(P0) > 0: - plt.plot(U, sm.distributions.ECDF(P0)(U), 'b', label='null') - plt.plot(U, sm.distributions.ECDF(PA)(U), 'r', label='alt') - plt.plot(U, sm.distributions.ECDF(Ps)(U), 'tab:orange', label='pivot') - plt.plot([0, 1], [0, 1], 'k--') - plt.legend() - plt.savefig('BH_pvals.pdf') diff --git a/selectinf/randomized/tests/test_approx_reference.py b/selectinf/randomized/tests/test_approx_reference.py index 1b08b2235..7dc873368 100644 --- a/selectinf/randomized/tests/test_approx_reference.py +++ b/selectinf/randomized/tests/test_approx_reference.py @@ -1,7 +1,8 @@ import numpy as np from ...tests.instance import gaussian_instance -from ..lasso import lasso, selected_targets +from ..lasso import lasso +from ...base import selected_targets from ..approx_reference import approximate_grid_inference def test_inf(n=500, @@ -12,7 +13,7 @@ def test_inf(n=500, rho=0.4, randomizer_scale=1., equicorrelated=False, - useIP=False, + useIP=True, CI=False): inst, const = gaussian_instance, lasso.gaussian @@ -53,19 +54,14 @@ def test_inf(n=500, if nonzero.sum() > 0: beta_target = np.linalg.pinv(X[:, 
nonzero]).dot(X.dot(beta)) - (observed_target, - cov_target, - regress_target_score, - dispersion, - alternatives) = selected_targets(conv.loglike, - conv._W, - nonzero, - dispersion=dispersion) + target_spec = selected_targets(conv.loglike, + conv.observed_soln, + dispersion=dispersion) + print(target_spec) + approximate_grid_inf = approximate_grid_inference(conv, - observed_target, - cov_target, - regress_target_score, + target_spec, useIP=useIP) if CI is False: diff --git a/selectinf/randomized/tests/test_drop_losers.py b/selectinf/randomized/tests/test_drop_losers.py index 45bd3595d..5f212b740 100644 --- a/selectinf/randomized/tests/test_drop_losers.py +++ b/selectinf/randomized/tests/test_drop_losers.py @@ -109,7 +109,7 @@ def test_compare_topK(p=20, randomizer = randomization.gaussian(np.diag(np.array(full_std)**2 / np.array(n_1)) - covariance) - randomized_topK = topK(full_means, + randomized_topK = topK(np.asarray(full_means), covariance, randomizer, K, @@ -117,11 +117,9 @@ def test_compare_topK(p=20, randomized_topK.fit(perturb=perturb) - (observed_target, - target_cov, - target_score_cov, - _) = randomized_topK.marginal_targets(randomized_topK.selection_variable['variables']) - + target_spec = randomized_topK.marginal_targets(randomized_topK.selection_variable['variables']) + print('var', randomized_topK.selection_variable['variables']) + # try with a degenerate covariance now means2 = df2.groupby('arm').mean()['data'].iloc[range(p)] @@ -135,52 +133,33 @@ def test_compare_topK(p=20, np.array(n_1)) - covariance2) - degenerate_topK = topK(means2, + degenerate_topK = topK(np.asarray(means2), covariance2, degenerate_randomizer, K, perturb=perturb2) np.random.seed(0) - summary1 = randomized_topK.summary(observed_target, - target_cov, - target_score_cov, - alternatives=['twosided']*K, - ndraw=10000, - burnin=2000, - compute_intervals=True) + summary1 = randomized_topK.selective_MLE(target_spec)[0] np.random.seed(0) - summary2 = dtl.summary(ndraw=10000, - burnin=2000) + summary2 = dtl.MLE_inference()[0] + + np.testing.assert_allclose(summary1['MLE'], summary2['MLE'], rtol=1.e-3) np.testing.assert_allclose(summary1['pvalue'], summary2['pvalue'], rtol=1.e-3) - np.testing.assert_allclose(summary1['target'], summary2['target'], rtol=1.e-3) - np.testing.assert_allclose(summary1['lower_confidence'], summary2['lower_confidence'], rtol=1.e-3) - np.testing.assert_allclose(summary1['upper_confidence'], summary2['upper_confidence'], rtol=1.e-3) + #np.testing.assert_allclose(summary1['lower_confidence'], summary2['lower_confidence'], rtol=1.e-3) + #np.testing.assert_allclose(summary1['upper_confidence'], summary2['upper_confidence'], rtol=1.e-3) np.random.seed(0) degenerate_topK.fit(perturb=perturb2) - summary3 = degenerate_topK.summary(observed_target, - target_cov, - target_score_cov, - alternatives=['twosided']*K, + summary3 = degenerate_topK.summary(target_spec, ndraw=10000, burnin=2000, compute_intervals=True) np.testing.assert_allclose(summary1['pvalue'], summary3['pvalue'], rtol=1.e-3) np.testing.assert_allclose(summary1['target'], summary3['target'], rtol=1.e-3) - np.testing.assert_allclose(summary1['lower_confidence'], summary3['lower_confidence'], rtol=1.e-3) - np.testing.assert_allclose(summary1['upper_confidence'], summary3['upper_confidence'], rtol=1.e-3) - + #np.testing.assert_allclose(summary1['lower_confidence'], summary3['lower_confidence'], rtol=1.e-3) + #np.testing.assert_allclose(summary1['upper_confidence'], summary3['upper_confidence'], rtol=1.e-3) -def main(nsim=100, 
use_MLE=True): - - P0, cover = [], [] - - for i in range(nsim): - p0, cover_ = test_drop_losers(use_MLE=use_MLE) - cover.extend(cover_) - P0.extend(p0) - print('coverage', np.mean(cover)) diff --git a/selectinf/randomized/tests/test_exact_reference.py b/selectinf/randomized/tests/test_exact_reference.py index 6e7e73f6d..534e4beaf 100644 --- a/selectinf/randomized/tests/test_exact_reference.py +++ b/selectinf/randomized/tests/test_exact_reference.py @@ -54,18 +54,12 @@ def test_inf(n=500, if nonzero.sum() > 0: beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) - (observed_target, - cov_target, - regress_target_score, - dispersion, - alternatives) = selected_targets(conv.loglike, - conv.observed_soln, - dispersion=dispersion) + target_spec = selected_targets(conv.loglike, + conv.observed_soln, + dispersion=dispersion) exact_grid_inf = exact_grid_inference(conv, - observed_target, - cov_target, - regress_target_score) + target_spec) if CI is False: pivot = exact_grid_inf._pivots(beta_target) diff --git a/selectinf/randomized/tests/test_hiv_data.py b/selectinf/randomized/tests/test_hiv_data.py new file mode 100644 index 000000000..4c3a741e8 --- /dev/null +++ b/selectinf/randomized/tests/test_hiv_data.py @@ -0,0 +1,117 @@ +import numpy as np +import pandas as pd +from scipy.stats import norm as ndist + +from ..lasso import split_lasso +from ..posterior_inference import (langevin_sampler, + gibbs_sampler) + +from ...base import selected_targets +from ...tests.instance import HIV_NRTI +from ...tests.flags import SET_SEED, SMALL_SAMPLES +from ...tests.decorators import set_sampling_params_iftrue, set_seed_iftrue + +@set_seed_iftrue(SET_SEED) +@set_sampling_params_iftrue(SMALL_SAMPLES, nsample=50, nburnin=10) +def test_hiv_data(nsample=10000, + nburnin=500, + level=0.90, + split_proportion=0.50, + seedn=1): + np.random.seed(seedn) + + alpha = (1 - level) / 2 + Z_quantile = ndist.ppf(1 - alpha) + + X, Y, _ = HIV_NRTI(standardize=True) + Y *= 15 + n, p = X.shape + X /= np.sqrt(n) + + ols_fit = np.linalg.pinv(X).dot(Y) + _sigma = np.linalg.norm(Y - X.dot(ols_fit)) / np.sqrt(n - p - 1) + + const = split_lasso.gaussian + + dispersion = _sigma ** 2 + + W = 1 * np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * _sigma + + conv = const(X, + Y, + W, + proportion=split_proportion) + + signs = conv.fit() + nonzero = signs != 0 + + target_spec = selected_targets(conv.loglike, + conv.observed_soln, + dispersion=dispersion) + + mle, inverse_info = conv.selective_MLE(target_spec, + level=level, + solve_args={'tol': 1.e-12})[:2] + + approx_inf = conv.approximate_grid_inference(target_spec, + useIP=True) + + posterior_inf = conv.posterior(target_spec, + dispersion=dispersion) + + samples_langevin = langevin_sampler(posterior_inf, + nsample=nsample, + nburnin=nburnin, + step=1.) 
+ + lower_langevin = np.percentile(samples_langevin, int(alpha * 100), axis=0) + upper_langevin = np.percentile(samples_langevin, int((1 - alpha) * 100), axis=0) + + samples_gibbs, scale_gibbs = gibbs_sampler(posterior_inf, + nsample=nsample, + nburnin=nburnin) + + lower_gibbs = np.percentile(samples_gibbs, int(alpha * 100), axis=0) + upper_gibbs = np.percentile(samples_gibbs, int((1 - alpha) * 100), axis=0) + + naive_est = np.linalg.pinv(X[:, nonzero]).dot(Y) + naive_cov = dispersion * np.linalg.inv(X[:, nonzero].T.dot(X[:, nonzero])) + naive_intervals = np.vstack([naive_est - Z_quantile * np.sqrt(np.diag(naive_cov)), + naive_est + Z_quantile * np.sqrt(np.diag(naive_cov))]).T + + X_split = X[~conv._selection_idx, :] + Y_split = Y[~conv._selection_idx] + split_est = np.linalg.pinv(X_split[:, nonzero]).dot(Y_split) + split_cov = dispersion * np.linalg.inv(X_split[:, nonzero].T.dot(X_split[:, nonzero])) + split_intervals = np.vstack([split_est - Z_quantile * np.sqrt(np.diag(split_cov)), + split_est + Z_quantile * np.sqrt(np.diag(split_cov))]).T + + print("lengths: adjusted intervals Langevin, Gibbs, MLE1, MLE2, approx ", + np.mean(upper_langevin - lower_langevin), + np.mean(upper_gibbs - lower_gibbs), + np.mean((2 * Z_quantile) * np.sqrt(np.diag(posterior_inf.inverse_info))), + np.mean(mle['upper_confidence'] - mle['lower_confidence']), + np.mean(approx_inf['upper_confidence'] - approx_inf['lower_confidence']) + ) + + print("lengths: naive intervals ", np.mean(naive_intervals[:, 1] - naive_intervals[:, 0])) + + print("lengths: split intervals ", np.mean(split_intervals[:, 1] - split_intervals[:, 0])) + + scale_interval = np.percentile(scale_gibbs, [alpha * 100, (1 - alpha) * 100]) + output = pd.DataFrame({'Langevin_lower_credible': lower_langevin, + 'Langevin_upper_credible': upper_langevin, + 'Gibbs_lower_credible': lower_gibbs, + 'Gibbs_upper_credible': upper_gibbs, + 'MLE_lower_confidence': mle['lower_confidence'], + 'MLE_upper_confidence': mle['upper_confidence'], + 'approx_lower_confidence': approx_inf['lower_confidence'], + 'approx_upper_confidence': approx_inf['upper_confidence'], + 'Split_lower_confidence': split_intervals[:, 0], + 'Split_upper_confidence': split_intervals[:, 1], + 'Naive_lower_confidence': naive_intervals[:, 0], + 'Naive_upper_confidence': naive_intervals[:, 1] + }) + + return output, scale_interval, _sigma + diff --git a/selectinf/randomized/tests/test_lasso.py b/selectinf/randomized/tests/test_lasso.py index 1ba443c29..07d1e9989 100644 --- a/selectinf/randomized/tests/test_lasso.py +++ b/selectinf/randomized/tests/test_lasso.py @@ -60,32 +60,17 @@ def test_highdim_lasso(n=500, nonzero = signs != 0 if target == 'full': - (observed_target, - cov_target, - cov_target_score, - dispersion, - alternatives) = full_targets(conv.loglike, - conv.observed_soln) + target_spec = full_targets(conv.loglike, + conv.observed_soln) elif target == 'selected': - (observed_target, - cov_target, - cov_target_score, - dispersion, - alternatives) = selected_targets(conv.loglike, - conv.observed_soln) + target_spec = selected_targets(conv.loglike, + conv.observed_soln) elif target == 'debiased': - (observed_target, - cov_target, - cov_target_score, - dispersion, - alternatives) = debiased_targets(conv.loglike, - conv.observed_soln, - penalty=conv.penalty) - - result = conv.summary(observed_target, - cov_target, - cov_target_score, - alternatives, + target_spec = debiased_targets(conv.loglike, + conv.observed_soln, + penalty=conv.penalty) + + result = conv.summary(target_spec, ndraw=ndraw, 
burnin=burnin, compute_intervals=True) @@ -150,32 +135,17 @@ def test_AR_randomization(n=300, nonzero = signs != 0 if target == 'full': - (observed_target, - cov_target, - cov_target_score, - dispersion, - alternatives) = full_targets(conv.loglike, - conv.observed_soln) + target_spec = full_targets(conv.loglike, + conv.observed_soln) elif target == 'selected': - (observed_target, - cov_target, - cov_target_score, - dispersion, - alternatives) = selected_targets(conv.loglike, - conv.observed_soln) + target_spec = selected_targets(conv.loglike, + conv.observed_soln) elif target == 'debiased': - (observed_target, - cov_target, - cov_target_score, - dispersion, - alternatives) = debiased_targets(conv.loglike, - conv.observed_soln, - penalty=conv.penalty) - - result = conv.summary(observed_target, - cov_target, - cov_target_score, - alternatives, + target_spec = debiased_targets(conv.loglike, + conv.observed_soln, + penalty=conv.penalty) + + result = conv.summary(target_spec, ndraw=ndraw, burnin=burnin, compute_intervals=True) @@ -275,24 +245,13 @@ def test_sqrt_highdim_lasso(n=500, np.testing.assert_allclose(soln, soln3) if full: - (observed_target, - cov_target, - cov_target_score, - dispersion, - alternatives) = full_targets(conv.loglike, - conv.observed_soln) + target_spec = full_targets(conv.loglike, + conv.observed_soln) else: - (observed_target, - cov_target, - cov_target_score, - dispersion, - alternatives) = selected_targets(conv.loglike, - conv.observed_soln) + target_spec = selected_targets(conv.loglike, + conv.observed_soln) - result = conv.summary(observed_target, - cov_target, - cov_target_score, - alternatives, + result = conv.summary(target_spec, ndraw=ndraw, burnin=burnin, compute_intervals=False) @@ -427,23 +386,12 @@ def test_logistic_lasso(n=500, # sanity check if full: - (observed_target, - cov_target, - cov_target_score, - dispersion, - alternatives) = full_targets(conv.loglike, - conv.observed_soln) + target_spec = full_targets(conv.loglike, + conv.observed_soln) else: - (observed_target, - cov_target, - cov_target_score, - dispersion, - alternatives) = selected_targets(conv.loglike, + target_spec = selected_targets(conv.loglike, conv.observed_soln) - result = conv.summary(observed_target, - cov_target, - cov_target_score, - alternatives, + result = conv.summary(target_spec, ndraw=ndraw, burnin=burnin, compute_intervals=False) diff --git a/selectinf/randomized/tests/test_marginal_screening.py b/selectinf/randomized/tests/test_marginal_screening.py index e8dac39aa..50c769fb6 100644 --- a/selectinf/randomized/tests/test_marginal_screening.py +++ b/selectinf/randomized/tests/test_marginal_screening.py @@ -46,28 +46,15 @@ def test_marginal(n=500, if marginal: - (observed_target, - cov_target, - crosscov_target_score, - dispersion, - alternatives) = marginal_select.marginal_targets(nonzero) + target_spec = marginal_select.marginal_targets(nonzero) else: - (observed_target, - cov_target, - crosscov_target_score, - dispersion, - alternatives) = marginal_select.multivariate_targets(nonzero, dispersion=sigma**2) + target_spec = marginal_select.multivariate_targets(nonzero, dispersion=sigma**2) if use_MLE: - result = marginal_select.selective_MLE(observed_target, - cov_target, - crosscov_target_score)[0] + result = marginal_select.selective_MLE(target_spec)[0] # run summary else: - result = marginal_select.summary(observed_target, - cov_target, - crosscov_target_score, - alternatives, + result = marginal_select.summary(target_spec, compute_intervals=True) intervals = 
np.asarray(result[['lower_confidence', 'upper_confidence']]) @@ -136,28 +123,19 @@ def test_simple(n=100, if nonzero.sum() > 0: - (observed_target, - cov_target, - crosscov_target_score, - dispersion, - alternatives) = marginal_select.marginal_targets(nonzero) + target_spec = marginal_select.marginal_targets(nonzero) if use_MLE: - result = marginal_select.selective_MLE(observed_target, - cov_target, - crosscov_target_score) + result = marginal_select.selective_MLE(target_spec) # run summary else: - result = marginal_select.summary(observed_target, - cov_target, - crosscov_target_score, - alternatives, + result = marginal_select.summary(target_spec, compute_intervals=True) pval = result['pvalue'] intervals = np.asarray(result[['lower_confidence', 'upper_confidence']]) print(pval) - beta_target = cov_target.dot(true_mean[nonzero]) + beta_target = target_spec.cov_target.dot(true_mean[nonzero]) print("beta_target and intervals", beta_target, intervals) coverage = (beta_target > intervals[:, 0]) * (beta_target < intervals[:, 1]) print("coverage for selected target", coverage.sum()/float(nonzero.sum())) @@ -166,25 +144,3 @@ def test_simple(n=100, def test_both(): test_marginal(marginal=True) test_marginal(marginal=False) - -def main(nsim=1000, test_fn=test_marginal, use_MLE=False): - - import matplotlib.pyplot as plt - import statsmodels.api as sm - U = np.linspace(0, 1, 101) - P0, PA, cover, length_int = [], [], [], [] - for i in range(nsim): - p0, pA, cover_, intervals = test_fn(use_MLE=use_MLE) - - cover.extend(cover_) - P0.extend(p0) - PA.extend(pA) - print(np.mean(cover),'coverage so far') - - if i % 50 == 0 and i > 0: - plt.clf() - plt.plot(U, sm.distributions.ECDF(P0)(U), 'b', label='null') - plt.plot(U, sm.distributions.ECDF(PA)(U), 'r', label='alt') - plt.plot([0, 1], [0, 1], 'k--') - plt.savefig('marginal_screening_pvals.pdf') - diff --git a/selectinf/randomized/tests/test_multiple_queries.py b/selectinf/randomized/tests/test_multiple_queries.py index 267b7e53b..a56a8a440 100644 --- a/selectinf/randomized/tests/test_multiple_queries.py +++ b/selectinf/randomized/tests/test_multiple_queries.py @@ -62,25 +62,15 @@ def test_multiple_queries(n=500, if nonzero.sum() == 0: return [], [] - (observed_target1, - cov_target1, - cov_target_score1, - dispersion1, - alternatives1) = conv1.multivariate_targets(nonzero, sigma**2) - - (observed_target2, - cov_target2, - cov_target_score2, - dispersion2, - alternatives2) = selected_targets(conv2.loglike, - conv2.observed_soln, - features=nonzero) + target_spec1 = conv1.multivariate_targets(nonzero, sigma**2) + + target_spec2 = selected_targets(conv2.loglike, + conv2.observed_soln, + features=nonzero) mq = multiple_queries([conv1, conv2]) - results = mq.summary(observed_target1, - [(cov_target1, cov_target_score1), - (cov_target2, cov_target_score2)], + results = mq.summary([target_spec1, target_spec2], compute_intervals=True) pval = np.asarray(results['pvalue']) return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0] diff --git a/selectinf/randomized/tests/test_posterior.py b/selectinf/randomized/tests/test_posterior.py index 66780a5b4..b6c1c8ddb 100644 --- a/selectinf/randomized/tests/test_posterior.py +++ b/selectinf/randomized/tests/test_posterior.py @@ -53,19 +53,12 @@ def test_Langevin(n=500, beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) - (observed_target, - cov_target, - regress_target_score, - dispersion, - alternatives) = selected_targets(conv.loglike, - conv.observed_soln, - dispersion=dispersion) - - posterior_inf = 
conv.posterior(observed_target, - cov_target, - regress_target_score, + target_spec = selected_targets(conv.loglike, + conv.observed_soln, dispersion=dispersion) + posterior_inf = conv.posterior(target_spec) + samples = langevin_sampler(posterior_inf, nsample=nsample, nburnin=nburnin) @@ -106,6 +99,7 @@ def test_coverage(nsim=100, @set_seed_iftrue(SET_SEED) @set_sampling_params_iftrue(SMALL_SAMPLES, nsample=50, nburnin=10) def test_instance(nsample=100, nburnin=50): + np.random.seed(10) n, p, s = 500, 100, 5 X = np.random.standard_normal((n, p)) beta = np.zeros(p) @@ -122,19 +116,13 @@ def test_instance(nsample=100, nburnin=50): M[-3:] = 1 dispersion = np.linalg.norm(Y - X[:, M].dot(np.linalg.pinv(X[:, M]).dot(Y))) ** 2 / (n - M.sum()) - (observed_target, - cov_target, - regress_target_score, - dispersion, - alternatives)= selected_targets(L.loglike, - L.observed_soln, - features=M, - dispersion=dispersion) - - posterior_inf = L.posterior(observed_target, - cov_target, - regress_target_score, - dispersion=dispersion) + target_spec = selected_targets(L.loglike, + L.observed_soln, + features=M, + dispersion=dispersion) + print(target_spec.dispersion, dispersion) + + posterior_inf = L.posterior(target_spec) samples = langevin_sampler(posterior_inf, nsample=nsample, @@ -178,18 +166,14 @@ def test_flexible_prior1(nsample=100, M[-3:] = 1 dispersion = np.linalg.norm(Y - X[:, M].dot(np.linalg.pinv(X[:, M]).dot(Y))) ** 2 / (n - M.sum()) - (observed_target, - cov_target, - regress_target_score, - dispersion, - alternatives) = selected_targets(L.loglike, - L.observed_soln, - features=M, - dispersion=dispersion) + target_spec = selected_targets(L.loglike, + L.observed_soln, + features=M, + dispersion=dispersion) # default prior - Di = 1. / (200 * np.diag(cov_target)) + Di = 1. / (200 * np.diag(target_spec.cov_target)) def prior(target_parameter): grad_prior = -target_parameter * Di @@ -200,10 +184,7 @@ def prior(target_parameter): np.random.set_state(seed_state) Z1 = np.random.standard_normal() - posterior_inf1 = L.posterior(observed_target, - cov_target, - regress_target_score, - dispersion=dispersion, + posterior_inf1 = L.posterior(target_spec, prior=prior) W1 = np.random.standard_normal() @@ -213,10 +194,7 @@ def prior(target_parameter): np.random.set_state(seed_state) Z2 = np.random.standard_normal() - posterior_inf2 = L.posterior(observed_target, - cov_target, - regress_target_score, - dispersion=dispersion) + posterior_inf2 = L.posterior(target_spec) W2 = np.random.standard_normal() samples2 = langevin_sampler(posterior_inf2, @@ -248,14 +226,10 @@ def test_flexible_prior2(nsample=1000, nburnin=50): M[-3:] = 1 dispersion = np.linalg.norm(Y - X[:, M].dot(np.linalg.pinv(X[:, M]).dot(Y))) ** 2 / (n - M.sum()) - (observed_target, - cov_target, - regress_target_score, - dispersion, - alternatives) = selected_targets(L.loglike, - L.observed_soln, - features=M, - dispersion=dispersion) + target_spec = selected_targets(L.loglike, + L.observed_soln, + features=M, + dispersion=dispersion) prior_var = 0.05 ** 2 @@ -264,11 +238,8 @@ def prior(target_parameter): log_prior = -np.linalg.norm(target_parameter) ** 2 / (2. 
* prior_var) return log_prior, grad_prior - posterior_inf = L.posterior(observed_target, - cov_target, - regress_target_score, - dispersion=dispersion, - prior=prior) + posterior_inf = L.posterior(target_spec, + prior=prior) adaptive_proposal = np.linalg.inv(np.linalg.inv(posterior_inf.inverse_info) + np.identity(posterior_inf.inverse_info.shape[0]) / 0.05 ** 2) @@ -279,118 +250,4 @@ def prior(target_parameter): return samples -@set_seed_iftrue(SET_SEED) -@set_sampling_params_iftrue(SMALL_SAMPLES, nsample=50, nburnin=10) -def test_hiv_data(nsample=10000, - nburnin=500, - level=0.90, - split_proportion=0.50, - seedn=1): - np.random.seed(seedn) - - alpha = (1 - level) / 2 - Z_quantile = ndist.ppf(1 - alpha) - - X, Y, _ = HIV_NRTI(standardize=True) - Y *= 15 - n, p = X.shape - X /= np.sqrt(n) - - ols_fit = np.linalg.pinv(X).dot(Y) - _sigma = np.linalg.norm(Y - X.dot(ols_fit)) / np.sqrt(n - p - 1) - - const = split_lasso.gaussian - - dispersion = _sigma ** 2 - - W = 1 * np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * _sigma - - conv = const(X, - Y, - W, - proportion=split_proportion) - - signs = conv.fit() - nonzero = signs != 0 - - (observed_target, - cov_target, - regress_target_score, - dispersion, - alternatives) = selected_targets(conv.loglike, - conv.observed_soln, - dispersion=dispersion) - - mle, inverse_info = conv.selective_MLE(observed_target, - cov_target, - regress_target_score, - level=level, - solve_args={'tol': 1.e-12})[:2] - - approx_inf = conv.approximate_grid_inference(observed_target, - cov_target, - regress_target_score, - useIP=False) - - posterior_inf = conv.posterior(observed_target, - cov_target, - regress_target_score, - dispersion=dispersion) - - samples_langevin = langevin_sampler(posterior_inf, - nsample=nsample, - nburnin=nburnin, - step=1.) 
- - lower_langevin = np.percentile(samples_langevin, int(alpha * 100), axis=0) - upper_langevin = np.percentile(samples_langevin, int((1 - alpha) * 100), axis=0) - - samples_gibbs, scale_gibbs = gibbs_sampler(posterior_inf, - nsample=nsample, - nburnin=nburnin) - - lower_gibbs = np.percentile(samples_gibbs, int(alpha * 100), axis=0) - upper_gibbs = np.percentile(samples_gibbs, int((1 - alpha) * 100), axis=0) - - naive_est = np.linalg.pinv(X[:, nonzero]).dot(Y) - naive_cov = dispersion * np.linalg.inv(X[:, nonzero].T.dot(X[:, nonzero])) - naive_intervals = np.vstack([naive_est - Z_quantile * np.sqrt(np.diag(naive_cov)), - naive_est + Z_quantile * np.sqrt(np.diag(naive_cov))]).T - - X_split = X[~conv._selection_idx, :] - Y_split = Y[~conv._selection_idx] - split_est = np.linalg.pinv(X_split[:, nonzero]).dot(Y_split) - split_cov = dispersion * np.linalg.inv(X_split[:, nonzero].T.dot(X_split[:, nonzero])) - split_intervals = np.vstack([split_est - Z_quantile * np.sqrt(np.diag(split_cov)), - split_est + Z_quantile * np.sqrt(np.diag(split_cov))]).T - - print("lengths: adjusted intervals Langevin, Gibbs, MLE1, MLE2, approx ", - np.mean(upper_langevin - lower_langevin), - np.mean(upper_gibbs - lower_gibbs), - np.mean((2 * Z_quantile) * np.sqrt(np.diag(posterior_inf.inverse_info))), - np.mean(mle['upper_confidence'] - mle['lower_confidence']), - np.mean(approx_inf['upper_confidence'] - approx_inf['lower_confidence']) - ) - - print("lengths: naive intervals ", np.mean(naive_intervals[:, 1] - naive_intervals[:, 0])) - - print("lengths: split intervals ", np.mean(split_intervals[:, 1] - split_intervals[:, 0])) - - scale_interval = np.percentile(scale_gibbs, [alpha * 100, (1 - alpha) * 100]) - output = pd.DataFrame({'Langevin_lower_credible': lower_langevin, - 'Langevin_upper_credible': upper_langevin, - 'Gibbs_lower_credible': lower_gibbs, - 'Gibbs_upper_credible': upper_gibbs, - 'MLE_lower_confidence': mle['lower_confidence'], - 'MLE_upper_confidence': mle['upper_confidence'], - 'approx_lower_confidence': approx_inf['lower_confidence'], - 'approx_upper_confidence': approx_inf['upper_confidence'], - 'Split_lower_confidence': split_intervals[:, 0], - 'Split_upper_confidence': split_intervals[:, 1], - 'Naive_lower_confidence': naive_intervals[:, 0], - 'Naive_upper_confidence': naive_intervals[:, 1] - }) - - return output, scale_interval, _sigma - diff --git a/selectinf/randomized/tests/test_selective_MLE_high.py b/selectinf/randomized/tests/test_selective_MLE_high.py index 818cdc012..552d2b9ce 100644 --- a/selectinf/randomized/tests/test_selective_MLE_high.py +++ b/selectinf/randomized/tests/test_selective_MLE_high.py @@ -65,29 +65,18 @@ def test_full_targets(n=200, dispersion = None if n > p: - (observed_target, - cov_target, - regress_target_score, - dispersion, - alternatives) = full_targets(conv.loglike, - conv.observed_soln, - nonzero, - dispersion=dispersion) + target_spec = full_targets(conv.loglike, + conv.observed_soln, + nonzero, + dispersion=dispersion) else: - (observed_target, - cov_target, - regress_target_score, - dispersion, - alternatives) = debiased_targets(conv.loglike, - conv.observed_soln, - nonzero, - penalty=conv.penalty, - dispersion=dispersion) - - result = conv.selective_MLE(observed_target, - cov_target, - regress_target_score, - dispersion)[0] + target_spec = debiased_targets(conv.loglike, + conv.observed_soln, + nonzero, + penalty=conv.penalty, + dispersion=dispersion) + + result = conv.selective_MLE(target_spec)[0] pval = result['pvalue'] estimate = result['MLE'] @@ -147,17 
+136,11 @@ def test_selected_targets(n=2000, if full_dispersion: dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) - (observed_target, - cov_target, - regress_target_score, - dispersion, - alternatives) = selected_targets(conv.loglike, - conv.observed_soln, - dispersion=dispersion) - - result = conv.selective_MLE(observed_target, - cov_target, - regress_target_score, + target_spec = selected_targets(conv.loglike, + conv.observed_soln, + dispersion=dispersion) + + result = conv.selective_MLE(target_spec, dispersion)[0] pval = result['pvalue'] @@ -188,20 +171,14 @@ def test_instance(): print("check ", M) dispersion = np.linalg.norm(Y - X[:, M].dot(np.linalg.pinv(X[:, M]).dot(Y))) ** 2 / (n - M.sum()) - (observed_target, - cov_target, - regress_target_score, - dispersion, - alternatives) = selected_targets(L.loglike, - L.observed_soln, - features=M, - dispersion=dispersion) + target_spec = selected_targets(L.loglike, + L.observed_soln, + features=M, + dispersion=dispersion) - print("check shapes", observed_target.shape, E.sum()) + print("check shapes", target_spec.observed_target.shape, E.sum()) - result = L.selective_MLE(observed_target, - cov_target, - regress_target_score, + result = L.selective_MLE(target_spec, dispersion)[0] intervals = np.asarray(result[['lower_confidence', 'upper_confidence']]) @@ -256,17 +233,11 @@ def test_selected_targets_disperse(n=500, if full_dispersion: dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) - (observed_target, - cov_target, - regress_target_score, - dispersion, - alternatives) = selected_targets(conv.loglike, - conv.observed_soln, - dispersion=dispersion) - - result = conv.selective_MLE(observed_target, - cov_target, - regress_target_score, + target_spec = selected_targets(conv.loglike, + conv.observed_soln, + dispersion=dispersion) + + result = conv.selective_MLE(target_spec, dispersion)[0] pval = result['pvalue'] @@ -317,17 +288,11 @@ def test_logistic(n=2000, if nonzero.sum() > 0: - (observed_target, - cov_target, - cov_target_score, - dispersion, - alternatives) = selected_targets(conv.loglike, - conv.observed_soln, - dispersion=1) - - result = conv.selective_MLE(observed_target, - cov_target, - cov_target_score)[0] + target_spec = selected_targets(conv.loglike, + conv.observed_soln, + dispersion=1) + + result = conv.selective_MLE(target_spec)[0] estimate = result['MLE'] pval = result['pvalue'] intervals = np.asarray(result[['lower_confidence', @@ -373,17 +338,11 @@ def test_logistic_split(n=2000, if nonzero.sum() > 0: - (observed_target, - cov_target, - cov_target_score, - dispersion, - alternatives) = selected_targets(conv.loglike, - conv.observed_soln, - dispersion=1) - - result = conv.selective_MLE(observed_target, - cov_target, - cov_target_score)[0] + target_spec = selected_targets(conv.loglike, + conv.observed_soln, + dispersion=1) + + result = conv.selective_MLE(target_spec)[0] estimate = result['MLE'] pval = result['pvalue'] intervals = np.asarray(result[['lower_confidence', @@ -429,17 +388,11 @@ def test_poisson(n=2000, if nonzero.sum() > 0: - (observed_target, - cov_target, - cov_target_score, - dispersion, - alternatives) = selected_targets(conv.loglike, - conv.observed_soln, - dispersion=1) - - result = conv.selective_MLE(observed_target, - cov_target, - cov_target_score)[0] + target_spec = selected_targets(conv.loglike, + conv.observed_soln, + dispersion=1) + + result = conv.selective_MLE(target_spec)[0] estimate = result['MLE'] pval = result['pvalue'] intervals = 
np.asarray(result[['lower_confidence', @@ -485,17 +438,11 @@ def test_poisson_split(n=2000, if nonzero.sum() > 0: - (observed_target, - cov_target, - cov_target_score, - dispersion, - alternatives) = selected_targets(conv.loglike, - conv.observed_soln, - dispersion=1) - - result = conv.selective_MLE(observed_target, - cov_target, - cov_target_score)[0] + target_spec = selected_targets(conv.loglike, + conv.observed_soln, + dispersion=1) + + result = conv.selective_MLE(target_spec)[0] estimate = result['MLE'] pval = result['pvalue'] intervals = np.asarray(result[['lower_confidence', @@ -544,17 +491,11 @@ def test_cox(n=2000, cox_full = rr.glm.cox(X, T, S) full_hess = cox_full.hessian(conv.observed_soln) - (observed_target, - cov_target, - cov_target_score, - dispersion, - alternatives) = selected_targets(conv.loglike, - conv.observed_soln, - dispersion=1) - - result = conv.selective_MLE(observed_target, - cov_target, - cov_target_score)[0] + target_spec = selected_targets(conv.loglike, + conv.observed_soln, + dispersion=1) + + result = conv.selective_MLE(target_spec)[0] estimate = result['MLE'] pval = result['pvalue'] intervals = np.asarray(result[['lower_confidence', @@ -603,17 +544,11 @@ def test_cox_split(n=2000, cox_full = rr.glm.cox(X, T, S) full_hess = cox_full.hessian(conv.observed_soln) - (observed_target, - cov_target, - cov_target_score, - dispersion, - alternatives) = selected_targets(conv.loglike, - conv.observed_soln, - dispersion=1) - - result = conv.selective_MLE(observed_target, - cov_target, - cov_target_score)[0] + target_spec = selected_targets(conv.loglike, + conv.observed_soln, + dispersion=1) + + result = conv.selective_MLE(target_spec)[0] estimate = result['MLE'] pval = result['pvalue'] intervals = np.asarray(result[['lower_confidence', @@ -669,22 +604,16 @@ def test_scale_invariant_split(n=200, print('feature_weights', conv.feature_weights[0] / scale) dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) - (observed_target, - cov_target, - cov_target_score, - dispersion, - alternatives) = selected_targets(conv.loglike, - conv.observed_soln, - dispersion=dispersion) - - print('dispersion', dispersion/scale**2) - print('target', observed_target[0]/scale) - print('cov_target', cov_target[0,0]/scale**2) - print('cov_target_score', cov_target_score[0,0]/scale**2) + target_spec = selected_targets(conv.loglike, + conv.observed_soln, + dispersion=dispersion) + + print('dispersion', target_spec.dispersion/scale**2) + print('target', target_spec.observed_target[0]/scale) + print('cov_target', target_spec.cov_target[0,0]/scale**2) + print('regress_target_score', target_spec.regress_target_score[0,0]/scale**2) - result = conv.selective_MLE(observed_target, - cov_target, - cov_target_score)[0] + result = conv.selective_MLE(target_spec)[0] print(result['MLE'] / scale) results.append(result) @@ -751,22 +680,16 @@ def test_scale_invariant(n=200, print('perturb', conv._initial_omega[0] / scale) dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) - (observed_target, - cov_target, - cov_target_score, - dispersion, - alternatives) = selected_targets(conv.loglike, - conv.observed_soln, - dispersion=dispersion) - - print('dispersion', dispersion/scale**2) - print('target', observed_target[0]/scale) - print('cov_target', cov_target[0,0]/scale**2) - print('cov_target_score', cov_target_score[0,0]/scale**2) + target_spec = selected_targets(conv.loglike, + conv.observed_soln, + dispersion=dispersion) + + print('dispersion', 
target_spec.dispersion/scale**2) + print('target', target_spec.observed_target[0]/scale) + print('cov_target', target_spec.cov_target[0,0]/scale**2) + print('regress_target_score', target_spec.regress_target_score[0,0]/scale**2) - result = conv.selective_MLE(observed_target, - cov_target, - cov_target_score)[0] + result = conv.selective_MLE(target_spec)[0] print(result['MLE'] / scale) results.append(result) diff --git a/selectinf/randomized/tests/test_selective_MLE_onedim.py b/selectinf/randomized/tests/test_selective_MLE_onedim.py index 33599a725..dd7ded2ff 100644 --- a/selectinf/randomized/tests/test_selective_MLE_onedim.py +++ b/selectinf/randomized/tests/test_selective_MLE_onedim.py @@ -4,7 +4,8 @@ from scipy.stats import norm as ndist import nose.tools as nt -from ..lasso import lasso, full_targets +from ..lasso import lasso +from ...base import full_targets, TargetSpec from ...tests.instance import gaussian_instance def test_onedim_lasso(n=50000, W=1.5, signal=2., sigma=2, randomizer_scale=1): @@ -28,18 +29,11 @@ def test_onedim_lasso(n=50000, W=1.5, signal=2., sigma=2, randomizer_scale=1): # this is current code where we estimate sigma - (observed_target, - cov_target, - regress_target_score, - dispersion, - alternatives) = full_targets(conv.loglike, - conv._W, - nonzero) + target_spec = full_targets(conv.loglike, + conv.observed_soln, + features=nonzero) - result = conv.selective_MLE(observed_target, - cov_target, - regress_target_score, - np.ones((1,)) * signs[0]) + result = conv.selective_MLE(target_spec) estimate_cur = float(result[0]['MLE']) Z_cur = float(result[0]['Zvalue']) @@ -50,11 +44,13 @@ def test_onedim_lasso(n=50000, W=1.5, signal=2., sigma=2, randomizer_scale=1): target_Z = X.T.dot(Y) / np.sqrt((X**2).sum(0)) - result2 = conv.sampler.selective_MLE(target_Z, - sigma**2 * np.ones((1,1)), - -np.ones((1,1)) / np.sqrt((X**2).sum(0)), + target = TargetSpec(target_Z, + sigma**2 * np.ones((1,1)), + -np.ones((1,1)) / np.sqrt((X**2).sum(0)), + ['greater'], + sigma**2) + result2 = conv.sampler.selective_MLE(target, np.ones((1,)) * signs[0], - dispersion=sigma**2, solve_args={'tol':1.e-12}) estimate, I, Z, pv = (float(result2[0]['MLE']), result2[1], @@ -75,7 +71,7 @@ def test_onedim_lasso(n=50000, W=1.5, signal=2., sigma=2, randomizer_scale=1): pivot = ndist.cdf((estimate_cur - signal) / np.sqrt(I_cur[0,0])) - debug = Falsee + debug = False if debug: print(estimate, approx_MLE, 'selective MLE') print(beta[nonzero], 'truth') diff --git a/selectinf/randomized/tests/test_slope.py b/selectinf/randomized/tests/test_slope.py index 65cc553c7..1545d0f47 100644 --- a/selectinf/randomized/tests/test_slope.py +++ b/selectinf/randomized/tests/test_slope.py @@ -152,21 +152,13 @@ def test_randomized_slope(n=2000, if nonzero.sum() > 0: if target == 'full': - (observed_target, - cov_target, - cov_target_score, - dispersion, - alternatives) = full_targets(conv.loglike, - conv.observed_soln, - dispersion=sigma_) + target_spec = full_targets(conv.loglike, + conv.observed_soln, + dispersion=sigma_) elif target == 'selected': - (observed_target, - cov_target, - cov_target_score, - dispersion, - alternatives) = selected_targets(conv.loglike, - conv.observed_soln, - dispersion=sigma_) + target_spec = selected_targets(conv.loglike, + conv.observed_soln, + dispersion=sigma_) if target == "selected": beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) @@ -174,14 +166,9 @@ def test_randomized_slope(n=2000, beta_target = beta[nonzero] if use_MLE: - result = conv.selective_MLE(observed_target, - 
cov_target, - cov_target_score)[0] + result = conv.selective_MLE(target_spec)[0] else: - result = conv.summary(observed_target, - cov_target, - cov_target_score, - alternatives, + result = conv.summary(target_spec, compute_intervals=True, ndraw=150000) pval = np.asarray(result['pvalue']) diff --git a/selectinf/randomized/tests/test_split_lasso.py b/selectinf/randomized/tests/test_split_lasso.py index f994c05cc..0e0bd855e 100644 --- a/selectinf/randomized/tests/test_split_lasso.py +++ b/selectinf/randomized/tests/test_split_lasso.py @@ -7,10 +7,10 @@ import regreg.api as rr -from ..lasso import (split_lasso, - selected_targets, +from ..lasso import split_lasso +from ...base import (selected_targets, full_targets, - debiased_targets) + debiased_targets) from ...tests.instance import gaussian_instance from ...tests.flags import SET_SEED from ...tests.decorators import set_sampling_params_iftrue, set_seed_iftrue @@ -66,44 +66,26 @@ def test_split_lasso(n=100, if nonzero.sum() > 0: if target == 'full': - (observed_target, - cov_target, - cov_target_score, - alternatives) = full_targets(conv.loglike, - conv._W, - nonzero, - dispersion=sigma**2) + target_spec = full_targets(conv.loglike, + conv.observed_soln, + dispersion=sigma**2) elif target == 'selected': - (observed_target, - cov_target, - cov_target_score, - alternatives) = selected_targets(conv.loglike, - conv._W, - nonzero) #, - #dispersion=sigma**2) + target_spec = selected_targets(conv.loglike, + conv.observed_soln, + dispersion=sigma**2) elif target == 'debiased': - (observed_target, - cov_target, - cov_target_score, - alternatives) = debiased_targets(conv.loglike, - conv._W, - nonzero, - penalty=conv.penalty, - dispersion=sigma**2) - - result = conv.summary(observed_target, - cov_target, - cov_target_score, - alternatives, + target_spec = debiased_targets(conv.loglike, + conv.observed_soln, + penalty=conv.penalty, + dispersion=sigma**2) + + result = conv.summary(target_spec, ndraw=ndraw, burnin=burnin, compute_intervals=False) - MLE_result, observed_info_mean, _ = conv.selective_MLE( - observed_target, - cov_target, - cov_target_score) + MLE_result, observed_info_mean, _ = conv.selective_MLE(target_spec) final_estimator = np.asarray(MLE_result['MLE']) pval = np.asarray(result['pvalue']) @@ -134,32 +116,3 @@ def test_all_targets(n=100, p=20, signal_fac=1.5, s=5, sigma=3, rho=0.4): rho=rho, target=target) -def main(nsim=500, n=100, p=200, target='selected', sigma=3, s=3): - - import matplotlib.pyplot as plt - P0, PA = [], [] - from statsmodels.distributions import ECDF - - for i in range(nsim): - p0, pA = test_split_lasso(n=n, p=p, target=target, sigma=sigma, s=s) - print(len(p0), len(pA)) - if not (len(pA) < s and target=='selected'): - P0.extend(p0) - PA.extend(pA) - - P0_clean = np.array(P0) - - P0_clean = P0_clean[P0_clean > 1.e-5] # - print(np.mean(P0_clean), np.std(P0_clean), np.mean(np.array(PA) < 0.05), np.sum(np.array(PA) < 0.05) / (i+1), np.mean(np.array(P0) < 0.05), np.mean(P0_clean < 0.05), np.mean(np.array(P0) < 1e-5), 'null pvalue + power + failure') - - if i % 3 == 0 and i > 0: - U = np.linspace(0, 1, 101) - plt.clf() - if len(P0_clean) > 0: - plt.plot(U, ECDF(P0_clean)(U)) - if len(PA) > 0: - plt.plot(U, ECDF(PA)(U), 'r') - plt.plot([0, 1], [0, 1], 'k--') - plt.savefig("plot.pdf") - plt.show() - diff --git a/selectinf/randomized/tests/test_standalone_lasso_mle.py b/selectinf/randomized/tests/test_standalone_lasso_mle.py index 4151fa8a4..5482460da 100644 --- a/selectinf/randomized/tests/test_standalone_lasso_mle.py +++ 
b/selectinf/randomized/tests/test_standalone_lasso_mle.py @@ -5,9 +5,10 @@ import regreg.api as rr -from selectinf.randomized.lasso import split_lasso, selected_targets -from selectinf.randomized.query import selective_MLE -from selectinf.randomized.approx_reference import approximate_grid_inference +from ..lasso import split_lasso +from ...base import selected_targets, TargetSpec +from ..query import selective_MLE +from ..approx_reference import approximate_grid_inference def test_standalone_inference(n=2000, p=100, @@ -46,20 +47,14 @@ def test_standalone_inference(n=2000, full_hess = cox_full.hessian(padded_soln) selected_hess = full_hess[nonzero][:,nonzero] - (observed_target, - cov_target, - cov_target_score, - alternatives) = selected_targets(cox_lasso.loglike, - None, - nonzero, - hessian=full_hess, - dispersion=1) + target_spec = selected_targets(cox_lasso.loglike, + cox_lasso.observed_soln, + hessian=full_hess, + dispersion=1) if nonzero.sum(): if approx: - approx_result = cox_lasso.approximate_grid_inference(observed_target, - cov_target, - cov_target_score) + approx_result = cox_lasso.approximate_grid_inference(target_spec) approx_pval = approx_result['pvalue'] testval = approximate_normalizer_inference(proportion, @@ -75,9 +70,7 @@ def test_standalone_inference(n=2000, approx_pval = np.empty(nonzero.sum())*np.nan if MLE: - MLE_result = cox_lasso.selective_MLE(observed_target, - cov_target, - cov_target_score)[0] + MLE_result = cox_lasso.selective_MLE(target_spec)[0] MLE_pval = MLE_result['pvalue'] else: MLE_pval = np.empty(nonzero.sum())*np.nan @@ -125,9 +118,12 @@ def approximate_mle_inference(training_proportion, target_score_cov = -np.identity(nselect) observed_target = selected_beta_refit - result = selective_MLE(observed_target, - target_cov, - target_score_cov, + target_spec = selected_targets(cox_lasso.loglike, + cox_lasso.observed_soln, + hessian=full_hess, + dispersion=1) + + result = selective_MLE(target_spec, training_betahat * selected_signs, cond_mean, cond_cov, @@ -168,9 +164,12 @@ def approximate_normalizer_inference(training_proportion, target_score_cov = -np.identity(nselect) observed_target = selected_beta_refit - inverse_info = selective_MLE(observed_target, - target_cov, - target_score_cov, + target = TargetSpec(observed_target, + target_cov, + target_score_cov, + None) + + inverse_info = selective_MLE(target_spec, training_betahat * selected_signs, cond_mean, cond_cov, @@ -180,9 +179,7 @@ def approximate_normalizer_inference(training_proportion, level=level, useC=True)[1] - G = approximate_grid_inference(observed_target, - target_cov, - target_score_cov, + G = approximate_grid_inference(target_spec, inverse_info, training_betahat * selected_signs, cond_mean, diff --git a/selectinf/randomized/tests/test_topK.py b/selectinf/randomized/tests/test_topK.py index 45dbb54b9..2c1def227 100644 --- a/selectinf/randomized/tests/test_topK.py +++ b/selectinf/randomized/tests/test_topK.py @@ -46,28 +46,15 @@ def test_topK(n=500, if nonzero.sum() > 0: if marginal: - (observed_target, - cov_target, - crosscov_target_score, - dipsersion, - alternatives) = topK_select.marginal_targets(nonzero) + target_spec = topK_select.marginal_targets(nonzero) else: - (observed_target, - cov_target, - crosscov_target_score, - dispersion, - alternatives) = topK_select.multivariate_targets(nonzero, dispersion=sigma**2) + target_spec = topK_select.multivariate_targets(nonzero, dispersion=sigma**2) if use_MLE: - result = topK_select.selective_MLE(observed_target, - cov_target, - 
crosscov_target_score)[0] + result = topK_select.selective_MLE(target_spec)[0] # run summary else: - result = topK_select.summary(observed_target, - cov_target, - crosscov_target_score, - alternatives, + result = topK_select.summary(target_spec, compute_intervals=True) lower = np.asarray(result['lower_confidence']) upper = np.asarray(result['upper_confidence']) @@ -128,22 +115,12 @@ def test_bias_topK(n=500, if marginal: beta_target = true_mean[nonzero] - (observed_target, - cov_target, - crosscov_target_score, - dispersion, - alternatives) = topK_select.marginal_targets(nonzero) + target_spec = topK_select.marginal_targets(nonzero) else: beta_target = beta[nonzero] - (observed_target, - cov_target, - crosscov_target_score, - dispersion, - alternatives) = topK_select.multivariate_targets(nonzero, dispersion=sigma**2) + target_spec = topK_select.multivariate_targets(nonzero, dispersion=sigma**2) - result = topK_select.selective_MLE(observed_target, - cov_target, - crosscov_target_score)[0] + result = topK_select.selective_MLE(target_spec)[0] bias_mle = np.asarray(result['MLE'])-beta_target bias_indest = np.asarray(result['unbiased'])-beta_target @@ -151,45 +128,3 @@ def test_bias_topK(n=500, return bias_mle, bias_indest - -# def main(nsim=5000, use_MLE=False): -# -# import matplotlib.pyplot as plt -# import statsmodels.api as sm -# U = np.linspace(0, 1, 101) -# -# P0, PA, cover, length_int = [], [], [], [] -# for i in range(nsim): -# p0, pA, cover_, intervals = test_topK(use_MLE=use_MLE) -# -# cover.extend(cover_) -# P0.extend(p0) -# PA.extend(pA) -# print(np.mean(cover),'coverage so far') -# -# period = 10 -# if use_MLE: -# period = 50 -# if i % period == 0 and i > 0: -# plt.clf() -# plt.plot(U, sm.distributions.ECDF(P0)(U), 'b', label='null') -# plt.plot(U, sm.distributions.ECDF(PA)(U), 'r', label='alt') -# plt.plot([0, 1], [0, 1], 'k--') -# plt.legend() -# plt.savefig('topK_pvals.pdf') - - -def main(nsim=500): - _bias_mle = [] - _bias_indest = [] - - for i in range(nsim): - bias_mle, bias_indest = test_bias_topK() - _bias_mle.extend(bias_mle) - _bias_indest.extend(bias_indest) - - print(np.mean(_bias_mle), np.mean(_bias_indest), 'bias so far: mle and independent estimate ') - - -if __name__ == "__main__": - main(nsim=500) diff --git a/selectinf/randomized/tests/test_unbiased_estimates.py b/selectinf/randomized/tests/test_unbiased_estimates.py index eb8beac0d..a7a91cb41 100644 --- a/selectinf/randomized/tests/test_unbiased_estimates.py +++ b/selectinf/randomized/tests/test_unbiased_estimates.py @@ -1,6 +1,7 @@ import numpy as np -from ..lasso import lasso, selected_targets +from ..lasso import lasso +from ...base import selected_targets from ...tests.instance import gaussian_instance def UMVU(query, @@ -31,8 +32,8 @@ def UMVU(query, _prec = np.linalg.inv(implied_cov[:n][:, :n]) linear_coef = (np.linalg.pinv(X[:, feat]).dot(_prec)) - offset = -np.linalg.pinv(X[:, feat]).dot(X.dot(query.initial_subgrad) - - _prec.dot(implied_cov[:n][:, n:]).dot(query.opt_linear.T.dot(query.initial_subgrad))) * (randomizer_prec) + offset = -np.linalg.pinv(X[:, feat]).dot(X.dot(query.observed_subgrad) + - _prec.dot(implied_cov[:n][:, n:]).dot(query.opt_linear.T.dot(query.observed_subgrad))) * (randomizer_prec) linear_coef *= dispersion offset *= dispersion @@ -47,24 +48,20 @@ def EST(query, feat, dispersion): - (observed_target, - cov_target, - cov_target_score, - alternatives) = selected_targets(query.loglike, - query._W, - feat, - dispersion=dispersion) + target_spec = selected_targets(query.loglike, + 
query.observed_soln, + dispersion=dispersion) _, randomizer_prec = query.randomizer.cov_prec cond_cov = query.cond_cov logdens_linear = query.sampler.logdens_transform[0] cond_mean = query.cond_mean - prec_target = np.linalg.inv(cov_target) + prec_target = np.linalg.inv(target_spec.cov_target) prec_opt = np.linalg.inv(cond_cov) - target_linear = cov_target_score.T.dot(prec_target) - target_offset = (-X.T.dot(Y) + query.initial_subgrad) - target_linear.dot(observed_target) + target_linear = target_spec.regress_target_score.T.dot(prec_target) #XXX problem here just switched cov_target_score to regress_target_score + target_offset = (-X.T.dot(Y) + query.observed_subgrad) - target_linear.dot(target_spec.observed_target) target_lin = - logdens_linear.dot(target_linear) target_off = cond_mean - target_lin.dot(observed_target) @@ -141,10 +138,3 @@ def test_UMVU(n=500, print("check ", np.allclose(est-umvu, np.zeros(est.shape[0]), atol=1e-03), est-umvu) return umvu, est - -def main(): - - test_UMVU(n=100, p=400, s=5) - -if __name__ == "__main__": - main() From 3320bd8a5ccfb613890659a709c0ba5694898e72 Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Tue, 31 Aug 2021 00:16:52 -0400 Subject: [PATCH 141/187] test scale for posterior log likelihood --- selectinf/randomized/approx_reference.py | 8 +- selectinf/randomized/query.py | 104 +++++++-------- .../randomized/tests/test_approx_reference.py | 62 ++++++++- selectinf/randomized/tests/test_posterior.py | 120 ++++++++++++++++++ 4 files changed, 236 insertions(+), 58 deletions(-) diff --git a/selectinf/randomized/approx_reference.py b/selectinf/randomized/approx_reference.py index 7d10c4ef1..31c3b88e3 100644 --- a/selectinf/randomized/approx_reference.py +++ b/selectinf/randomized/approx_reference.py @@ -172,7 +172,7 @@ def _construct_families(self): self._construct_density() self._families = [] - + _log_ref = np.zeros((self.ntarget, 1000)) for m in range(self.ntarget): observed_target_uni = (self.observed_target[m]).reshape((1,)) @@ -189,6 +189,7 @@ def _construct_families(self): logW = (approx_log_ref - 0.5 * (self.stat_grid[m] - self.observed_target[m]) ** 2 / var_target) logW -= logW.max() + _log_ref[m,:] = logW self._families.append(discrete_family(self.stat_grid[m], np.exp(logW))) else: @@ -204,10 +205,11 @@ def _construct_families(self): 0.5 * (grid - self.observed_target[m]) ** 2 / var_target) logW -= logW.max() + _log_ref[m, :] = logW self._families.append(discrete_family(grid, np.exp(logW))) - + self._log_ref = _log_ref # construction of families follows `selectinf.learning.core` # logG = - 0.5 * grid**2 / var_target @@ -253,7 +255,7 @@ def _approx_pivots(self, pivot.append(_cdf) else: raise ValueError('alternative should be in ["twosided", "less", "greater"]') - return pivot + return pivot, self._log_ref def _approx_intervals(self, level=0.9): diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index b2cd82373..a7d1908f9 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -106,59 +106,59 @@ def fit(self, perturb=None): # Private methods - def _setup_sampler(self, - linear_part, - offset, - opt_linear, - observed_subgrad, - dispersion=1): - - A, b = linear_part, offset - if not np.all(A.dot(self.observed_opt_state) - b <= 0): - raise ValueError('constraints not satisfied') - - (cond_mean, - cond_cov, - cond_precision, - regress_opt, - M1, - M2, - M3) = self._setup_implied_gaussian(opt_linear, - observed_subgrad, - dispersion=dispersion) + def _setup_sampler(self, + linear_part, + 
offset, + opt_linear, + observed_subgrad, + dispersion=1): + + A, b = linear_part, offset + if not np.all(A.dot(self.observed_opt_state) - b <= 0): + raise ValueError('constraints not satisfied') + + (cond_mean, + cond_cov, + cond_precision, + regress_opt, + M1, + M2, + M3) = self._setup_implied_gaussian(opt_linear, + observed_subgrad, + dispersion=dispersion) + + def log_density(regress_opt, u, cond_prec, opt, score): # u == subgrad + if score.ndim == 1: + mean_term = regress_opt.dot(score.T + u).T + else: + mean_term = regress_opt.dot(score.T + u[:, None]).T + arg = opt - mean_term + return - 0.5 * np.sum(arg * cond_prec.dot(arg.T).T, 1) - def log_density(regress_opt, u, cond_prec, opt, score): # u == subgrad - if score.ndim == 1: - mean_term = regress_opt.dot(score.T + u).T - else: - mean_term = regress_opt.dot(score.T + u[:, None]).T - arg = opt - mean_term - return - 0.5 * np.sum(arg * cond_prec.dot(arg.T).T, 1) - - log_density = functools.partial(log_density, - regress_opt, - observed_subgrad, - cond_precision) - - self.cond_mean, self.cond_cov = cond_mean, cond_cov - - affine_con = constraints(A, - b, - mean=cond_mean, - covariance=cond_cov) - - self.sampler = affine_gaussian_sampler(affine_con, - self.observed_opt_state, - self.observed_score_state, - log_density, - regress_opt, # not needed? - observed_subgrad, - opt_linear, # L - M1, - M2, - M3, - selection_info=self.selection_variable, - useC=self.useC) + log_density = functools.partial(log_density, + regress_opt, + observed_subgrad, + cond_precision) + + self.cond_mean, self.cond_cov = cond_mean, cond_cov + + affine_con = constraints(A, + b, + mean=cond_mean, + covariance=cond_cov) + + self.sampler = affine_gaussian_sampler(affine_con, + self.observed_opt_state, + self.observed_score_state, + log_density, + regress_opt, # not needed? 
+ observed_subgrad, + opt_linear, # L + M1, + M2, + M3, + selection_info=self.selection_variable, + useC=self.useC) def _setup_implied_gaussian(self, opt_linear, diff --git a/selectinf/randomized/tests/test_approx_reference.py b/selectinf/randomized/tests/test_approx_reference.py index 7dc873368..d0351e017 100644 --- a/selectinf/randomized/tests/test_approx_reference.py +++ b/selectinf/randomized/tests/test_approx_reference.py @@ -5,7 +5,8 @@ from ...base import selected_targets from ..approx_reference import approximate_grid_inference -def test_inf(n=500, +def test_inf(seedn, + n=500, p=100, signal_fac=1., s=5, @@ -16,6 +17,7 @@ def test_inf(n=500, useIP=True, CI=False): + np.random.seed(seedn) inst, const = gaussian_instance, lasso.gaussian signal = np.sqrt(signal_fac * 2 * np.log(p)) @@ -65,9 +67,9 @@ def test_inf(n=500, useIP=useIP) if CI is False: - pivot = approximate_grid_inf._approx_pivots(beta_target) + pivot, log_ref = approximate_grid_inf._approx_pivots(beta_target) - return pivot + return pivot, log_ref else: lci, uci = approximate_grid_inf._approx_intervals(level=0.90) beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) @@ -77,3 +79,57 @@ def test_inf(n=500, return np.mean(coverage), np.mean(length) +def main(nsim=300, CI = False): + + import matplotlib as mpl + mpl.use('tkagg') + import matplotlib.pyplot as plt + from statsmodels.distributions.empirical_distribution import ECDF + + if CI is False: + _pivot = [] + for i in range(nsim): + _pivot.extend(test_inf(n=100, + p=400, + signal_fac=0.5, + s=0, + sigma=2., + rho=0.30, + randomizer_scale=1., + equicorrelated=True, + useIP=False, + CI=False)) + + print("iteration completed ", i) + + plt.clf() + ecdf_MLE = ECDF(np.asarray(_pivot)) + grid = np.linspace(0, 1, 101) + plt.plot(grid, ecdf_MLE(grid), c='blue', marker='^') + plt.plot(grid, grid, 'k--') + plt.show() + + if CI is True: + coverage_ = 0. + length_ = 0. 
+ for n in range(nsim): + cov, len = test_inf(n=100, + p=400, + signal_fac=0.5, + s=5, + sigma=2., + rho=0.30, + randomizer_scale=1., + equicorrelated=True, + useIP=True, + CI=True) + + coverage_ += cov + length_ += len + print("coverage so far ", coverage_ / (n + 1.)) + print("lengths so far ", length_ / (n + 1.)) + print("iteration completed ", n + 1) + +if __name__ == "__main__": + main(nsim=1, CI = False) + diff --git a/selectinf/randomized/tests/test_posterior.py b/selectinf/randomized/tests/test_posterior.py index b6c1c8ddb..62e7b783f 100644 --- a/selectinf/randomized/tests/test_posterior.py +++ b/selectinf/randomized/tests/test_posterior.py @@ -250,4 +250,124 @@ def prior(target_parameter): return samples +def test_hiv_data(nsample=10000, + nburnin=500, + level=0.90, + split_proportion=0.50, + seedn=1): + np.random.seed(seedn) + + alpha = (1 - level) / 2 + Z_quantile = ndist.ppf(1 - alpha) + + X, Y, _ = HIV_NRTI(standardize=True) + Y *= 15 + n, p = X.shape + X /= np.sqrt(n) + + ols_fit = np.linalg.pinv(X).dot(Y) + _sigma = np.linalg.norm(Y - X.dot(ols_fit)) / np.sqrt(n - p - 1) + + const = split_lasso.gaussian + + dispersion = _sigma ** 2 + + W = 1 * np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * _sigma + + conv = const(X, + Y, + W, + proportion=split_proportion) + + signs = conv.fit() + nonzero = signs != 0 + + (observed_target, + cov_target, + regress_target_score, + dispersion, + alternatives) = selected_targets(conv.loglike, + conv._W, + nonzero, + dispersion=dispersion) + + mle, inverse_info = conv.selective_MLE(observed_target, + cov_target, + regress_target_score, + dispersion, + level=level, + solve_args={'tol': 1.e-12})[:2] + + approx_inf = conv.approximate_grid_inference(observed_target, + cov_target, + regress_target_score, + dispersion) + + posterior_inf = conv.posterior(observed_target, + cov_target, + regress_target_score, + dispersion=dispersion) + + samples_langevin = langevin_sampler(posterior_inf, + nsample=nsample, + nburnin=nburnin, + step=1.) 
+ + lower_langevin = np.percentile(samples_langevin, int(alpha * 100), axis=0) + upper_langevin = np.percentile(samples_langevin, int((1 - alpha) * 100), axis=0) + + samples_gibbs, scale_gibbs = gibbs_sampler(posterior_inf, + nsample=nsample, + nburnin=nburnin) + + lower_gibbs = np.percentile(samples_gibbs, int(alpha * 100), axis=0) + upper_gibbs = np.percentile(samples_gibbs, int((1 - alpha) * 100), axis=0) + + naive_est = np.linalg.pinv(X[:, nonzero]).dot(Y) + naive_cov = dispersion * np.linalg.inv(X[:, nonzero].T.dot(X[:, nonzero])) + naive_intervals = np.vstack([naive_est - Z_quantile * np.sqrt(np.diag(naive_cov)), + naive_est + Z_quantile * np.sqrt(np.diag(naive_cov))]).T + + X_split = X[~conv._selection_idx, :] + Y_split = Y[~conv._selection_idx] + split_est = np.linalg.pinv(X_split[:, nonzero]).dot(Y_split) + split_cov = dispersion * np.linalg.inv(X_split[:, nonzero].T.dot(X_split[:, nonzero])) + split_intervals = np.vstack([split_est - Z_quantile * np.sqrt(np.diag(split_cov)), + split_est + Z_quantile * np.sqrt(np.diag(split_cov))]).T + + print("lengths: adjusted intervals Langevin, Gibbs, MLE1, MLE2, approx ", + np.mean(upper_langevin - lower_langevin), + np.mean(upper_gibbs - lower_gibbs), + np.mean((2 * Z_quantile) * np.sqrt(np.diag(posterior_inf.inverse_info))), + np.mean(mle['upper_confidence'] - mle['lower_confidence']), + np.mean(approx_inf['upper_confidence'] - approx_inf['lower_confidence']) + ) + + print("lengths: naive intervals ", np.mean(naive_intervals[:, 1] - naive_intervals[:, 0])) + + print("lengths: split intervals ", np.mean(split_intervals[:, 1] - split_intervals[:, 0])) + + scale_interval = np.percentile(scale_gibbs, [alpha * 100, (1 - alpha) * 100]) + output = pd.DataFrame({'Langevin_lower_credible': lower_langevin, + 'Langevin_upper_credible': upper_langevin, + 'Gibbs_lower_credible': lower_gibbs, + 'Gibbs_upper_credible': upper_gibbs, + 'MLE_lower_confidence': mle['lower_confidence'], + 'MLE_upper_confidence': mle['upper_confidence'], + 'approx_lower_confidence': approx_inf['lower_confidence'], + 'approx_upper_confidence': approx_inf['upper_confidence'], + 'Split_lower_confidence': split_intervals[:, 0], + 'Split_upper_confidence': split_intervals[:, 1], + 'Naive_lower_confidence': naive_intervals[:, 0], + 'Naive_upper_confidence': naive_intervals[:, 1] + }) + + return output, scale_interval, _sigma + + +if __name__ == "__main__": + #test_hiv_data(split_proportion=0.50) + test_coverage(nsim=1) + + From 5fb9c81def15ba009920b4b33dc7c3eb9f456e8d Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Wed, 8 Sep 2021 21:42:23 -0400 Subject: [PATCH 142/187] commit changes before switch --- selectinf/randomized/posterior_inference.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index 4284f5211..a3bdbd39e 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -1,12 +1,19 @@ from __future__ import division, print_function import numpy as np +import typing + from scipy.stats import norm as ndist, invgamma from scipy.linalg import fractional_matrix_power from ..algorithms.barrier_affine import solve_barrier_affine_py +class PosteriorAtt(typing.NamedTuple): + + logPosterior: float + grad_logPosterior: np.ndarray + class posterior(object): """ Parameters @@ -124,8 +131,11 @@ def log_posterior(self, log_prior, grad_prior = self.prior(target_parameter) - return (self.dispersion * (log_lik - 
self.log_ref) / sigmasq + log_prior, - self.dispersion * grad_lik / sigmasq + grad_prior) + log_posterior = self.dispersion * (log_lik - self.log_ref) / sigmasq + log_prior + grad_log_posterior = self.dispersion * grad_lik / sigmasq + grad_prior + + return PosteriorAtt(log_posterior, + grad_log_posterior) ### Private method @@ -228,7 +238,7 @@ def gibbs_sampler(selective_posterior, scale_update_sq = invgamma.rvs(a=(0.1 + selective_posterior.ntarget + selective_posterior.ntarget / 2), - scale=0.1 - ((scale_update ** 2) * sampler.posterior_[0]), + scale=0.1 - ((scale_update ** 2) * sampler.posterior_.logPosterior), size=1) scale_samples[i] = np.sqrt(scale_update_sq) sampler.scaling = np.sqrt(scale_update_sq) @@ -269,7 +279,7 @@ def __next__(self): while True: self.posterior_ = self.gradient_map(self.state, self.scaling) _proposal = self.proposal_sqrt.dot(self._noise.rvs(self._shape)) - candidate = (self.state + self.stepsize * self.proposal_scale.dot(self.posterior_[1]) + candidate = (self.state + self.stepsize * self.proposal_scale.dot(self.posterior_.grad_logPosterior) + np.sqrt(2.) * _proposal * self._sqrt_step) if not np.all(np.isfinite(self.gradient_map(candidate, self.scaling)[1])): From 68b1aa42340273330da776c4de1efe59d9020ab2 Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Tue, 21 Sep 2021 12:37:04 -0400 Subject: [PATCH 143/187] fix alignment of _setup_sampler --- selectinf/randomized/query.py | 104 +++++++++++++++++----------------- 1 file changed, 52 insertions(+), 52 deletions(-) diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index a7d1908f9..b2cd82373 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -106,59 +106,59 @@ def fit(self, perturb=None): # Private methods - def _setup_sampler(self, - linear_part, - offset, - opt_linear, - observed_subgrad, - dispersion=1): - - A, b = linear_part, offset - if not np.all(A.dot(self.observed_opt_state) - b <= 0): - raise ValueError('constraints not satisfied') - - (cond_mean, - cond_cov, - cond_precision, - regress_opt, - M1, - M2, - M3) = self._setup_implied_gaussian(opt_linear, - observed_subgrad, - dispersion=dispersion) - - def log_density(regress_opt, u, cond_prec, opt, score): # u == subgrad - if score.ndim == 1: - mean_term = regress_opt.dot(score.T + u).T - else: - mean_term = regress_opt.dot(score.T + u[:, None]).T - arg = opt - mean_term - return - 0.5 * np.sum(arg * cond_prec.dot(arg.T).T, 1) - - log_density = functools.partial(log_density, - regress_opt, + def _setup_sampler(self, + linear_part, + offset, + opt_linear, + observed_subgrad, + dispersion=1): + + A, b = linear_part, offset + if not np.all(A.dot(self.observed_opt_state) - b <= 0): + raise ValueError('constraints not satisfied') + + (cond_mean, + cond_cov, + cond_precision, + regress_opt, + M1, + M2, + M3) = self._setup_implied_gaussian(opt_linear, observed_subgrad, - cond_precision) - - self.cond_mean, self.cond_cov = cond_mean, cond_cov - - affine_con = constraints(A, - b, - mean=cond_mean, - covariance=cond_cov) - - self.sampler = affine_gaussian_sampler(affine_con, - self.observed_opt_state, - self.observed_score_state, - log_density, - regress_opt, # not needed? 
- observed_subgrad, - opt_linear, # L - M1, - M2, - M3, - selection_info=self.selection_variable, - useC=self.useC) + dispersion=dispersion) + + def log_density(regress_opt, u, cond_prec, opt, score): # u == subgrad + if score.ndim == 1: + mean_term = regress_opt.dot(score.T + u).T + else: + mean_term = regress_opt.dot(score.T + u[:, None]).T + arg = opt - mean_term + return - 0.5 * np.sum(arg * cond_prec.dot(arg.T).T, 1) + + log_density = functools.partial(log_density, + regress_opt, + observed_subgrad, + cond_precision) + + self.cond_mean, self.cond_cov = cond_mean, cond_cov + + affine_con = constraints(A, + b, + mean=cond_mean, + covariance=cond_cov) + + self.sampler = affine_gaussian_sampler(affine_con, + self.observed_opt_state, + self.observed_score_state, + log_density, + regress_opt, # not needed? + observed_subgrad, + opt_linear, # L + M1, + M2, + M3, + selection_info=self.selection_variable, + useC=self.useC) def _setup_implied_gaussian(self, opt_linear, From 2ac3f270f82a949388025eeda90f2bf7b1f40e24 Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Sun, 3 Oct 2021 17:33:43 -0400 Subject: [PATCH 144/187] added setup_inference post fit: can pass dispersion argument --- selectinf/randomized/lasso.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/selectinf/randomized/lasso.py b/selectinf/randomized/lasso.py index 6beb26dc0..ad4c1ac57 100644 --- a/selectinf/randomized/lasso.py +++ b/selectinf/randomized/lasso.py @@ -220,22 +220,30 @@ def signed_basis_vector(p, j, s): A_scaling = -np.identity(num_opt_var) b_scaling = np.zeros(num_opt_var) - self._setup_sampler_data = (A_scaling[:active.sum()], - b_scaling[:active.sum()], - opt_linear, - self.observed_subgrad) - #### to be fixed -- set the cov_score here without dispersion self._unscaled_cov_score = _hessian - ##### - - if num_opt_var > 0: - self._setup_sampler(*self._setup_sampler_data) + self.num_opt_var = num_opt_var + + self.A_scaling = A_scaling + self.b_scaling = b_scaling + self.active = active return active_signs + def setup_inference(self, + dispersion): + + self._setup_sampler_data = (self.A_scaling[:self.active.sum()], + self.b_scaling[:self.active.sum()], + self.opt_linear, + self.observed_subgrad, + dispersion) + + if self.num_opt_var > 0: + self._setup_sampler(*self._setup_sampler_data) + def _solve_randomized_problem(self, perturb=None, solve_args={'tol': 1.e-12, 'min_its': 50}): From 990152c04f446785344d88d90a39e18bc0b52ce1 Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Sun, 3 Oct 2021 23:27:14 -0400 Subject: [PATCH 145/187] update tests-- selected targets (mle) --- selectinf/randomized/query.py | 15 +++++++-------- .../randomized/tests/test_selective_MLE_high.py | 6 ++++-- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index b2cd82373..c75bee97e 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -314,8 +314,8 @@ def selective_MLE(self, return self.sampler.selective_MLE(target_spec, self.observed_opt_state, - level=level, - solve_args=solve_args) + solve_args=solve_args, + level=level) def posterior(self, target_spec, @@ -1363,7 +1363,7 @@ def selective_MLE(target_spec, (observed_target, cov_target, regress_target_score) = target_spec[:3] - + if np.asarray(observed_target).shape in [(), (0,)]: raise ValueError('no target specified') @@ -1415,13 +1415,12 @@ def selective_MLE(target_spec, pvalues = 2 * np.minimum(pvalues, 1 - pvalues) - alpha = 1 - level + 
alpha = 1. - level + quantile = ndist.ppf(1 - alpha / 2.) - intervals = np.vstack([final_estimator - - quantile * np.sqrt(np.diag(observed_info_mean)), - final_estimator + - quantile * np.sqrt(np.diag(observed_info_mean))]).T + intervals = np.vstack([final_estimator - quantile * np.sqrt(np.diag(observed_info_mean)), + final_estimator + quantile * np.sqrt(np.diag(observed_info_mean))]).T log_ref = val + conjugate_arg.T.dot(cond_cov).dot(conjugate_arg) / 2. diff --git a/selectinf/randomized/tests/test_selective_MLE_high.py b/selectinf/randomized/tests/test_selective_MLE_high.py index 552d2b9ce..389951145 100644 --- a/selectinf/randomized/tests/test_selective_MLE_high.py +++ b/selectinf/randomized/tests/test_selective_MLE_high.py @@ -131,17 +131,19 @@ def test_selected_targets(n=2000, nonzero = signs != 0 print("dimensions", n, p, nonzero.sum()) + if nonzero.sum() > 0: dispersion = None if full_dispersion: dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) + conv.setup_inference(dispersion=dispersion) + target_spec = selected_targets(conv.loglike, conv.observed_soln, dispersion=dispersion) - result = conv.selective_MLE(target_spec, - dispersion)[0] + result = conv.selective_MLE(target_spec)[0] pval = result['pvalue'] intervals = np.asarray(result[['lower_confidence', 'upper_confidence']]) From add188869574816fd663a26889d85ee9ddf03b48 Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Mon, 4 Oct 2021 09:28:11 -0400 Subject: [PATCH 146/187] removing dispersion from list returned by target forming functions --- selectinf/base.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/selectinf/base.py b/selectinf/base.py index b6fbc182a..3c8100cf5 100644 --- a/selectinf/base.py +++ b/selectinf/base.py @@ -53,7 +53,7 @@ class TargetSpec(typing.NamedTuple): cov_target : np.ndarray regress_target_score : np.ndarray alternatives : list - dispersion : float = 1 + #dispersion : float = 1 def selected_targets(loglike, solution, @@ -99,8 +99,7 @@ def selected_targets(loglike, return TargetSpec(observed_target, cov_target * dispersion, regress_target_score, - alternatives, - dispersion) + alternatives) def full_targets(loglike, solution, @@ -143,8 +142,7 @@ def full_targets(loglike, return TargetSpec(observed_target, cov_target * dispersion, regress_target_score, - alternatives, - dispersion) + alternatives) def debiased_targets(loglike, solution, @@ -213,8 +211,7 @@ def debiased_targets(loglike, return TargetSpec(observed_target, cov_target * dispersion, Qinv_hat, - alternatives, - dispersion) + alternatives) def form_targets(target, loglike, From afa029e373a4c907ea3a59bbd6e22f9460cf5b4e Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Mon, 4 Oct 2021 09:32:31 -0400 Subject: [PATCH 147/187] give dispersion as an argument for posterior class: needs it for sampling when sigma is unknown --- selectinf/randomized/posterior_inference.py | 5 ++--- selectinf/randomized/query.py | 2 ++ 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index a3bdbd39e..194b6c6b4 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -37,6 +37,7 @@ class posterior(object): def __init__(self, query, target_spec, + dispersion, prior, solve_args={'tol': 1.e-12}): @@ -44,9 +45,7 @@ def __init__(self, (observed_target, cov_target, - regress_target_score, - _, - dispersion) = target_spec + regress_target_score) = 
target_spec[:3] linear_part = query.sampler.affine_con.linear_part offset = query.sampler.affine_con.offset diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index c75bee97e..0c943604d 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -319,6 +319,7 @@ def selective_MLE(self, def posterior(self, target_spec, + dispersion=1, prior=None, solve_args={'tol': 1.e-12}): """ @@ -350,6 +351,7 @@ def prior(target_parameter): return posterior(self, target_spec, + dispersion, prior, solve_args=solve_args) From e37d0c2a28b5dcc0e1712c868bffd144d4e9f45b Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Mon, 4 Oct 2021 10:04:09 -0400 Subject: [PATCH 148/187] add setup_inference to split_lasso, lasso --- selectinf/randomized/lasso.py | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/selectinf/randomized/lasso.py b/selectinf/randomized/lasso.py index ad4c1ac57..e1774f2c0 100644 --- a/selectinf/randomized/lasso.py +++ b/selectinf/randomized/lasso.py @@ -238,11 +238,11 @@ def setup_inference(self, self._setup_sampler_data = (self.A_scaling[:self.active.sum()], self.b_scaling[:self.active.sum()], self.opt_linear, - self.observed_subgrad, - dispersion) + self.observed_subgrad) if self.num_opt_var > 0: - self._setup_sampler(*self._setup_sampler_data) + self._setup_sampler(*self._setup_sampler_data, + dispersion=dispersion) def _solve_randomized_problem(self, perturb=None, @@ -731,19 +731,31 @@ def fit(self, n, p = X.shape df_fit = len(self.selection_variable['variables']) - dispersion = 2 * (self.loglike.smooth_objective(self._beta_full, + dispersion = 2 * (self.loglike.smooth_objective(self._beta_full, 'func') / - (n - df_fit)) + (n - df_fit)) - # run setup again after - # estimating dispersion - - if df_fit > 0: - self._setup_sampler(*self._setup_sampler_data, - dispersion=dispersion) + self.df_fit = df_fit + self.dispersion = dispersion + # run setup again after + # estimating dispersion return signs + + def setup_inference(self, + dispersion=None): + + if self.df_fit>0: + + if dispersion is None: + self._setup_sampler(*self._setup_sampler_data, + dispersion=self.dispersion) + + else: + self._setup_sampler(*self._setup_sampler_data, + dispersion=dispersion) + def _setup_implied_gaussian(self, opt_linear, observed_subgrad, From 66e4e4075b6582e1fb51410a3c21710bd4255f7a Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Mon, 4 Oct 2021 11:01:34 -0400 Subject: [PATCH 149/187] updated tests: for mle --- selectinf/bayesian/utils.py | 73 +++++++++++ selectinf/randomized/lasso.py | 2 +- .../tests/test_selective_MLE_high.py | 41 ++++-- .../bayesian/selection_aware_posterior.py | 124 ++++++++++++++++++ 4 files changed, 230 insertions(+), 10 deletions(-) create mode 100644 selectinf/bayesian/utils.py create mode 100644 selection/bayesian/selection_aware_posterior.py diff --git a/selectinf/bayesian/utils.py b/selectinf/bayesian/utils.py new file mode 100644 index 000000000..90e51ae8a --- /dev/null +++ b/selectinf/bayesian/utils.py @@ -0,0 +1,73 @@ +import numpy as np +from scipy.linalg import fractional_matrix_power +from scipy.stats import norm as ndist + +class langevin(object): + + def __init__(self, + initial_condition, + gradient_map, + stepsize, + proposal_scale): + + (self.state, + self.gradient_map, + self.stepsize) = (np.copy(initial_condition), + gradient_map, + stepsize) + self._shape = self.state.shape[0] + self._sqrt_step = np.sqrt(self.stepsize) + self._noise = ndist(loc=0,scale=1) + 
self.sample = np.copy(initial_condition) + + self.proposal_scale = proposal_scale + self.proposal_sqrt = fractional_matrix_power(self.proposal_scale, 0.5) + + def __iter__(self): + return self + + def next(self): + return self.__next__() + + def __next__(self): + while True: + + gradient_posterior, draw, _ = self.gradient_map(self.state) + + candidate = (self.state + self.stepsize * self.proposal_scale.dot(gradient_posterior) + + np.sqrt(2.) * (self.proposal_sqrt.dot(self._noise.rvs(self._shape))) * self._sqrt_step) + + if not np.all(np.isfinite(self.gradient_map(candidate)[0])): + self.stepsize *= 0.5 + self._sqrt_step = np.sqrt(self.stepsize) + else: + self.state[:] = candidate + self.sample[:] = draw + #print(" next sample ", self.state[:], self.sample[:]) + break + + return self.sample + +def langevin_sampler(posterior, + nsample=2000, + nburnin=100, + step_frac=0.3, + start=None): + + if start is None: + start = posterior.initialize_sampler(posterior.initial_estimate) + + state = np.append(start, np.ones(posterior.target_size)) + stepsize = 1. / (step_frac * (2 * posterior.target_size)) + proposal_scale = np.identity(int(2 * posterior.target_size)) + sampler = langevin(state, posterior.gradient_log_likelihood, stepsize, proposal_scale) + + samples = np.zeros((nsample, 2 * posterior.target_size)) + + for i, sample in enumerate(sampler): + samples[i, :] = sampler.sample.copy() + print(" next sample ", i, samples[i, :]) + if i == nsample - 1: + break + + return samples[nburnin:, :] \ No newline at end of file diff --git a/selectinf/randomized/lasso.py b/selectinf/randomized/lasso.py index e1774f2c0..6757ffa0e 100644 --- a/selectinf/randomized/lasso.py +++ b/selectinf/randomized/lasso.py @@ -751,7 +751,7 @@ def setup_inference(self, if dispersion is None: self._setup_sampler(*self._setup_sampler_data, dispersion=self.dispersion) - + else: self._setup_sampler(*self._setup_sampler_data, dispersion=dispersion) diff --git a/selectinf/randomized/tests/test_selective_MLE_high.py b/selectinf/randomized/tests/test_selective_MLE_high.py index 389951145..a55a86686 100644 --- a/selectinf/randomized/tests/test_selective_MLE_high.py +++ b/selectinf/randomized/tests/test_selective_MLE_high.py @@ -62,7 +62,7 @@ def test_full_targets(n=200, if full_dispersion: dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) else: - dispersion = None + dispersion = np.linalg.norm(Y - X[:,nonzero].dot(np.linalg.pinv(X[:,nonzero]).dot(Y))) ** 2 / (n - nonzero.sum()) if n > p: target_spec = full_targets(conv.loglike, @@ -76,6 +76,8 @@ def test_full_targets(n=200, penalty=conv.penalty, dispersion=dispersion) + conv.setup_inference(dispersion=dispersion) + result = conv.selective_MLE(target_spec)[0] pval = result['pvalue'] @@ -133,9 +135,11 @@ def test_selected_targets(n=2000, if nonzero.sum() > 0: - dispersion = None + if full_dispersion: dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) + else: + dispersion = np.linalg.norm(Y - X[:,nonzero].dot(np.linalg.pinv(X[:,nonzero]).dot(Y))) ** 2 / (n - nonzero.sum()) conv.setup_inference(dispersion=dispersion) @@ -173,6 +177,8 @@ def test_instance(): print("check ", M) dispersion = np.linalg.norm(Y - X[:, M].dot(np.linalg.pinv(X[:, M]).dot(Y))) ** 2 / (n - M.sum()) + L.setup_inference(dispersion=dispersion) + target_spec = selected_targets(L.loglike, L.observed_soln, features=M, @@ -180,8 +186,7 @@ def test_instance(): print("check shapes", target_spec.observed_target.shape, E.sum()) - result = L.selective_MLE(target_spec, 
- dispersion)[0] + result = L.selective_MLE(target_spec)[0] intervals = np.asarray(result[['lower_confidence', 'upper_confidence']]) @@ -231,16 +236,18 @@ def test_selected_targets_disperse(n=500, print("dimensions", n, p, nonzero.sum()) if nonzero.sum() > 0: - dispersion = None if full_dispersion: dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) + else: + dispersion = np.linalg.norm(Y - X[:,nonzero].dot(np.linalg.pinv(X[:,nonzero]).dot(Y))) ** 2 / (n - nonzero.sum()) + + conv.setup_inference(dispersion=dispersion) target_spec = selected_targets(conv.loglike, conv.observed_soln, dispersion=dispersion) - result = conv.selective_MLE(target_spec, - dispersion)[0] + result = conv.selective_MLE(target_spec)[0] pval = result['pvalue'] intervals = np.asarray(result[['lower_confidence', 'upper_confidence']]) @@ -290,6 +297,8 @@ def test_logistic(n=2000, if nonzero.sum() > 0: + conv.setup_inference(dispersion=1) + target_spec = selected_targets(conv.loglike, conv.observed_soln, dispersion=1) @@ -340,6 +349,8 @@ def test_logistic_split(n=2000, if nonzero.sum() > 0: + conv.setup_inference(dispersion=1) + target_spec = selected_targets(conv.loglike, conv.observed_soln, dispersion=1) @@ -389,6 +400,7 @@ def test_poisson(n=2000, print("dimensions", n, p, nonzero.sum()) if nonzero.sum() > 0: + conv.setup_inference(dispersion=1) target_spec = selected_targets(conv.loglike, conv.observed_soln, @@ -440,6 +452,8 @@ def test_poisson_split(n=2000, if nonzero.sum() > 0: + conv.setup_inference(dispersion=1) + target_spec = selected_targets(conv.loglike, conv.observed_soln, dispersion=1) @@ -493,6 +507,8 @@ def test_cox(n=2000, cox_full = rr.glm.cox(X, T, S) full_hess = cox_full.hessian(conv.observed_soln) + conv.setup_inference(dispersion=1) + target_spec = selected_targets(conv.loglike, conv.observed_soln, dispersion=1) @@ -546,6 +562,8 @@ def test_cox_split(n=2000, cox_full = rr.glm.cox(X, T, S) full_hess = cox_full.hessian(conv.observed_soln) + conv.setup_inference(dispersion=1) + target_spec = selected_targets(conv.loglike, conv.observed_soln, dispersion=1) @@ -606,15 +624,18 @@ def test_scale_invariant_split(n=200, print('feature_weights', conv.feature_weights[0] / scale) dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) + conv.setup_inference(dispersion=dispersion) + target_spec = selected_targets(conv.loglike, conv.observed_soln, dispersion=dispersion) - print('dispersion', target_spec.dispersion/scale**2) + #print('dispersion', target_spec.dispersion/scale**2) print('target', target_spec.observed_target[0]/scale) print('cov_target', target_spec.cov_target[0,0]/scale**2) print('regress_target_score', target_spec.regress_target_score[0,0]/scale**2) - + + result = conv.selective_MLE(target_spec)[0] print(result['MLE'] / scale) @@ -682,6 +703,8 @@ def test_scale_invariant(n=200, print('perturb', conv._initial_omega[0] / scale) dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p) + conv.setup_inference(dispersion=dispersion) + target_spec = selected_targets(conv.loglike, conv.observed_soln, dispersion=dispersion) diff --git a/selection/bayesian/selection_aware_posterior.py b/selection/bayesian/selection_aware_posterior.py new file mode 100644 index 000000000..8e00d3220 --- /dev/null +++ b/selection/bayesian/selection_aware_posterior.py @@ -0,0 +1,124 @@ +import numpy as np, sys +from selection.randomized.selective_MLE_utils import solve_barrier_affine as solve_barrier_affine_C +from scipy.stats import norm as ndist + 
+class posterior_inference(): + + + def __init__(self, + observed_target, + cov_target, + cov_target_score, + feasible_point, + cond_mean, + cond_cov, + logdens_linear, + linear_part, + offset, + ini_estimate): + + self.observed_target = observed_target + self.cov_target = cov_target + self.cov_target_score = cov_target_score + + self.feasible_point = feasible_point + self.cond_mean = cond_mean + self.cond_cov = cond_cov + self.target_size = cond_cov.shape[0] + self.logdens_linear = logdens_linear + self.linear_part = linear_part + self.offset = offset + self.ini_estimate = ini_estimate + + def prior(self, target_parameter, var_parameter, lam): + + std_parameter = np.sqrt(var_parameter) + grad_prior_par = -np.true_divide(target_parameter, var_parameter) + grad_prior_std = np.true_divide(target_parameter**2. , 2.*(var_parameter**2))- (lam/2.)-1./(2.*var_parameter) + log_prior = -(np.linalg.norm(target_parameter)**2.) / (2.*var_parameter) - (lam * (np.linalg.norm(std_parameter)**2)/2.)-np.log(std_parameter) + return grad_prior_par, grad_prior_std, log_prior + + def det_initial_point(self, initial_soln, solve_args={'tol':1.e-12}): + + if np.asarray(self.observed_target).shape in [(), (0,)]: + raise ValueError('no target specified') + + observed_target = np.atleast_1d(self.observed_target) + prec_target = np.linalg.inv(self.cov_target) + + target_lin = - self.logdens_linear.dot(self.cov_target_score.T.dot(prec_target)) + target_offset = self.cond_mean - target_lin.dot(observed_target) + + prec_opt = np.linalg.inv(self.cond_cov) + mean_opt = target_lin.dot(initial_soln) + target_offset + conjugate_arg = prec_opt.dot(mean_opt) + + solver = solve_barrier_affine_py + + val, soln, hess = solver(conjugate_arg, + prec_opt, + self.feasible_point, + self.linear_part, + self.offset, + **solve_args) + + initial_point = initial_soln + self.cov_target.dot(target_lin.T.dot(prec_opt.dot(mean_opt - soln))) + return initial_point + + def gradient_log_likelihood(self, parameters, solve_args={'tol':1.e-15}): + + npar = self.target_size + target_parameter = parameters[:npar] + var_parameter = parameters[npar:] + if np.asarray(self.observed_target).shape in [(), (0,)]: + raise ValueError('no target specified') + + observed_target = np.atleast_1d(self.observed_target) + prec_target = np.linalg.inv(self.cov_target) + + target_lin = - self.logdens_linear.dot(self.cov_target_score.T.dot(prec_target)) + target_offset = self.cond_mean - target_lin.dot(observed_target) + + prec_opt = np.linalg.inv(self.cond_cov) + mean_opt = target_lin.dot(target_parameter) + target_offset + conjugate_arg = prec_opt.dot(mean_opt) + + solver = solve_barrier_affine_C + + val, soln, hess = solver(conjugate_arg, + prec_opt, + self.feasible_point, + self.linear_part, + self.offset, + **solve_args) + + reparam = target_parameter + self.cov_target.dot(target_lin.T.dot(prec_opt.dot(mean_opt - soln))) + neg_normalizer = (target_parameter - reparam).T.dot(prec_target).dot(target_parameter - reparam)/2. \ + + val + mean_opt.T.dot(prec_opt).dot(mean_opt) / 2. + + grad_barrier = np.diag(2. / ((1. + soln) ** 3.) - 2. / (soln ** 3.)) + + L = target_lin.T.dot(prec_opt) + N = L.dot(hess) + jacobian = (np.identity(observed_target.shape[0]) + self.cov_target.dot(L).dot(target_lin)) - \ + self.cov_target.dot(N).dot(L.T) + + log_lik = -((observed_target - reparam).T.dot(prec_target).dot(observed_target - reparam)) / 2. 
+ neg_normalizer \ + + np.log(np.linalg.det(jacobian)) + + grad_lik = jacobian.T.dot(prec_target).dot(observed_target) + grad_neg_normalizer = -jacobian.T.dot(prec_target).dot(target_parameter) + + opt_num = self.cond_cov.shape[0] + grad_jacobian = np.zeros(opt_num) + A = np.linalg.inv(jacobian).dot(self.cov_target).dot(N) + for j in range(opt_num): + M = grad_barrier.dot(np.diag(N.T[:, j])) + grad_jacobian[j] = np.trace(A.dot(M).dot(N.T)) + + prior_info = self.hierarchical_prior(reparam, var_parameter, lam=0.01) + return np.append(grad_lik + grad_neg_normalizer + grad_jacobian + jacobian.T.dot(prior_info[0]), prior_info[1]),\ + np.append(reparam, var_parameter), log_lik + prior_info[2] + + + From 4c665702de3a29c346ff6aa63e42496acefc520f Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Mon, 4 Oct 2021 16:07:50 -0400 Subject: [PATCH 150/187] updated tests for posterior inference --- selectinf/randomized/query.py | 2 +- selectinf/randomized/tests/test_posterior.py | 73 +++++++++++--------- 2 files changed, 40 insertions(+), 35 deletions(-) diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index 0c943604d..8573cda2b 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -358,7 +358,7 @@ def prior(target_parameter): def approximate_grid_inference(self, target_spec, solve_args={'tol': 1.e-12}, - useIP=False): + useIP=True): """ Parameters diff --git a/selectinf/randomized/tests/test_posterior.py b/selectinf/randomized/tests/test_posterior.py index 62e7b783f..1fc38bd32 100644 --- a/selectinf/randomized/tests/test_posterior.py +++ b/selectinf/randomized/tests/test_posterior.py @@ -51,24 +51,29 @@ def test_Langevin(n=500, signs = conv.fit() nonzero = signs != 0 - beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) + if nonzero.sum()>0: - target_spec = selected_targets(conv.loglike, - conv.observed_soln, - dispersion=dispersion) + conv.setup_inference(dispersion=dispersion) - posterior_inf = conv.posterior(target_spec) + beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) - samples = langevin_sampler(posterior_inf, - nsample=nsample, - nburnin=nburnin) + target_spec = selected_targets(conv.loglike, + conv.observed_soln, + dispersion=dispersion) - lci = np.percentile(samples, 5, axis=0) - uci = np.percentile(samples, 95, axis=0) - coverage = (lci < beta_target) * (uci > beta_target) - length = uci - lci + posterior_inf = conv.posterior(target_spec, + dispersion=dispersion) - return np.mean(coverage), np.mean(length) + samples = langevin_sampler(posterior_inf, + nsample=nsample, + nburnin=nburnin) + + lci = np.percentile(samples, 5, axis=0) + uci = np.percentile(samples, 95, axis=0) + coverage = (lci < beta_target) * (uci > beta_target) + length = uci - lci + + return np.mean(coverage), np.mean(length) @set_seed_iftrue(SET_SEED) @@ -116,13 +121,17 @@ def test_instance(nsample=100, nburnin=50): M[-3:] = 1 dispersion = np.linalg.norm(Y - X[:, M].dot(np.linalg.pinv(X[:, M]).dot(Y))) ** 2 / (n - M.sum()) + L.setup_inference(dispersion=dispersion) + target_spec = selected_targets(L.loglike, L.observed_soln, features=M, dispersion=dispersion) + print(target_spec.dispersion, dispersion) - posterior_inf = L.posterior(target_spec) + posterior_inf = L.posterior(target_spec, + dispersion=dispersion) samples = langevin_sampler(posterior_inf, nsample=nsample, @@ -166,6 +175,8 @@ def test_flexible_prior1(nsample=100, M[-3:] = 1 dispersion = np.linalg.norm(Y - X[:, M].dot(np.linalg.pinv(X[:, M]).dot(Y))) ** 2 / (n - M.sum()) + 
L.setup_inference(dispersion=dispersion) + target_spec = selected_targets(L.loglike, L.observed_soln, features=M, @@ -185,6 +196,7 @@ def prior(target_parameter): Z1 = np.random.standard_normal() posterior_inf1 = L.posterior(target_spec, + dispersion=dispersion, prior=prior) W1 = np.random.standard_normal() @@ -226,6 +238,8 @@ def test_flexible_prior2(nsample=1000, nburnin=50): M[-3:] = 1 dispersion = np.linalg.norm(Y - X[:, M].dot(np.linalg.pinv(X[:, M]).dot(Y))) ** 2 / (n - M.sum()) + L.setup_inference(dispersion=dispersion) + target_spec = selected_targets(L.loglike, L.observed_soln, features=M, @@ -239,6 +253,7 @@ def prior(target_parameter): return log_prior, grad_prior posterior_inf = L.posterior(target_spec, + dispersion=dispersion, prior=prior) adaptive_proposal = np.linalg.inv(np.linalg.inv(posterior_inf.inverse_info) + @@ -282,30 +297,20 @@ def test_hiv_data(nsample=10000, signs = conv.fit() nonzero = signs != 0 - (observed_target, - cov_target, - regress_target_score, - dispersion, - alternatives) = selected_targets(conv.loglike, - conv._W, - nonzero, - dispersion=dispersion) - - mle, inverse_info = conv.selective_MLE(observed_target, - cov_target, - regress_target_score, - dispersion, + conv.setup_inference(dispersion=dispersion) + + target_spec = selected_targets(conv.loglike, + conv._W, + nonzero, + dispersion=dispersion) + + mle, inverse_info = conv.selective_MLE(target_spec, level=level, solve_args={'tol': 1.e-12})[:2] - approx_inf = conv.approximate_grid_inference(observed_target, - cov_target, - regress_target_score, - dispersion) + approx_inf = conv.approximate_grid_inference(target_spec) - posterior_inf = conv.posterior(observed_target, - cov_target, - regress_target_score, + posterior_inf = conv.posterior(target_spec, dispersion=dispersion) samples_langevin = langevin_sampler(posterior_inf, From a828983296a4dde82b64a4f6e055d38aaf871740 Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Mon, 4 Oct 2021 16:10:38 -0400 Subject: [PATCH 151/187] updated tests for approx and exact reference --- selectinf/randomized/tests/test_approx_reference.py | 4 +++- selectinf/randomized/tests/test_exact_reference.py | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/selectinf/randomized/tests/test_approx_reference.py b/selectinf/randomized/tests/test_approx_reference.py index d0351e017..906b43fc9 100644 --- a/selectinf/randomized/tests/test_approx_reference.py +++ b/selectinf/randomized/tests/test_approx_reference.py @@ -56,6 +56,8 @@ def test_inf(seedn, if nonzero.sum() > 0: beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) + conv.setup_inference(dispersion=dispersion) + target_spec = selected_targets(conv.loglike, conv.observed_soln, dispersion=dispersion) @@ -97,7 +99,7 @@ def main(nsim=300, CI = False): rho=0.30, randomizer_scale=1., equicorrelated=True, - useIP=False, + useIP=True, CI=False)) print("iteration completed ", i) diff --git a/selectinf/randomized/tests/test_exact_reference.py b/selectinf/randomized/tests/test_exact_reference.py index 534e4beaf..39931e75f 100644 --- a/selectinf/randomized/tests/test_exact_reference.py +++ b/selectinf/randomized/tests/test_exact_reference.py @@ -54,6 +54,8 @@ def test_inf(n=500, if nonzero.sum() > 0: beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) + conv.setup_inference(dispersion=dispersion) + target_spec = selected_targets(conv.loglike, conv.observed_soln, dispersion=dispersion) From 77b06507d7c25af5c9f91577ba50261206c5c1f8 Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Mon, 4 Oct 
2021 21:58:51 -0400 Subject: [PATCH 152/187] all tests pass --- selectinf/randomized/lasso.py | 19 ++++++++----------- .../randomized/tests/test_approx_reference.py | 4 +--- .../randomized/tests/test_exact_reference.py | 2 +- selectinf/randomized/tests/test_posterior.py | 6 ++---- .../tests/test_selective_MLE_high.py | 2 +- 5 files changed, 13 insertions(+), 20 deletions(-) diff --git a/selectinf/randomized/lasso.py b/selectinf/randomized/lasso.py index 6757ffa0e..1cca12f32 100644 --- a/selectinf/randomized/lasso.py +++ b/selectinf/randomized/lasso.py @@ -226,20 +226,16 @@ def signed_basis_vector(p, j, s): self.num_opt_var = num_opt_var - self.A_scaling = A_scaling - self.b_scaling = b_scaling - self.active = active + self._setup_sampler_data = (A_scaling[:active.sum()], + b_scaling[:active.sum()], + opt_linear, + self.observed_subgrad) return active_signs def setup_inference(self, dispersion): - self._setup_sampler_data = (self.A_scaling[:self.active.sum()], - self.b_scaling[:self.active.sum()], - self.opt_linear, - self.observed_subgrad) - if self.num_opt_var > 0: self._setup_sampler(*self._setup_sampler_data, dispersion=dispersion) @@ -724,29 +720,30 @@ def fit(self, # we need to estimate a dispersion parameter # we then setup up the sampler again + df_fit = len(self.selection_variable['variables']) if self.estimate_dispersion: X, y = self.loglike.data n, p = X.shape - df_fit = len(self.selection_variable['variables']) dispersion = 2 * (self.loglike.smooth_objective(self._beta_full, 'func') / (n - df_fit)) - self.df_fit = df_fit self.dispersion = dispersion # run setup again after # estimating dispersion + self.df_fit = df_fit + return signs def setup_inference(self, dispersion=None): - if self.df_fit>0: + if self.df_fit > 0: if dispersion is None: self._setup_sampler(*self._setup_sampler_data, diff --git a/selectinf/randomized/tests/test_approx_reference.py b/selectinf/randomized/tests/test_approx_reference.py index 906b43fc9..a4b6ec87b 100644 --- a/selectinf/randomized/tests/test_approx_reference.py +++ b/selectinf/randomized/tests/test_approx_reference.py @@ -5,8 +5,7 @@ from ...base import selected_targets from ..approx_reference import approximate_grid_inference -def test_inf(seedn, - n=500, +def test_inf(n=500, p=100, signal_fac=1., s=5, @@ -17,7 +16,6 @@ def test_inf(seedn, useIP=True, CI=False): - np.random.seed(seedn) inst, const = gaussian_instance, lasso.gaussian signal = np.sqrt(signal_fac * 2 * np.log(p)) diff --git a/selectinf/randomized/tests/test_exact_reference.py b/selectinf/randomized/tests/test_exact_reference.py index 39931e75f..ad1dee613 100644 --- a/selectinf/randomized/tests/test_exact_reference.py +++ b/selectinf/randomized/tests/test_exact_reference.py @@ -55,7 +55,7 @@ def test_inf(n=500, beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta)) conv.setup_inference(dispersion=dispersion) - + target_spec = selected_targets(conv.loglike, conv.observed_soln, dispersion=dispersion) diff --git a/selectinf/randomized/tests/test_posterior.py b/selectinf/randomized/tests/test_posterior.py index 1fc38bd32..3d972a585 100644 --- a/selectinf/randomized/tests/test_posterior.py +++ b/selectinf/randomized/tests/test_posterior.py @@ -127,8 +127,6 @@ def test_instance(nsample=100, nburnin=50): L.observed_soln, features=M, dispersion=dispersion) - - print(target_spec.dispersion, dispersion) posterior_inf = L.posterior(target_spec, dispersion=dispersion) @@ -297,10 +295,10 @@ def test_hiv_data(nsample=10000, signs = conv.fit() nonzero = signs != 0 - 
conv.setup_inference(dispersion=dispersion) + conv.setup_inference() target_spec = selected_targets(conv.loglike, - conv._W, + conv.observed_soln, nonzero, dispersion=dispersion) diff --git a/selectinf/randomized/tests/test_selective_MLE_high.py b/selectinf/randomized/tests/test_selective_MLE_high.py index a55a86686..947e75bcd 100644 --- a/selectinf/randomized/tests/test_selective_MLE_high.py +++ b/selectinf/randomized/tests/test_selective_MLE_high.py @@ -709,7 +709,7 @@ def test_scale_invariant(n=200, conv.observed_soln, dispersion=dispersion) - print('dispersion', target_spec.dispersion/scale**2) + #print('dispersion', target_spec.dispersion/scale**2) print('target', target_spec.observed_target[0]/scale) print('cov_target', target_spec.cov_target[0,0]/scale**2) print('regress_target_score', target_spec.regress_target_score[0,0]/scale**2) From 7fd440186b0dcab2aa147c2a65613228f161d4ce Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Sun, 31 Oct 2021 20:28:36 -0400 Subject: [PATCH 153/187] deleted unused methods for sampling from query --- selectinf/randomized/query.py | 851 +--------------------------------- 1 file changed, 22 insertions(+), 829 deletions(-) diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index 8573cda2b..f9237e562 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -1,21 +1,9 @@ -import functools -from itertools import product - import numpy as np import pandas as pd from scipy.stats import norm as ndist -from scipy.optimize import bisect - -from regreg.affine import power_L -import regreg.api as rr -from ..distributions.api import discrete_family -from ..constraints.affine import (sample_from_constraints, - constraints) +from ..constraints.affine import constraints from ..algorithms.barrier_affine import solve_barrier_affine_py -from ..base import (selected_targets, - full_targets, - debiased_targets) from .posterior_inference import posterior from .selective_MLE_utils import solve_barrier_affine as solve_barrier_affine_C @@ -87,7 +75,6 @@ def solve(self): class gaussian_query(query): - useC = True """ A class with Gaussian perturbation to the objective -- @@ -96,8 +83,6 @@ class gaussian_query(query): def fit(self, perturb=None): - p = self.nfeature - # take a new perturbation if supplied if perturb is not None: self._initial_omega = perturb @@ -114,6 +99,7 @@ def _setup_sampler(self, dispersion=1): A, b = linear_part, offset + if not np.all(A.dot(self.observed_opt_state) - b <= 0): raise ValueError('constraints not satisfied') @@ -127,19 +113,6 @@ def _setup_sampler(self, observed_subgrad, dispersion=dispersion) - def log_density(regress_opt, u, cond_prec, opt, score): # u == subgrad - if score.ndim == 1: - mean_term = regress_opt.dot(score.T + u).T - else: - mean_term = regress_opt.dot(score.T + u[:, None]).T - arg = opt - mean_term - return - 0.5 * np.sum(arg * cond_prec.dot(arg.T).T, 1) - - log_density = functools.partial(log_density, - regress_opt, - observed_subgrad, - cond_precision) - self.cond_mean, self.cond_cov = cond_mean, cond_cov affine_con = constraints(A, @@ -147,18 +120,9 @@ def log_density(regress_opt, u, cond_prec, opt, score): # u == subgrad mean=cond_mean, covariance=cond_cov) - self.sampler = affine_gaussian_sampler(affine_con, - self.observed_opt_state, - self.observed_score_state, - log_density, - regress_opt, # not needed? 
- observed_subgrad, - opt_linear, # L - M1, - M2, - M3, - selection_info=self.selection_variable, - useC=self.useC) + self.affine_con = affine_con + self.opt_linear = opt_linear + self.observed_subgrad = observed_subgrad def _setup_implied_gaussian(self, opt_linear, @@ -201,121 +165,26 @@ def _setup_implied_gaussian(self, M2, M3) - def summary(self, - target_spec, - opt_sample=None, - target_sample=None, - parameter=None, - level=0.9, - ndraw=10000, - burnin=2000, - compute_intervals=False): - """ - Produce p-values and confidence intervals for targets - of model including selected features - Parameters - ---------- - observed_target : ndarray - Observed estimate of target. - cov_target : ndarray - Estimated covaraince of target. - regress_target_score : ndarray - Estimated regression coefficient of target on score. - alternatives : [str], optional - Sequence of strings describing the alternatives, - should be values of ['twosided', 'less', 'greater'] - parameter : np.array - Hypothesized value for parameter -- defaults to 0. - level : float - Confidence level. - ndraw : int (optional) - Defaults to 1000. - burnin : int (optional) - Defaults to 1000. - compute_intervals : bool - Compute confidence intervals? - """ - - if parameter is None: - parameter = np.zeros_like(target_spec.observed_target) - - if opt_sample is None: - opt_sample, logW = self.sampler.sample(ndraw, burnin) - else: - if len(opt_sample) == 1: # only a sample, so weights are 1s - opt_sample = opt_sample[0] - logW = np.zeros(ndraw) - else: - opt_sample, logW = opt_sample - ndraw = opt_sample.shape[0] - - pivots = self.sampler.coefficient_pvalues(target_spec.observed_target, - target_spec.cov_target, - target_spec.regress_target_score, - parameter=parameter, - sample=(opt_sample, logW), - normal_sample=target_sample, - alternatives=target_spec.alternatives) - - if not np.all(parameter == 0): - pvalues = self.sampler.coefficient_pvalues(target_spec.observed_target, - target_spec.cov_target, - target_spec.regress_target_score, - parameter=np.zeros_like(parameter), - sample=(opt_sample, logW), - normal_sample=target_sample, - alternatives=target_spec.alternatives) - else: - pvalues = pivots - - result = pd.DataFrame({'target': target_spec.observed_target, - 'pvalue': pvalues}) - - if compute_intervals: - MLE = self.selective_MLE(target_spec)[0] - MLE_intervals = np.asarray(MLE[['lower_confidence', 'upper_confidence']]) - - intervals = self.sampler.confidence_intervals( - target_spec.observed_target, - target_spec.cov_target, - target_spec.regress_target_score, - sample=(opt_sample, logW), - normal_sample=target_sample, - initial_guess=MLE_intervals, - level=level) - - result.insert(2, 'lower_confidence', intervals[:, 0]) - result.insert(3, 'upper_confidence', intervals[:, 1]) - - if not np.all(parameter == 0): - result.insert(4, 'pivot', pivots) - result.insert(5, 'parameter', parameter) - - return result - def selective_MLE(self, target_spec, - level=0.9, + level=0.90, solve_args={'tol': 1.e-12}): - """ - Parameters - ---------- - observed_target : ndarray - Observed estimate of target. - cov_target : ndarray - Estimated covaraince of target. - regress_target_score : ndarray - Estimated covariance of target and score of randomized query. - level : float, optional - Confidence level. - solve_args : dict, optional - Arguments passed to solver. 
- """ - return self.sampler.selective_MLE(target_spec, - self.observed_opt_state, - solve_args=solve_args, - level=level) + return selective_MLE(target_spec, + self.observed_opt_state, + self.affine_con.mean, + self.affine_con.covariance, + self.affine_con.linear_part, + self.affine_con.offset, + self.opt_linear, + self.M1, + self.M2, + self.M3, + self.observed_score_state + self.observed_subgrad, + solve_args=solve_args, + level=level, + useC=False) + def posterior(self, target_spec, @@ -380,6 +249,7 @@ def approximate_grid_inference(self, target_spec, solve_args=solve_args, useIP=useIP) + return G.summary(alternatives=target_spec.alternatives) @@ -597,683 +467,6 @@ def confidence_intervals(self, return np.array(limits) -class optimization_sampler(object): - - def __init__(self): - raise NotImplementedError("abstract method") - - def sample(self): - raise NotImplementedError("abstract method") - - def log_cond_density(self, - opt_sample, - target_sample, - transform=None): - """ - Density of opt_sample | target_sample - """ - raise NotImplementedError("abstract method") - - def hypothesis_test(self, - test_stat, - observed_value, - cov_target, - score_cov, - sample_args=(), - sample=None, - parameter=0, - alternative='twosided'): - - ''' - Sample `target` from selective density - using sampler with - gradient map `self.gradient` and - projection map `self.projection`. - Parameters - ---------- - test_stat : callable - Test statistic to evaluate on sample from - selective distribution. - observed_value : float - Observed value of test statistic. - Used in p-value calculation. - sample_args : sequence - Arguments to `self.sample` if sample is None. - sample : np.array (optional) - If not None, assumed to be a sample of shape (-1,) + `self.shape` - representing a sample of the target from parameters. - Allows reuse of the same sample for construction of confidence - intervals, hypothesis tests, etc. If not None, - `ndraw, burnin, stepsize` are ignored. - parameter : np.float (optional) - alternative : ['greater', 'less', 'twosided'] - What alternative to use. - Returns - ------- - pvalue : float - ''' - - if alternative not in ['greater', 'less', 'twosided']: - raise ValueError("alternative should be one of ['greater', 'less', 'twosided']") - - if sample is None: - sample, logW = self.sample(*sample_args) - sample = np.atleast_2d(sample) - - if parameter is None: - parameter = self.reference - - sample_test_stat = np.squeeze(np.array([test_stat(x) for x in sample])) - - target_inv_cov = np.linalg.inv(cov_target) - delta = target_inv_cov.dot(parameter - self.reference) - W = np.exp(sample.dot(delta) + logW) - - family = discrete_family(sample_test_stat, W) - pval = family.cdf(0, observed_value) - - if alternative == 'greater': - return 1 - pval - elif alternative == 'less': - return pval - else: - return 2 * min(pval, 1 - pval) - - def confidence_intervals(self, - observed_target, - cov_target, - score_cov, - sample_args=(), - sample=None, - normal_sample=None, - level=0.9, - initial_guess=None): - ''' - Parameters - ---------- - - observed : np.float - A vector of parameters with shape `self.shape`, - representing coordinates of the target. - sample_args : sequence - Arguments to `self.sample` if sample is None. - sample : np.array (optional) - If not None, assumed to be a sample of shape (-1,) + `self.shape` - representing a sample of the target from parameters `self.reference`. - Allows reuse of the same sample for construction of confidence - intervals, hypothesis tests, etc. 
- level : float (optional) - Specify the - confidence level. - initial_guess : np.float - Initial guesses at upper and lower limits, optional. - Notes - ----- - Construct selective confidence intervals - for each parameter of the target. - Returns - ------- - intervals : [(float, float)] - List of confidence intervals. - ''' - - if sample is None: - sample, logW = self.sample(*sample_args) - sample = np.vstack([sample] * 5) # why times 5? - logW = np.hstack([logW] * 5) - else: - sample, logW = sample - - ndraw = sample.shape[0] - - _intervals = optimization_intervals([(self, - sample, - logW, - cov_target, - score_cov)], - observed_target, - ndraw, - normal_sample=normal_sample) - - limits = [] - - for i in range(observed_target.shape[0]): - keep = np.zeros_like(observed_target) - keep[i] = 1. - if initial_guess is None: - l, u = _intervals.confidence_interval(keep, level=level) - else: - l, u = _intervals.confidence_interval(keep, level=level, - guess=initial_guess[i]) - limits.append((l, u)) - - return np.array(limits) - - def coefficient_pvalues(self, - observed_target, - cov_target, - score_cov, - parameter=None, - sample_args=(), - sample=None, - normal_sample=None, - alternatives=None): - ''' - Construct selective p-values - for each parameter of the target. - Parameters - ---------- - observed : np.float - A vector of parameters with shape `self.shape`, - representing coordinates of the target. - parameter : np.float (optional) - A vector of parameters with shape `self.shape` - at which to evaluate p-values. Defaults - to `np.zeros(self.shape)`. - sample_args : sequence - Arguments to `self.sample` if sample is None. - sample : np.array (optional) - If not None, assumed to be a sample of shape (-1,) + `self.shape` - representing a sample of the target from parameters `self.reference`. - Allows reuse of the same sample for construction of confidence - intervals, hypothesis tests, etc. - alternatives : list of ['greater', 'less', 'twosided'] - What alternative to use. - Returns - ------- - pvalues : np.float - ''' - - if alternatives is None: - alternatives = ['twosided'] * observed_target.shape[0] - - if sample is None: - sample, logW = self.sample(*sample_args) - else: - sample, logW = sample - ndraw = sample.shape[0] - - if parameter is None: - parameter = np.zeros(observed_target.shape[0]) - - _intervals = optimization_intervals([(self, - sample, - logW, - cov_target, - score_cov)], - observed_target, - ndraw, - normal_sample=normal_sample) - pvals = [] - - for i in range(observed_target.shape[0]): - keep = np.zeros_like(observed_target) - keep[i] = 1. - pvals.append(_intervals.pivot(keep, - candidate=parameter[i], - alternative=alternatives[i])) - - return np.array(pvals) - - def _reconstruct_score_from_target(self, - target_sample, - transform=None): - if transform is not None: - direction, nuisance = transform - score_sample = (np.multiply.outer(target_sample, - direction) + - nuisance[None, :]) - else: - score_sample = target_sample - return score_sample - - -class affine_gaussian_sampler(optimization_sampler): - ''' - Sample from an affine truncated Gaussian - ''' - - def __init__(self, - affine_con, - initial_point, - observed_score_state, - log_cond_density, - regress_opt, - observed_subgrad, - opt_linear, - M1, - M2, - M3, - selection_info=None, - useC=False): - - ''' - Parameters - ---------- - affine_con : `selection.constraints.affine.constraints` - Affine constraints - initial_point : ndarray - Feasible point for affine constraints. 
- observed_score_state : ndarray - Observed score of convex loss (slightly modified). - Essentially (asymptotically) equivalent - to $\nabla \ell(\beta^*) + - Q(\beta^*)\beta^*$ where $\beta^*$ is population - minimizer. For linear regression, it is always - $-X^Ty$. - log_cond_density : callable - Density of optimization variables given score - regress_opt: ndarray - Regression coefficient of opt on to score - observed_subgrad : ndarray - selection_info : optional - Function of optimization variables that - will be conditioned on. - useC : bool, optional - Use python or C solver. - - ''' - - self.affine_con = affine_con - - self.covariance = self.affine_con.covariance - self.mean = self.affine_con.mean - - self.initial_point = initial_point - self.observed_score_state = observed_score_state - self.selection_info = selection_info - self._log_cond_density = log_cond_density - self.regress_opt = regress_opt - self.observed_subgrad = observed_subgrad - self.useC = useC - self.opt_linear = opt_linear - self.M1, self.M2, self.M3 = M1, M2, M3 - - def log_cond_density(self, - opt_sample, - target_sample, - transform=None): - - if transform is not None: - direction, nuisance = transform - return self._log_density_ray(0, # candidate - # has been added to - # target - direction, - nuisance, - target_sample, - opt_sample) - else: - # target must be in score coordinates - score_sample = target_sample - - # probably should switch - # order of signature - return self._log_cond_density(opt_sample, - score_sample) - - def sample(self, ndraw, burnin): - ''' - Sample `target` from selective density - using projected Langevin sampler with - gradient map `self.gradient` and - projection map `self.projection`. - Parameters - ---------- - ndraw : int - How long a chain to return? - burnin : int - How many samples to discard? - ''' - - _sample = sample_from_constraints(self.affine_con, - self.initial_point, - ndraw=ndraw, - burnin=burnin) - return _sample, np.zeros(_sample.shape[0]) - - def selective_MLE(self, - target_spec, - # initial (observed) value of optimization variables -- - # used as a feasible point. - # precise value used only for independent estimator - observed_soln, - solve_args={'tol': 1.e-12}, - level=0.9): - """ - Selective MLE based on approximation of - CGF. - Parameters - ---------- - observed_target : ndarray - Observed estimate of target. - cov_target : ndarray - Estimated covaraince of target. - regress_target_score : ndarray - Estimated covariance of target and score of randomized query. - observed_soln : ndarray - Feasible point for optimization problem. - level : float, optional - Confidence level. - solve_args : dict, optional - Arguments passed to solver. - """ - - return selective_MLE(target_spec, - observed_soln, - self.mean, - self.covariance, - self.affine_con.linear_part, - self.affine_con.offset, - self.opt_linear, - self.M1, - self.M2, - self.M3, - self.observed_score_state + self.observed_subgrad, - solve_args=solve_args, - level=level, - useC=self.useC) - - def _log_density_ray(self, - candidate, - direction, - nuisance, - gaussian_sample, - opt_sample): - - # implicitly caching (opt_sample, gaussian_sample) ? - - if (not hasattr(self, "_direction") or not - np.all(self._direction == direction)): - - regress_opt, subgrad = self.regress_opt, self.observed_subgrad - - if opt_sample.shape[1] == 1: - - prec = 1. 
/ self.covariance[0, 0] - quadratic_term = regress_opt.dot(direction) ** 2 * prec - arg = (opt_sample[:, 0] - - regress_opt.dot(nuisance + subgrad) - - regress_opt.dot(direction) * gaussian_sample) - linear_term = -regress_opt.dot(direction) * prec * arg - constant_term = arg ** 2 * prec - - self._cache = {'linear_term': linear_term, - 'quadratic_term': quadratic_term, - 'constant_term': constant_term} - else: - self._direction = direction.copy() - - # density is a Gaussian evaluated at - # O_i - A(N + (Z_i + theta) * gamma + u) - - # u is observed_subgrad - # A is regress_opt - # Z_i is gaussian_sample[i] (real-valued) - # gamma is direction - # O_i is opt_sample[i] - - # let arg1 = O_i - # let arg2 = A(N+u + Z_i \cdot gamma) - # then it is of the form (arg1 - arg2 - theta * A gamma) - - regress_opt, subgrad = self.regress_opt, self.observed_subgrad - cov = self.covariance - prec = np.linalg.inv(cov) - linear_part = -regress_opt.dot(direction) # -A gamma - - if 1 in opt_sample.shape: - pass # stop3 what's this for? - cov = self.covariance - - quadratic_term = linear_part.T.dot(prec).dot(linear_part) - - arg1 = opt_sample.T - arg2 = -regress_opt.dot(np.multiply.outer(direction, gaussian_sample) + - (nuisance + subgrad)[:, None]) - arg = arg1 + arg2 - linear_term = -regress_opt.T.dot(prec).dot(arg) - constant_term = np.sum(prec.dot(arg) * arg, 0) - - self._cache = {'linear_term': linear_term, - 'quadratic_term': quadratic_term, - 'constant_term': constant_term} - (linear_term, - quadratic_term, - constant_term) = (self._cache['linear_term'], - self._cache['quadratic_term'], - self._cache['constant_term']) - return (-0.5 * candidate ** 2 * quadratic_term - - candidate * linear_term - 0.5 * constant_term) - - -class optimization_intervals(object): - - def __init__(self, - opt_sampling_info, # a sequence of - # (opt_sampler, - # opt_sample, - # opt_logweights, - # cov_target, - # score_cov) objects - # in theory all cov_target - # should be about the same... 
- observed, - nsample, # how large a normal sample - cov_target=None, - normal_sample=None): - - # not all opt_samples will be of the same size as nsample - # let's repeat them as necessary - - tiled_sampling_info = [] - for (opt_sampler, - opt_sample, - opt_logW, - t_cov, - t_score_cov) in opt_sampling_info: - if opt_sample is not None: - if opt_sample.shape[0] < nsample: - if opt_sample.ndim == 1: - tiled_opt_sample = np.tile(opt_sample, - int(np.ceil(nsample / - opt_sample.shape[0])))[:nsample] - tiled_opt_logW = np.tile(opt_logW, - int(np.ceil(nsample / - opt_logW.shape[0])))[:nsample] - else: - tiled_opt_sample = np.tile(opt_sample, - (int(np.ceil(nsample / - opt_sample.shape[0])), 1))[:nsample] - tiled_opt_logW = np.tile(opt_logW, - (int(np.ceil(nsample / - opt_logW.shape[0])), 1))[:nsample] - else: - tiled_opt_sample = opt_sample[:nsample] - tiled_opt_logW = opt_logW[:nsample] - else: - tiled_sample = None - tiled_sampling_info.append((opt_sampler, - tiled_opt_sample, - tiled_opt_logW, - t_cov, - t_score_cov)) - - self.opt_sampling_info = tiled_sampling_info - self._logden = 0 - for opt_sampler, opt_sample, opt_logW, _, _ in opt_sampling_info: - - self._logden += opt_sampler.log_cond_density( - opt_sample, - opt_sampler.observed_score_state, - transform=None) - self._logden -= opt_logW - if opt_sample.shape[0] < nsample: - self._logden = np.tile(self._logden, - int(np.ceil(nsample / - opt_sample.shape[0])))[:nsample] - - # this is our observed unpenalized estimator - self.observed = observed.copy() - - # average covariances in case they might be different - - if cov_target is None: - self.cov_target = 0 - for _, _, _, cov_target, _ in opt_sampling_info: - self.cov_target += cov_target - self.cov_target /= len(opt_sampling_info) - - if normal_sample is None: - self._normal_sample = np.random.multivariate_normal( - mean=np.zeros(self.cov_target.shape[0]), - cov=self.cov_target, - size=(nsample,)) - else: - self._normal_sample = normal_sample - - def pivot(self, - linear_func, - candidate, - alternative='twosided'): - ''' - alternative : ['greater', 'less', 'twosided'] - What alternative to use. 
- Returns - ------- - pvalue : np.float - ''' - - if alternative not in ['greater', 'less', 'twosided']: - raise ValueError("alternative should be one of ['greater', 'less', 'twosided']") - - observed_stat = self.observed.dot(linear_func) - sample_stat = self._normal_sample.dot(linear_func) - - cov_target = linear_func.dot(self.cov_target.dot(linear_func)) - - nuisance = [] - translate_dirs = [] - - for (opt_sampler, - opt_sample, - _, - _, - regress_target_score) in self.opt_sampling_info: - cur_score_cov = linear_func.dot(regress_target_score) - - # cur_nuisance is in the view's score coordinates - cur_nuisance = opt_sampler.observed_score_state - cur_score_cov * observed_stat / cov_target - nuisance.append(cur_nuisance) - translate_dirs.append(cur_score_cov / cov_target) - - weights = self._weights(sample_stat, # normal sample - candidate, # candidate value - nuisance, # nuisance sufficient stats for each view - translate_dirs) # points will be moved like sample * regress_target_score - - pivot = np.mean((sample_stat + candidate <= observed_stat) * weights) / np.mean(weights) - - if alternative == 'twosided': - return 2 * min(pivot, 1 - pivot) - elif alternative == 'less': - return pivot - else: - return 1 - pivot - - def confidence_interval(self, - linear_func, - level=0.90, - how_many_sd=20, - guess=None): - - sample_stat = self._normal_sample.dot(linear_func) - observed_stat = self.observed.dot(linear_func) - - def _rootU(gamma): - return self.pivot(linear_func, - observed_stat + gamma, - alternative='less') - (1 - level) / 2. - - def _rootL(gamma): - return self.pivot(linear_func, - observed_stat + gamma, - alternative='less') - (1 + level) / 2. - - if guess is None: - grid_min, grid_max = -how_many_sd * np.std(sample_stat), how_many_sd * np.std(sample_stat) - upper = bisect(_rootU, grid_min, grid_max) - lower = bisect(_rootL, grid_min, grid_max) - - else: - delta = 0.5 * (guess[1] - guess[0]) - - # find interval bracketing upper solution - count = 0 - while True: - Lu, Uu = guess[1] - delta, guess[1] + delta - valU = _rootU(Uu) - valL = _rootU(Lu) - if valU * valL < 0: - break - delta *= 2 - count += 1 - upper = bisect(_rootU, Lu, Uu) - - # find interval bracketing lower solution - count = 0 - while True: - Ll, Ul = guess[0] - delta, guess[0] + delta - valU = _rootL(Ul) - valL = _rootL(Ll) - if valU * valL < 0: - break - delta *= 2 - count += 1 - lower = bisect(_rootL, Ll, Ul) - return lower + observed_stat, upper + observed_stat - - # Private methods - - def _weights(self, - stat_sample, - candidate, - nuisance, - translate_dirs): - - # Here we should loop through the views - # and move the score of each view - # for each projected (through linear_func) normal sample - # using the linear decomposition - - # We need access to the map that takes observed_score for each view - # and constructs the full randomization -- this is the reconstruction map - # for each view - - # The data state for each view will be set to be N_i + A_i \hat{\theta}_i - # where N_i is the nuisance sufficient stat for the i-th view's - # data with respect to \hat{\theta} and N_i will not change because - # it depends on the observed \hat{\theta} and observed score of i-th view - - # In this function, \hat{\theta}_i will change with the Monte Carlo sample - - score_sample = [] - _lognum = 0 - for i, opt_info in enumerate(self.opt_sampling_info): - opt_sampler, opt_sample = opt_info[:2] - - _lognum += opt_sampler.log_cond_density(opt_sample, - stat_sample + candidate, - transform= - (translate_dirs[i], - 
nuisance[i])) - - _logratio = _lognum - self._logden - _logratio -= _logratio.max() - - return np.exp(_logratio) - - def naive_confidence_intervals(diag_cov, observed, level=0.9): """ Compute naive Gaussian based confidence From d02f25bc518b62930ad4930706dda595ee3f356e Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Sun, 31 Oct 2021 20:47:31 -0400 Subject: [PATCH 154/187] some more clean up --- selectinf/randomized/exact_reference.py | 4 +- selectinf/randomized/posterior_inference.py | 4 +- selectinf/randomized/query.py | 34 ++- selectinf/randomized/tests/test_posterior.py | 206 +++++++++--------- .../tests/test_selective_MLE_high.py | 2 - 5 files changed, 138 insertions(+), 112 deletions(-) diff --git a/selectinf/randomized/exact_reference.py b/selectinf/randomized/exact_reference.py index 13fdbd4a6..bcc78f3b7 100644 --- a/selectinf/randomized/exact_reference.py +++ b/selectinf/randomized/exact_reference.py @@ -27,6 +27,8 @@ def __init__(self, Estimated covaraince of target. cov_target_score : ndarray Estimated covariance of target and score of randomized query. + level : float, optional + Confidence level. solve_args : dict, optional Arguments passed to solver. """ @@ -83,7 +85,7 @@ def __init__(self, def summary(self, alternatives=None, parameter=None, - level=0.9): + level=0.90): """ Produce p-values and confidence intervals for targets of model including selected features diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index 194b6c6b4..2467a35e4 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -82,7 +82,7 @@ def __init__(self, self.offset = offset self.initial_estimate = np.asarray(result['MLE']) - self.dispersion = dispersion # why is this needed? + self.dispersion = dispersion self.log_ref = log_ref self._set_marginal_parameters() @@ -159,12 +159,14 @@ def _set_marginal_parameters(self): bias_target = self.cov_target.dot(T1.T.dot(-T4.dot(self.observed_target) + self.M1.dot(self.opt_linear.dot(self.cond_mean))) - _P) ###set parameters for the marginal distribution of optimization variables + _Q = np.linalg.inv(prec_target_nosel + T3) self.prec_marginal = self.cond_precision - T5.T.dot(_Q).dot(T5) self.linear_coef = self.cond_cov.dot(T5.T) self.offset_coef = self.cond_mean - self.linear_coef.dot(self.observed_target) ###set parameters for the marginal distribution of target + r = np.linalg.inv(prec_target_nosel).dot(self.prec_target.dot(bias_target)) S = np.linalg.inv(prec_target_nosel).dot(self.prec_target) diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index f9237e562..a423a0fb8 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -8,6 +8,7 @@ from .posterior_inference import posterior from .selective_MLE_utils import solve_barrier_affine as solve_barrier_affine_C from .approx_reference import approximate_grid_inference +from .exact_reference import exact_grid_inference class query(object): r""" @@ -226,8 +227,8 @@ def prior(target_parameter): def approximate_grid_inference(self, target_spec, - solve_args={'tol': 1.e-12}, - useIP=True): + useIP=True, + solve_args={'tol': 1.e-12}): """ Parameters @@ -252,6 +253,32 @@ def approximate_grid_inference(self, return G.summary(alternatives=target_spec.alternatives) + def exact_grid_inference(self, + target_spec, + solve_args={'tol': 1.e-12}): + + """ + Parameters + ---------- + observed_target : ndarray + Observed estimate of target. 
+ cov_target : ndarray + Estimated covaraince of target. + regress_target_score : ndarray + Estimated covariance of target and score of randomized query. + alternatives : [str], optional + Sequence of strings describing the alternatives, + should be values of ['twosided', 'less', 'greater'] + solve_args : dict, optional + Arguments passed to solver. + """ + + G = exact_grid_inference(self, + target_spec, + solve_args=solve_args) + + return G.summary(alternatives=target_spec.alternatives) + class multiple_queries(object): ''' @@ -540,9 +567,6 @@ def selective_MLE(target_spec, Conditional mean of optimization variables given target. cond_cov : ndarray Conditional covariance of optimization variables given target. - regress_opt : ndarray - Describes how conditional mean of optimization - variables varies with target. linear_part : ndarray Linear part of affine constraints: $\{o:Ao \leq b\}$ offset : ndarray diff --git a/selectinf/randomized/tests/test_posterior.py b/selectinf/randomized/tests/test_posterior.py index 3d972a585..2757c06da 100644 --- a/selectinf/randomized/tests/test_posterior.py +++ b/selectinf/randomized/tests/test_posterior.py @@ -263,109 +263,109 @@ def prior(target_parameter): return samples -def test_hiv_data(nsample=10000, - nburnin=500, - level=0.90, - split_proportion=0.50, - seedn=1): - np.random.seed(seedn) - - alpha = (1 - level) / 2 - Z_quantile = ndist.ppf(1 - alpha) - - X, Y, _ = HIV_NRTI(standardize=True) - Y *= 15 - n, p = X.shape - X /= np.sqrt(n) - - ols_fit = np.linalg.pinv(X).dot(Y) - _sigma = np.linalg.norm(Y - X.dot(ols_fit)) / np.sqrt(n - p - 1) - - const = split_lasso.gaussian - - dispersion = _sigma ** 2 - - W = 1 * np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * _sigma - - conv = const(X, - Y, - W, - proportion=split_proportion) - - signs = conv.fit() - nonzero = signs != 0 - - conv.setup_inference() - - target_spec = selected_targets(conv.loglike, - conv.observed_soln, - nonzero, - dispersion=dispersion) - - mle, inverse_info = conv.selective_MLE(target_spec, - level=level, - solve_args={'tol': 1.e-12})[:2] - - approx_inf = conv.approximate_grid_inference(target_spec) - - posterior_inf = conv.posterior(target_spec, - dispersion=dispersion) - - samples_langevin = langevin_sampler(posterior_inf, - nsample=nsample, - nburnin=nburnin, - step=1.) 
- - lower_langevin = np.percentile(samples_langevin, int(alpha * 100), axis=0) - upper_langevin = np.percentile(samples_langevin, int((1 - alpha) * 100), axis=0) - - samples_gibbs, scale_gibbs = gibbs_sampler(posterior_inf, - nsample=nsample, - nburnin=nburnin) - - lower_gibbs = np.percentile(samples_gibbs, int(alpha * 100), axis=0) - upper_gibbs = np.percentile(samples_gibbs, int((1 - alpha) * 100), axis=0) - - naive_est = np.linalg.pinv(X[:, nonzero]).dot(Y) - naive_cov = dispersion * np.linalg.inv(X[:, nonzero].T.dot(X[:, nonzero])) - naive_intervals = np.vstack([naive_est - Z_quantile * np.sqrt(np.diag(naive_cov)), - naive_est + Z_quantile * np.sqrt(np.diag(naive_cov))]).T - - X_split = X[~conv._selection_idx, :] - Y_split = Y[~conv._selection_idx] - split_est = np.linalg.pinv(X_split[:, nonzero]).dot(Y_split) - split_cov = dispersion * np.linalg.inv(X_split[:, nonzero].T.dot(X_split[:, nonzero])) - split_intervals = np.vstack([split_est - Z_quantile * np.sqrt(np.diag(split_cov)), - split_est + Z_quantile * np.sqrt(np.diag(split_cov))]).T - - print("lengths: adjusted intervals Langevin, Gibbs, MLE1, MLE2, approx ", - np.mean(upper_langevin - lower_langevin), - np.mean(upper_gibbs - lower_gibbs), - np.mean((2 * Z_quantile) * np.sqrt(np.diag(posterior_inf.inverse_info))), - np.mean(mle['upper_confidence'] - mle['lower_confidence']), - np.mean(approx_inf['upper_confidence'] - approx_inf['lower_confidence']) - ) - - print("lengths: naive intervals ", np.mean(naive_intervals[:, 1] - naive_intervals[:, 0])) - - print("lengths: split intervals ", np.mean(split_intervals[:, 1] - split_intervals[:, 0])) - - scale_interval = np.percentile(scale_gibbs, [alpha * 100, (1 - alpha) * 100]) - output = pd.DataFrame({'Langevin_lower_credible': lower_langevin, - 'Langevin_upper_credible': upper_langevin, - 'Gibbs_lower_credible': lower_gibbs, - 'Gibbs_upper_credible': upper_gibbs, - 'MLE_lower_confidence': mle['lower_confidence'], - 'MLE_upper_confidence': mle['upper_confidence'], - 'approx_lower_confidence': approx_inf['lower_confidence'], - 'approx_upper_confidence': approx_inf['upper_confidence'], - 'Split_lower_confidence': split_intervals[:, 0], - 'Split_upper_confidence': split_intervals[:, 1], - 'Naive_lower_confidence': naive_intervals[:, 0], - 'Naive_upper_confidence': naive_intervals[:, 1] - }) - - return output, scale_interval, _sigma +# def test_hiv_data(nsample=10000, +# nburnin=500, +# level=0.90, +# split_proportion=0.50, +# seedn=1): +# np.random.seed(seedn) +# +# alpha = (1 - level) / 2 +# Z_quantile = ndist.ppf(1 - alpha) +# +# X, Y, _ = HIV_NRTI(standardize=True) +# Y *= 15 +# n, p = X.shape +# X /= np.sqrt(n) +# +# ols_fit = np.linalg.pinv(X).dot(Y) +# _sigma = np.linalg.norm(Y - X.dot(ols_fit)) / np.sqrt(n - p - 1) +# +# const = split_lasso.gaussian +# +# dispersion = _sigma ** 2 +# +# W = 1 * np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * _sigma +# +# conv = const(X, +# Y, +# W, +# proportion=split_proportion) +# +# signs = conv.fit() +# nonzero = signs != 0 +# +# conv.setup_inference() +# +# target_spec = selected_targets(conv.loglike, +# conv.observed_soln, +# nonzero, +# dispersion=dispersion) +# +# mle, inverse_info = conv.selective_MLE(target_spec, +# level=level, +# solve_args={'tol': 1.e-12})[:2] +# +# approx_inf = conv.approximate_grid_inference(target_spec) +# +# posterior_inf = conv.posterior(target_spec, +# dispersion=dispersion) +# +# samples_langevin = langevin_sampler(posterior_inf, +# nsample=nsample, +# nburnin=nburnin, +# step=1.) 
+# +# lower_langevin = np.percentile(samples_langevin, int(alpha * 100), axis=0) +# upper_langevin = np.percentile(samples_langevin, int((1 - alpha) * 100), axis=0) +# +# samples_gibbs, scale_gibbs = gibbs_sampler(posterior_inf, +# nsample=nsample, +# nburnin=nburnin) +# +# lower_gibbs = np.percentile(samples_gibbs, int(alpha * 100), axis=0) +# upper_gibbs = np.percentile(samples_gibbs, int((1 - alpha) * 100), axis=0) +# +# naive_est = np.linalg.pinv(X[:, nonzero]).dot(Y) +# naive_cov = dispersion * np.linalg.inv(X[:, nonzero].T.dot(X[:, nonzero])) +# naive_intervals = np.vstack([naive_est - Z_quantile * np.sqrt(np.diag(naive_cov)), +# naive_est + Z_quantile * np.sqrt(np.diag(naive_cov))]).T +# +# X_split = X[~conv._selection_idx, :] +# Y_split = Y[~conv._selection_idx] +# split_est = np.linalg.pinv(X_split[:, nonzero]).dot(Y_split) +# split_cov = dispersion * np.linalg.inv(X_split[:, nonzero].T.dot(X_split[:, nonzero])) +# split_intervals = np.vstack([split_est - Z_quantile * np.sqrt(np.diag(split_cov)), +# split_est + Z_quantile * np.sqrt(np.diag(split_cov))]).T +# +# print("lengths: adjusted intervals Langevin, Gibbs, MLE1, MLE2, approx ", +# np.mean(upper_langevin - lower_langevin), +# np.mean(upper_gibbs - lower_gibbs), +# np.mean((2 * Z_quantile) * np.sqrt(np.diag(posterior_inf.inverse_info))), +# np.mean(mle['upper_confidence'] - mle['lower_confidence']), +# np.mean(approx_inf['upper_confidence'] - approx_inf['lower_confidence']) +# ) +# +# print("lengths: naive intervals ", np.mean(naive_intervals[:, 1] - naive_intervals[:, 0])) +# +# print("lengths: split intervals ", np.mean(split_intervals[:, 1] - split_intervals[:, 0])) +# +# scale_interval = np.percentile(scale_gibbs, [alpha * 100, (1 - alpha) * 100]) +# output = pd.DataFrame({'Langevin_lower_credible': lower_langevin, +# 'Langevin_upper_credible': upper_langevin, +# 'Gibbs_lower_credible': lower_gibbs, +# 'Gibbs_upper_credible': upper_gibbs, +# 'MLE_lower_confidence': mle['lower_confidence'], +# 'MLE_upper_confidence': mle['upper_confidence'], +# 'approx_lower_confidence': approx_inf['lower_confidence'], +# 'approx_upper_confidence': approx_inf['upper_confidence'], +# 'Split_lower_confidence': split_intervals[:, 0], +# 'Split_upper_confidence': split_intervals[:, 1], +# 'Naive_lower_confidence': naive_intervals[:, 0], +# 'Naive_upper_confidence': naive_intervals[:, 1] +# }) +# +# return output, scale_interval, _sigma if __name__ == "__main__": diff --git a/selectinf/randomized/tests/test_selective_MLE_high.py b/selectinf/randomized/tests/test_selective_MLE_high.py index 947e75bcd..444748b8d 100644 --- a/selectinf/randomized/tests/test_selective_MLE_high.py +++ b/selectinf/randomized/tests/test_selective_MLE_high.py @@ -174,7 +174,6 @@ def test_instance(): M = E.copy() M[-3:] = 1 - print("check ", M) dispersion = np.linalg.norm(Y - X[:, M].dot(np.linalg.pinv(X[:, M]).dot(Y))) ** 2 / (n - M.sum()) L.setup_inference(dispersion=dispersion) @@ -196,7 +195,6 @@ def test_instance(): return coverage - def test_selected_targets_disperse(n=500, p=100, s=5, From 15aaa6e69c9235934b91733a85a6a1541a8a8a6c Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Wed, 3 Nov 2021 17:28:13 -0400 Subject: [PATCH 155/187] added class for MLE based inference --- selectinf/randomized/selective_MLE.py | 118 ++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 selectinf/randomized/selective_MLE.py diff --git a/selectinf/randomized/selective_MLE.py b/selectinf/randomized/selective_MLE.py new file mode 100644 index 
000000000..2b8f6f9e9 --- /dev/null +++ b/selectinf/randomized/selective_MLE.py @@ -0,0 +1,118 @@ +from __future__ import division, print_function + +import numpy as np, pandas as pd +from scipy.stats import norm as ndist +from .selective_MLE_utils import solve_barrier_affine as solve_barrier_affine_C +from ..algorithms.barrier_affine import solve_barrier_affine_py + +class selective_MLE(object): + + def __init__(self, + query, + target_spec, + solve_args={'tol': 1.e-12}): + + self.solve_args = solve_args + + (observed_target, + cov_target, + regress_target_score) = target_spec[:3] + + self.observed_target = observed_target + self.cov_target = cov_target + self.prec_target = np.linalg.inv(cov_target) + self.regress_target_score = regress_target_score + + self.cond_mean = query.cond_mean + self.cond_cov = query.cond_cov + self.prec_opt = np.linalg.inv(self.cond_cov) + self.opt_linear = query.opt_linear + + self.linear_part = query.sampler.affine_con.linear_part + self.offset = query.sampler.affine_con.offset + + self.M1 = query.M1 + self.M2 = query.M2 + self.M3 = query.M3 + self.observed_soln = query.observed_opt_state + + self.observed_score = query.observed_score_state + query.observed_subgrad + + self._setup_estimating_eqn() + + def mle_inference(self, useC= False, level=0.90): + + conjugate_arg = self.prec_opt.dot(self.cond_mean) + if useC: + solver = solve_barrier_affine_C + else: + solver = solve_barrier_affine_py + + val, soln, hess = solver(conjugate_arg, + self.prec_opt, + self.observed_soln, + self.linear_part, + self.offset, + **self.solve_args) + + final_estimator = self.cov_target.dot(self.prec_target_nosel).dot(self.observed_target) \ + + self.regress_target_score.dot(self.M1.dot(self.opt_linear)).dot(self.cond_mean - soln) \ + - self.bias_target + + observed_info_natural = self.prec_target_nosel + self.T3 - self.T5.dot(self.hess.dot(self.T5.T)) + + unbiased_estimator = self.cov_target.dot(self.prec_target_nosel).dot(self.observed_target) - self.bias_target + + observed_info_mean = self.cov_target.dot(observed_info_natural.dot(self.cov_target)) + + Z_scores = final_estimator / np.sqrt(np.diag(observed_info_mean)) + + pvalues = ndist.cdf(Z_scores) + + pvalues = 2 * np.minimum(pvalues, 1 - pvalues) + + alpha = 1. - level + + quantile = ndist.ppf(1 - alpha / 2.) + + intervals = np.vstack([final_estimator - quantile * np.sqrt(np.diag(observed_info_mean)), + final_estimator + quantile * np.sqrt(np.diag(observed_info_mean))]).T + + log_ref = val + conjugate_arg.T.dot(self.cond_cov).dot(conjugate_arg) / 2. 
+ + result = pd.DataFrame({'MLE': final_estimator, + 'SE': np.sqrt(np.diag(observed_info_mean)), + 'Zvalue': Z_scores, + 'pvalue': pvalues, + 'lower_confidence': intervals[:, 0], + 'upper_confidence': intervals[:, 1], + 'unbiased': unbiased_estimator}) + + return result, observed_info_mean, log_ref + + def _setup_estimating_eqn(self): + + T1 = self.regress_target_score.T.dot(self.prec_target) + T2 = T1.T.dot(self.M2.dot(T1)) + T3 = T1.T.dot(self.M3.dot(T1)) + T4 = self.M1.dot(self.opt_linear).dot(self.cond_cov).dot(self.opt_linear.T.dot(self.M1.T.dot(T1))) + T5 = T1.T.dot(self.M1.dot(self.opt_linear)) + + self.prec_target_nosel = self.prec_target + T2 - T3 + + _P = -(T1.T.dot(self.M1.dot(self.observed_score)) + T2.dot(self.observed_target)) + + self.bias_target = self.cov_target.dot(T1.T.dot(-T4.dot(self.observed_target) + + self.M1.dot(self.opt_linear.dot(self.cond_mean))) - _P) + + self.T3 = T3 + self.T5 = T5 + + + + + + + + + From 56b0902d630f207ac4b002a2b173f53de77600de Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Wed, 3 Nov 2021 17:28:44 -0400 Subject: [PATCH 156/187] other changes --- selectinf/randomized/approx_reference.py | 4 +- selectinf/randomized/exact_reference.py | 5 +- selectinf/randomized/posterior_inference.py | 4 +- selectinf/randomized/query.py | 253 +------------------- selectinf/randomized/tests/test_naive.py | 43 +++- 5 files changed, 47 insertions(+), 262 deletions(-) diff --git a/selectinf/randomized/approx_reference.py b/selectinf/randomized/approx_reference.py index 31c3b88e3..38483f4c6 100644 --- a/selectinf/randomized/approx_reference.py +++ b/selectinf/randomized/approx_reference.py @@ -39,8 +39,8 @@ def __init__(self, self.solve_args = solve_args - linear_part = query.sampler.affine_con.linear_part - offset = query.sampler.affine_con.offset + linear_part = query.affine_con.linear_part + offset = query.affine_con.offset opt_linear = query.opt_linear diff --git a/selectinf/randomized/exact_reference.py b/selectinf/randomized/exact_reference.py index bcc78f3b7..ce799a47c 100644 --- a/selectinf/randomized/exact_reference.py +++ b/selectinf/randomized/exact_reference.py @@ -1,7 +1,6 @@ from __future__ import division, print_function import numpy as np, pandas as pd -from scipy.interpolate import interp1d from scipy.stats import norm as ndist from ..distributions.discrete_family import discrete_family @@ -39,8 +38,8 @@ def __init__(self, self.solve_args = solve_args - linear_part = query.sampler.affine_con.linear_part - offset = query.sampler.affine_con.offset + linear_part = query.affine_con.linear_part + offset = query.affine_con.offset opt_linear = query.opt_linear diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index 2467a35e4..7fa5b377b 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -47,8 +47,8 @@ def __init__(self, cov_target, regress_target_score) = target_spec[:3] - linear_part = query.sampler.affine_con.linear_part - offset = query.sampler.affine_con.offset + linear_part = query.affine_con.linear_part + offset = query.affine_con.offset opt_linear = query.opt_linear diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index a423a0fb8..9a562c8e4 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -6,7 +6,6 @@ from ..algorithms.barrier_affine import solve_barrier_affine_py from .posterior_inference import posterior -from .selective_MLE_utils import solve_barrier_affine as 
solve_barrier_affine_C from .approx_reference import approximate_grid_inference from .exact_reference import exact_grid_inference @@ -280,257 +279,7 @@ def exact_grid_inference(self, return G.summary(alternatives=target_spec.alternatives) -class multiple_queries(object): - ''' - Combine several queries of a given data - through randomized algorithms. - ''' - - def __init__(self, objectives): - ''' - Parameters - ---------- - objectives : sequence - A sequences of randomized objective functions. - Notes - ----- - Each element of `objectives` must - have a `setup_sampler` method that returns - a description of the distribution of the - data implicated in the objective function, - typically through the score or gradient - of the objective function. - These descriptions are passed to a function - `form_covariances` to linearly decompose - each score in terms of a target - and an asymptotically independent piece. - Returns - ------- - None - ''' - - self.objectives = objectives - - def fit(self): - for objective in self.objectives: - if not objective._setup: - objective.fit() - - def summary(self, - target_specs, - # a sequence of target_specs - # objects in theory all cov_target - # should be about the same. as should the observed_target - alternatives=None, - parameter=None, - level=0.9, - ndraw=5000, - burnin=2000, - compute_intervals=False): - - """ - Produce p-values and confidence intervals for targets - of model including selected features - Parameters - ---------- - observed_target : ndarray - Observed estimate of target. - alternatives : [str], optional - Sequence of strings describing the alternatives, - should be values of ['twosided', 'less', 'greater'] - parameter : np.array - Hypothesized value for parameter -- defaults to 0. - level : float - Confidence level. - ndraw : int (optional) - Defaults to 1000. - burnin : int (optional) - Defaults to 1000. - compute_intervals : bool - Compute confidence intervals? 
- """ - - observed_target = target_specs[0].observed_target - alternatives = target_specs[0].alternatives - - if parameter is None: - parameter = np.zeros_like(observed_target) - - if alternatives is None: - alternatives = ['twosided'] * observed_target.shape[0] - - if len(self.objectives) != len(target_specs): - raise ValueError("number of objectives and sampling cov infos do not match") - - self.opt_sampling_info = [] - for i in range(len(self.objectives)): - if target_specs[i].cov_target is None or target_specs[i].regress_target_score is None: - raise ValueError("did not input target and score covariance info") - opt_sample, opt_logW = self.objectives[i].sampler.sample(ndraw, burnin) - self.opt_sampling_info.append((self.objectives[i].sampler, - opt_sample, - opt_logW, - target_specs[i].cov_target, - target_specs[i].regress_target_score)) - - pivots = self.coefficient_pvalues(observed_target, - parameter=parameter, - alternatives=alternatives) - - if not np.all(parameter == 0): - pvalues = self.coefficient_pvalues(observed_target, - parameter=np.zeros_like(observed_target), - alternatives=alternatives) - else: - pvalues = pivots - - intervals = None - if compute_intervals: - intervals = self.confidence_intervals(observed_target, - level) - - result = pd.DataFrame({'target': observed_target, - 'pvalue': pvalues, - 'lower_confidence': intervals[:, 0], - 'upper_confidence': intervals[:, 1]}) - - if not np.all(parameter == 0): - result.insert(4, 'pivot', pivots) - result.insert(5, 'parameter', parameter) - - return result - - def coefficient_pvalues(self, - observed_target, - parameter=None, - sample_args=(), - alternatives=None): - - ''' - Construct selective p-values - for each parameter of the target. - Parameters - ---------- - observed_target : ndarray - Observed estimate of target. - parameter : ndarray (optional) - A vector of parameters with shape `self.shape` - at which to evaluate p-values. Defaults - to `np.zeros(self.shape)`. - sample_args : sequence - Arguments to `self.sample` if sample is not found - for a given objective. - alternatives : [str], optional - Sequence of strings describing the alternatives, - should be values of ['twosided', 'less', 'greater'] - Returns - ------- - pvalues : ndarray - ''' - - for i in range(len(self.objectives)): - if self.opt_sampling_info[i][1] is None: - _sample, _logW = self.objectives[i].sampler.sample(*sample_args) - self.opt_sampling_info[i][1] = _sample - self.opt_sampling_info[i][2] = _logW - - ndraw = self.opt_sampling_info[0][1].shape[0] # nsample for normal samples taken from the 1st objective - - _intervals = optimization_intervals(self.opt_sampling_info, - observed_target, - ndraw) - - pvals = [] - - for i in range(observed_target.shape[0]): - keep = np.zeros_like(observed_target) - keep[i] = 1. - pvals.append(_intervals.pivot(keep, candidate=parameter[i], alternative=alternatives[i])) - - return np.array(pvals) - - def confidence_intervals(self, - target_specs, - sample_args=(), - level=0.9): - - ''' - Construct selective confidence intervals - for each parameter of the target. - Parameters - ---------- - observed_target : ndarray - Observed estimate of target. - sample_args : sequence - Arguments to `self.sample` if sample is not found - for a given objective. - level : float - Confidence level. - Returns - ------- - limits : ndarray - Confidence intervals for each target. 
- ''' - - for i in range(len(self.objectives)): - if self.opt_sampling_info[i][1] is None: - _sample, _logW = self.objectives[i].sampler.sample(*sample_args) - self.opt_sampling_info[i][1] = _sample - self.opt_sampling_info[i][2] = _logW - - ndraw = self.opt_sampling_info[0][1].shape[0] # nsample for normal samples taken from the 1st objective - - _intervals = optimization_intervals(self.opt_sampling_info, - observed_target, - ndraw) - - limits = [] - - for i in range(observed_target.shape[0]): - keep = np.zeros_like(observed_target) - keep[i] = 1. - limits.append(_intervals.confidence_interval(keep, level=level)) - - return np.array(limits) - - -def naive_confidence_intervals(diag_cov, observed, level=0.9): - """ - Compute naive Gaussian based confidence - intervals for target. - Parameters - ---------- - diag_cov : diagonal of a covariance matrix - observed : np.float - A vector of observed data of shape `target.shape` - alpha : float (optional) - 1 - confidence level. - Returns - ------- - intervals : np.float - Gaussian based confidence intervals. - """ - alpha = 1 - level - diag_cov = np.asarray(diag_cov) - p = diag_cov.shape[0] - quantile = - ndist.ppf(alpha / 2) - LU = np.zeros((2, p)) - for j in range(p): - sigma = np.sqrt(diag_cov[j]) - LU[0, j] = observed[j] - sigma * quantile - LU[1, j] = observed[j] + sigma * quantile - return LU.T - - -def naive_pvalues(diag_cov, observed, parameter): - diag_cov = np.asarray(diag_cov) - p = diag_cov.shape[0] - pvalues = np.zeros(p) - for j in range(p): - sigma = np.sqrt(diag_cov[j]) - pval = ndist.cdf((observed[j] - parameter[j]) / sigma) - pvalues[j] = 2 * min(pval, 1 - pval) - return pvalues +from .selective_MLE_utils import solve_barrier_affine as solve_barrier_affine_C def selective_MLE(target_spec, observed_soln, # initial (observed) value of diff --git a/selectinf/randomized/tests/test_naive.py b/selectinf/randomized/tests/test_naive.py index 584535dc7..56f7f5515 100644 --- a/selectinf/randomized/tests/test_naive.py +++ b/selectinf/randomized/tests/test_naive.py @@ -1,15 +1,52 @@ import numpy as np import regreg.api as rr -import pandas as pd from scipy.stats import norm as ndist -from scipy.optimize import bisect + from ...tests.instance import gaussian_instance from ...algorithms.lasso import lasso from ...tests.flags import SMALL_SAMPLES, SET_SEED from ...tests.decorators import wait_for_return_value, set_seed_iftrue, set_sampling_params_iftrue from ..cv_view import CV_view, have_glmnet -from ..query import (naive_pvalues, naive_confidence_intervals) + + +def naive_confidence_intervals(diag_cov, observed, level=0.9): + """ + Compute naive Gaussian based confidence + intervals for target. + Parameters + ---------- + diag_cov : diagonal of a covariance matrix + observed : np.float + A vector of observed data of shape `target.shape` + alpha : float (optional) + 1 - confidence level. + Returns + ------- + intervals : np.float + Gaussian based confidence intervals. 
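As a quick standalone illustration of what this helper returns (toy numbers, not part of the patch): each coordinate's interval is just the observed value plus or minus the normal quantile times its standard error.

    import numpy as np
    from scipy.stats import norm as ndist

    observed = np.array([1.0, -0.5, 2.0])    # toy observed targets
    diag_cov = np.array([0.25, 1.0, 4.0])    # toy variances
    level = 0.9
    quantile = -ndist.ppf((1 - level) / 2)   # about 1.645 at level 0.9
    lower = observed - quantile * np.sqrt(diag_cov)
    upper = observed + quantile * np.sqrt(diag_cov)
    # naive_confidence_intervals(diag_cov, observed, level) stacks these
    # as rows (lower[j], upper[j]), one row per coordinate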
+ """ + alpha = 1 - level + diag_cov = np.asarray(diag_cov) + p = diag_cov.shape[0] + quantile = - ndist.ppf(alpha / 2) + LU = np.zeros((2, p)) + for j in range(p): + sigma = np.sqrt(diag_cov[j]) + LU[0, j] = observed[j] - sigma * quantile + LU[1, j] = observed[j] + sigma * quantile + return LU.T + + +def naive_pvalues(diag_cov, observed, parameter): + diag_cov = np.asarray(diag_cov) + p = diag_cov.shape[0] + pvalues = np.zeros(p) + for j in range(p): + sigma = np.sqrt(diag_cov[j]) + pval = ndist.cdf((observed[j] - parameter[j]) / sigma) + pvalues[j] = 2 * min(pval, 1 - pval) + return pvalues def compute_projection_parameters(n, p, s, signal, rho, sigma, active): multiple = 10**2 From 91353fc2ce3bf9637adc975263ea0a0ddcef4f87 Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Thu, 4 Nov 2021 16:12:16 -0400 Subject: [PATCH 157/187] some more clean up for query --- selectinf/randomized/query.py | 287 +++++++++++++------------- selectinf/randomized/selective_MLE.py | 2 +- 2 files changed, 147 insertions(+), 142 deletions(-) diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index 9a562c8e4..3f0610abb 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -1,13 +1,10 @@ import numpy as np -import pandas as pd -from scipy.stats import norm as ndist from ..constraints.affine import constraints -from ..algorithms.barrier_affine import solve_barrier_affine_py - from .posterior_inference import posterior from .approx_reference import approximate_grid_inference from .exact_reference import exact_grid_inference +from .selective_MLE import mle_inference class query(object): r""" @@ -170,21 +167,31 @@ def selective_MLE(self, level=0.90, solve_args={'tol': 1.e-12}): - return selective_MLE(target_spec, - self.observed_opt_state, - self.affine_con.mean, - self.affine_con.covariance, - self.affine_con.linear_part, - self.affine_con.offset, - self.opt_linear, - self.M1, - self.M2, - self.M3, - self.observed_score_state + self.observed_subgrad, - solve_args=solve_args, - level=level, - useC=False) - + G = mle_inference(self, + target_spec, + solve_args=solve_args) + + return G.mle_inference(level=level) + + # def selective_MLE(self, + # target_spec, + # level=0.90, + # solve_args={'tol': 1.e-12}): + # + # return selective_MLE(target_spec, + # self.observed_opt_state, + # self.affine_con.mean, + # self.affine_con.covariance, + # self.affine_con.linear_part, + # self.affine_con.offset, + # self.opt_linear, + # self.M1, + # self.M2, + # self.M3, + # self.observed_score_state + self.observed_subgrad, + # solve_args=solve_args, + # level=level, + # useC=False) def posterior(self, target_spec, @@ -279,126 +286,124 @@ def exact_grid_inference(self, return G.summary(alternatives=target_spec.alternatives) -from .selective_MLE_utils import solve_barrier_affine as solve_barrier_affine_C - -def selective_MLE(target_spec, - observed_soln, # initial (observed) value of - # optimization variables -- used as a - # feasible point. precise value used - # only for independent estimator - cond_mean, - cond_cov, - linear_part, - offset, - opt_linear, - M1, - M2, - M3, - observed_score, - solve_args={'tol': 1.e-12}, - level=0.9, - useC=False): - - """ - Selective MLE based on approximation of - CGF. - Parameters - ---------- - observed_target : ndarray - Observed estimate of target. - cov_target : ndarray - Estimated covaraince of target. - regress_target_score : ndarray - Estimated regression coefficient of target on score. 
- observed_soln : ndarray - Feasible point for optimization problem. - cond_mean : ndarray - Conditional mean of optimization variables given target. - cond_cov : ndarray - Conditional covariance of optimization variables given target. - linear_part : ndarray - Linear part of affine constraints: $\{o:Ao \leq b\}$ - offset : ndarray - Offset part of affine constraints: $\{o:Ao \leq b\}$ - solve_args : dict, optional - Arguments passed to solver. - level : float, optional - Confidence level. - useC : bool, optional - Use python or C solver. - """ - - (observed_target, - cov_target, - regress_target_score) = target_spec[:3] - - if np.asarray(observed_target).shape in [(), (0,)]: - raise ValueError('no target specified') - - observed_target = np.atleast_1d(observed_target) - prec_target = np.linalg.inv(cov_target) - - prec_opt = np.linalg.inv(cond_cov) - - # this is specific to target - - T1 = regress_target_score.T.dot(prec_target) - T2 = T1.T.dot(M2.dot(T1)) - T3 = T1.T.dot(M3.dot(T1)) - T4 = M1.dot(opt_linear).dot(cond_cov).dot(opt_linear.T.dot(M1.T.dot(T1))) - T5 = T1.T.dot(M1.dot(opt_linear)) - - prec_target_nosel = prec_target + T2 - T3 - - _P = -(T1.T.dot(M1.dot(observed_score)) + T2.dot(observed_target)) ##flipped sign of second term here - - bias_target = cov_target.dot(T1.T.dot(-T4.dot(observed_target) + M1.dot(opt_linear.dot(cond_mean))) - _P) - - conjugate_arg = prec_opt.dot(cond_mean) - - if useC: - solver = solve_barrier_affine_C - else: - solver = solve_barrier_affine_py - - val, soln, hess = solver(conjugate_arg, - prec_opt, - observed_soln, - linear_part, - offset, - **solve_args) - - final_estimator = cov_target.dot(prec_target_nosel).dot(observed_target) \ - + regress_target_score.dot(M1.dot(opt_linear)).dot(cond_mean - soln) - bias_target - - observed_info_natural = prec_target_nosel + T3 - T5.dot(hess.dot(T5.T)) - - unbiased_estimator = cov_target.dot(prec_target_nosel).dot(observed_target) - bias_target - - observed_info_mean = cov_target.dot(observed_info_natural.dot(cov_target)) - - Z_scores = final_estimator / np.sqrt(np.diag(observed_info_mean)) - - pvalues = ndist.cdf(Z_scores) - - pvalues = 2 * np.minimum(pvalues, 1 - pvalues) - - alpha = 1. - level - - quantile = ndist.ppf(1 - alpha / 2.) - - intervals = np.vstack([final_estimator - quantile * np.sqrt(np.diag(observed_info_mean)), - final_estimator + quantile * np.sqrt(np.diag(observed_info_mean))]).T - - log_ref = val + conjugate_arg.T.dot(cond_cov).dot(conjugate_arg) / 2. - - result = pd.DataFrame({'MLE': final_estimator, - 'SE': np.sqrt(np.diag(observed_info_mean)), - 'Zvalue': Z_scores, - 'pvalue': pvalues, - 'lower_confidence': intervals[:, 0], - 'upper_confidence': intervals[:, 1], - 'unbiased': unbiased_estimator}) - - return result, observed_info_mean, log_ref +# def selective_MLE(target_spec, +# observed_soln, # initial (observed) value of +# # optimization variables -- used as a +# # feasible point. precise value used +# # only for independent estimator +# cond_mean, +# cond_cov, +# linear_part, +# offset, +# opt_linear, +# M1, +# M2, +# M3, +# observed_score, +# solve_args={'tol': 1.e-12}, +# level=0.9, +# useC=False): +# +# """ +# Selective MLE based on approximation of +# CGF. +# Parameters +# ---------- +# observed_target : ndarray +# Observed estimate of target. +# cov_target : ndarray +# Estimated covaraince of target. +# regress_target_score : ndarray +# Estimated regression coefficient of target on score. +# observed_soln : ndarray +# Feasible point for optimization problem. 
+# cond_mean : ndarray +# Conditional mean of optimization variables given target. +# cond_cov : ndarray +# Conditional covariance of optimization variables given target. +# linear_part : ndarray +# Linear part of affine constraints: $\{o:Ao \leq b\}$ +# offset : ndarray +# Offset part of affine constraints: $\{o:Ao \leq b\}$ +# solve_args : dict, optional +# Arguments passed to solver. +# level : float, optional +# Confidence level. +# useC : bool, optional +# Use python or C solver. +# """ +# +# (observed_target, +# cov_target, +# regress_target_score) = target_spec[:3] +# +# if np.asarray(observed_target).shape in [(), (0,)]: +# raise ValueError('no target specified') +# +# observed_target = np.atleast_1d(observed_target) +# prec_target = np.linalg.inv(cov_target) +# +# prec_opt = np.linalg.inv(cond_cov) +# +# # this is specific to target +# +# T1 = regress_target_score.T.dot(prec_target) +# T2 = T1.T.dot(M2.dot(T1)) +# T3 = T1.T.dot(M3.dot(T1)) +# T4 = M1.dot(opt_linear).dot(cond_cov).dot(opt_linear.T.dot(M1.T.dot(T1))) +# T5 = T1.T.dot(M1.dot(opt_linear)) +# +# prec_target_nosel = prec_target + T2 - T3 +# +# _P = -(T1.T.dot(M1.dot(observed_score)) + T2.dot(observed_target)) ##flipped sign of second term here +# +# bias_target = cov_target.dot(T1.T.dot(-T4.dot(observed_target) + M1.dot(opt_linear.dot(cond_mean))) - _P) +# +# conjugate_arg = prec_opt.dot(cond_mean) +# +# if useC: +# solver = solve_barrier_affine_C +# else: +# solver = solve_barrier_affine_py +# +# val, soln, hess = solver(conjugate_arg, +# prec_opt, +# observed_soln, +# linear_part, +# offset, +# **solve_args) +# +# final_estimator = cov_target.dot(prec_target_nosel).dot(observed_target) \ +# + regress_target_score.dot(M1.dot(opt_linear)).dot(cond_mean - soln) - bias_target +# +# observed_info_natural = prec_target_nosel + T3 - T5.dot(hess.dot(T5.T)) +# +# unbiased_estimator = cov_target.dot(prec_target_nosel).dot(observed_target) - bias_target +# +# observed_info_mean = cov_target.dot(observed_info_natural.dot(cov_target)) +# +# Z_scores = final_estimator / np.sqrt(np.diag(observed_info_mean)) +# +# pvalues = ndist.cdf(Z_scores) +# +# pvalues = 2 * np.minimum(pvalues, 1 - pvalues) +# +# alpha = 1. - level +# +# quantile = ndist.ppf(1 - alpha / 2.) +# +# intervals = np.vstack([final_estimator - quantile * np.sqrt(np.diag(observed_info_mean)), +# final_estimator + quantile * np.sqrt(np.diag(observed_info_mean))]).T +# +# log_ref = val + conjugate_arg.T.dot(cond_cov).dot(conjugate_arg) / 2. 
+# +# result = pd.DataFrame({'MLE': final_estimator, +# 'SE': np.sqrt(np.diag(observed_info_mean)), +# 'Zvalue': Z_scores, +# 'pvalue': pvalues, +# 'lower_confidence': intervals[:, 0], +# 'upper_confidence': intervals[:, 1], +# 'unbiased': unbiased_estimator}) +# +# return result, observed_info_mean, log_ref diff --git a/selectinf/randomized/selective_MLE.py b/selectinf/randomized/selective_MLE.py index 2b8f6f9e9..96ceae97b 100644 --- a/selectinf/randomized/selective_MLE.py +++ b/selectinf/randomized/selective_MLE.py @@ -5,7 +5,7 @@ from .selective_MLE_utils import solve_barrier_affine as solve_barrier_affine_C from ..algorithms.barrier_affine import solve_barrier_affine_py -class selective_MLE(object): +class mle_inference(object): def __init__(self, query, From 5e448a19daae0d68f1f83b313b5d496c58ccecea Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Thu, 4 Nov 2021 16:40:46 -0400 Subject: [PATCH 158/187] removed regress_opt from return list --- selectinf/randomized/query.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index 3f0610abb..6d8d0749a 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -103,7 +103,6 @@ def _setup_sampler(self, (cond_mean, cond_cov, cond_precision, - regress_opt, M1, M2, M3) = self._setup_implied_gaussian(opt_linear, @@ -157,7 +156,6 @@ def _setup_implied_gaussian(self, return (cond_mean, cond_cov, cond_precision, - regress_opt, M1, M2, M3) From 6f38200bb4989256c0f6e1b944c14f4cec28ecba Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Thu, 4 Nov 2021 16:53:21 -0400 Subject: [PATCH 159/187] changed some names of variables in posterior: for consistency --- selectinf/randomized/posterior_inference.py | 50 +++++++++------------ 1 file changed, 22 insertions(+), 28 deletions(-) diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index 7fa5b377b..595dca61d 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -47,48 +47,42 @@ def __init__(self, cov_target, regress_target_score) = target_spec[:3] - linear_part = query.affine_con.linear_part - offset = query.affine_con.offset - - opt_linear = query.opt_linear - - observed_score = query.observed_score_state + query.observed_subgrad - - result, self.inverse_info, log_ref = query.selective_MLE(target_spec) - - ### Note for an informative prior we might want to change this... 
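The note above flags the default prior as a placeholder. A minimal sketch of an informative prior a caller could supply instead, assuming only the documented contract that the callable returns (log prior, gradient of log prior); `tau` and the factory name are hypothetical.

    import numpy as np

    def make_gaussian_prior(tau=2.):
        # independent N(0, tau**2) prior on each target coordinate
        def prior(target_parameter):
            grad_prior = -target_parameter / tau**2
            log_prior = -0.5 * np.sum(target_parameter**2) / tau**2
            return log_prior, grad_prior
        return prior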
- - cond_cov = query.cond_cov - self.cond_precision = np.linalg.inv(cond_cov) - self.cond_cov = cond_cov + self.observed_target = observed_target self.cov_target = cov_target self.prec_target = np.linalg.inv(cov_target) + self.regress_target_score = regress_target_score - self.ntarget = self.cov_target.shape[0] - self.nopt = self.cond_precision.shape[0] + self.cond_mean = query.cond_mean + self.cond_cov = query.cond_cov + self.prec_opt = np.linalg.inv(self.cond_cov) + self.opt_linear = query.opt_linear - self.observed_target = observed_target - self.regress_target_score = regress_target_score - self.opt_linear = opt_linear - self.observed_score = observed_score + self.linear_part = query.affine_con.linear_part + self.offset = query.affine_con.offset self.M1 = query.M1 self.M2 = query.M2 self.M3 = query.M3 - self.feasible_point = query.observed_opt_state + self.observed_soln = query.observed_opt_state + + self.observed_score = query.observed_score_state + query.observed_subgrad + + result, self.inverse_info, log_ref = query.selective_MLE(target_spec) + + + self.ntarget = self.cov_target.shape[0] + self.nopt = self.prec_opt.shape[0] - self.cond_mean = query.cond_mean - self.linear_part = linear_part - self.offset = offset self.initial_estimate = np.asarray(result['MLE']) self.dispersion = dispersion self.log_ref = log_ref - self._set_marginal_parameters() - + ### Note for an informative prior we might want to change this... self.prior = prior + self._set_marginal_parameters() + def log_posterior(self, target_parameter, sigma=1): @@ -115,7 +109,7 @@ def log_posterior(self, val, soln, hess = solver(conjugate_marginal, prec_marginal, - self.feasible_point, + self.observed_soln, self.linear_part, self.offset, **self.solve_args) @@ -161,7 +155,7 @@ def _set_marginal_parameters(self): ###set parameters for the marginal distribution of optimization variables _Q = np.linalg.inv(prec_target_nosel + T3) - self.prec_marginal = self.cond_precision - T5.T.dot(_Q).dot(T5) + self.prec_marginal = self.prec_opt - T5.T.dot(_Q).dot(T5) self.linear_coef = self.cond_cov.dot(T5.T) self.offset_coef = self.cond_mean - self.linear_coef.dot(self.observed_target) From 69e7dd1db1fb3c39d010fce29844c3704803f96b Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Thu, 4 Nov 2021 17:00:18 -0400 Subject: [PATCH 160/187] changed some names of variables: for consistency --- selectinf/randomized/exact_reference.py | 39 ++--- selectinf/randomized/posterior_inference.py | 6 +- selectinf/randomized/query.py | 156 ++------------------ selectinf/randomized/selective_MLE.py | 6 +- 4 files changed, 37 insertions(+), 170 deletions(-) diff --git a/selectinf/randomized/exact_reference.py b/selectinf/randomized/exact_reference.py index ce799a47c..00702a6b7 100644 --- a/selectinf/randomized/exact_reference.py +++ b/selectinf/randomized/exact_reference.py @@ -32,42 +32,33 @@ def __init__(self, Arguments passed to solver. 
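For orientation: the `target_spec` objects consumed throughout this series bundle exactly the quantities these docstrings describe, and the code only relies on the access pattern below (descriptive note, not new API).

    # how target_spec is unpacked in these classes
    observed_target, cov_target, regress_target_score = target_spec[:3]
    alternatives = target_spec.alternatives    # e.g. ['twosided'] * ntarget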
""" + self.solve_args = solve_args + (observed_target, cov_target, regress_target_score) = target_spec[:3] - - self.solve_args = solve_args - - linear_part = query.affine_con.linear_part - offset = query.affine_con.offset - - opt_linear = query.opt_linear - - observed_score = query.observed_score_state + query.observed_subgrad - - result, inverse_info, log_ref = query.selective_MLE(target_spec) - cond_cov = query.cond_cov - self.cond_precision = np.linalg.inv(cond_cov) - self.cond_cov = cond_cov + self.observed_target = observed_target self.cov_target = cov_target self.prec_target = np.linalg.inv(cov_target) - - self.observed_target = observed_target self.regress_target_score = regress_target_score - self.opt_linear = opt_linear - self.observed_score = observed_score + + self.cond_mean = query.cond_mean + self.cond_cov = query.cond_cov + self.cond_precision = np.linalg.inv(self.cond_cov) + self.opt_linear = query.opt_linear + + self.linear_part = query.affine_con.linear_part + self.offset = query.affine_con.offset self.M1 = query.M1 self.M2 = query.M2 self.M3 = query.M3 - self.feasible_point = query.observed_opt_state + self.observed_soln = query.observed_opt_state - self.cond_mean = query.cond_mean - self.linear_part = linear_part - self.offset = offset + self.observed_score = query.observed_score_state + query.observed_subgrad - self.feasible_point = query.observed_opt_state + result, inverse_info, log_ref = query.selective_MLE(target_spec) self.ntarget = ntarget = cov_target.shape[0] _scale = 4 * np.sqrt(np.diag(inverse_info)) @@ -156,7 +147,7 @@ def log_reference(self, R = np.identity(num_opt) - _A.dot(eta.T) A = self.linear_part.dot(_A).reshape((-1,)) - b = -self.linear_part.dot(R).dot(self.feasible_point) + b = -self.linear_part.dot(R).dot(self.observed_soln) trunc_ = np.true_divide((self.offset + b), A) diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index 595dca61d..63b478dec 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -54,7 +54,7 @@ def __init__(self, self.cond_mean = query.cond_mean self.cond_cov = query.cond_cov - self.prec_opt = np.linalg.inv(self.cond_cov) + self.cond_precision = np.linalg.inv(self.cond_cov) self.opt_linear = query.opt_linear self.linear_part = query.affine_con.linear_part @@ -71,7 +71,7 @@ def __init__(self, self.ntarget = self.cov_target.shape[0] - self.nopt = self.prec_opt.shape[0] + self.nopt = self.cond_precision.shape[0] self.initial_estimate = np.asarray(result['MLE']) @@ -155,7 +155,7 @@ def _set_marginal_parameters(self): ###set parameters for the marginal distribution of optimization variables _Q = np.linalg.inv(prec_target_nosel + T3) - self.prec_marginal = self.prec_opt - T5.T.dot(_Q).dot(T5) + self.prec_marginal = self.cond_precision - T5.T.dot(_Q).dot(T5) self.linear_coef = self.cond_cov.dot(T5.T) self.offset_coef = self.cond_mean - self.linear_coef.dot(self.observed_target) diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index 6d8d0749a..adf218825 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -165,32 +165,28 @@ def selective_MLE(self, level=0.90, solve_args={'tol': 1.e-12}): + """ + Parameters + ---------- + observed_target : ndarray + Observed estimate of target. + cov_target : ndarray + Estimated covaraince of target. + regress_target_score : ndarray + Estimated covariance of target and score of randomized query. 
+ alternatives : [str], optional + Sequence of strings describing the alternatives, + should be values of ['twosided', 'less', 'greater'] + solve_args : dict, optional + Arguments passed to solver. + """ + G = mle_inference(self, target_spec, solve_args=solve_args) return G.mle_inference(level=level) - # def selective_MLE(self, - # target_spec, - # level=0.90, - # solve_args={'tol': 1.e-12}): - # - # return selective_MLE(target_spec, - # self.observed_opt_state, - # self.affine_con.mean, - # self.affine_con.covariance, - # self.affine_con.linear_part, - # self.affine_con.offset, - # self.opt_linear, - # self.M1, - # self.M2, - # self.M3, - # self.observed_score_state + self.observed_subgrad, - # solve_args=solve_args, - # level=level, - # useC=False) - def posterior(self, target_spec, dispersion=1, @@ -284,124 +280,4 @@ def exact_grid_inference(self, return G.summary(alternatives=target_spec.alternatives) -# def selective_MLE(target_spec, -# observed_soln, # initial (observed) value of -# # optimization variables -- used as a -# # feasible point. precise value used -# # only for independent estimator -# cond_mean, -# cond_cov, -# linear_part, -# offset, -# opt_linear, -# M1, -# M2, -# M3, -# observed_score, -# solve_args={'tol': 1.e-12}, -# level=0.9, -# useC=False): -# -# """ -# Selective MLE based on approximation of -# CGF. -# Parameters -# ---------- -# observed_target : ndarray -# Observed estimate of target. -# cov_target : ndarray -# Estimated covaraince of target. -# regress_target_score : ndarray -# Estimated regression coefficient of target on score. -# observed_soln : ndarray -# Feasible point for optimization problem. -# cond_mean : ndarray -# Conditional mean of optimization variables given target. -# cond_cov : ndarray -# Conditional covariance of optimization variables given target. -# linear_part : ndarray -# Linear part of affine constraints: $\{o:Ao \leq b\}$ -# offset : ndarray -# Offset part of affine constraints: $\{o:Ao \leq b\}$ -# solve_args : dict, optional -# Arguments passed to solver. -# level : float, optional -# Confidence level. -# useC : bool, optional -# Use python or C solver. 
-# """ -# -# (observed_target, -# cov_target, -# regress_target_score) = target_spec[:3] -# -# if np.asarray(observed_target).shape in [(), (0,)]: -# raise ValueError('no target specified') -# -# observed_target = np.atleast_1d(observed_target) -# prec_target = np.linalg.inv(cov_target) -# -# prec_opt = np.linalg.inv(cond_cov) -# -# # this is specific to target -# -# T1 = regress_target_score.T.dot(prec_target) -# T2 = T1.T.dot(M2.dot(T1)) -# T3 = T1.T.dot(M3.dot(T1)) -# T4 = M1.dot(opt_linear).dot(cond_cov).dot(opt_linear.T.dot(M1.T.dot(T1))) -# T5 = T1.T.dot(M1.dot(opt_linear)) -# -# prec_target_nosel = prec_target + T2 - T3 -# -# _P = -(T1.T.dot(M1.dot(observed_score)) + T2.dot(observed_target)) ##flipped sign of second term here -# -# bias_target = cov_target.dot(T1.T.dot(-T4.dot(observed_target) + M1.dot(opt_linear.dot(cond_mean))) - _P) -# -# conjugate_arg = prec_opt.dot(cond_mean) -# -# if useC: -# solver = solve_barrier_affine_C -# else: -# solver = solve_barrier_affine_py -# -# val, soln, hess = solver(conjugate_arg, -# prec_opt, -# observed_soln, -# linear_part, -# offset, -# **solve_args) -# -# final_estimator = cov_target.dot(prec_target_nosel).dot(observed_target) \ -# + regress_target_score.dot(M1.dot(opt_linear)).dot(cond_mean - soln) - bias_target -# -# observed_info_natural = prec_target_nosel + T3 - T5.dot(hess.dot(T5.T)) -# -# unbiased_estimator = cov_target.dot(prec_target_nosel).dot(observed_target) - bias_target -# -# observed_info_mean = cov_target.dot(observed_info_natural.dot(cov_target)) -# -# Z_scores = final_estimator / np.sqrt(np.diag(observed_info_mean)) -# -# pvalues = ndist.cdf(Z_scores) -# -# pvalues = 2 * np.minimum(pvalues, 1 - pvalues) -# -# alpha = 1. - level -# -# quantile = ndist.ppf(1 - alpha / 2.) -# -# intervals = np.vstack([final_estimator - quantile * np.sqrt(np.diag(observed_info_mean)), -# final_estimator + quantile * np.sqrt(np.diag(observed_info_mean))]).T -# -# log_ref = val + conjugate_arg.T.dot(cond_cov).dot(conjugate_arg) / 2. 
-# -# result = pd.DataFrame({'MLE': final_estimator, -# 'SE': np.sqrt(np.diag(observed_info_mean)), -# 'Zvalue': Z_scores, -# 'pvalue': pvalues, -# 'lower_confidence': intervals[:, 0], -# 'upper_confidence': intervals[:, 1], -# 'unbiased': unbiased_estimator}) -# -# return result, observed_info_mean, log_ref diff --git a/selectinf/randomized/selective_MLE.py b/selectinf/randomized/selective_MLE.py index 96ceae97b..c4ceab085 100644 --- a/selectinf/randomized/selective_MLE.py +++ b/selectinf/randomized/selective_MLE.py @@ -25,7 +25,7 @@ def __init__(self, self.cond_mean = query.cond_mean self.cond_cov = query.cond_cov - self.prec_opt = np.linalg.inv(self.cond_cov) + self.cond_precision = np.linalg.inv(self.cond_cov) self.opt_linear = query.opt_linear self.linear_part = query.sampler.affine_con.linear_part @@ -42,14 +42,14 @@ def __init__(self, def mle_inference(self, useC= False, level=0.90): - conjugate_arg = self.prec_opt.dot(self.cond_mean) + conjugate_arg = self.cond_precision.dot(self.cond_mean) if useC: solver = solve_barrier_affine_C else: solver = solve_barrier_affine_py val, soln, hess = solver(conjugate_arg, - self.prec_opt, + self.cond_precision, self.observed_soln, self.linear_part, self.offset, From cd730d6d3f7d23c3d89735edad993e000a980a67 Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Thu, 4 Nov 2021 21:26:33 -0400 Subject: [PATCH 161/187] some more name changes for variables --- selectinf/randomized/approx_reference.py | 41 +++++++++--------------- selectinf/randomized/exact_reference.py | 2 -- 2 files changed, 16 insertions(+), 27 deletions(-) diff --git a/selectinf/randomized/approx_reference.py b/selectinf/randomized/approx_reference.py index 38483f4c6..2e9bac78b 100644 --- a/selectinf/randomized/approx_reference.py +++ b/selectinf/randomized/approx_reference.py @@ -33,42 +33,33 @@ def __init__(self, Arguments passed to solver. 
""" + self.solve_args = solve_args + (observed_target, cov_target, regress_target_score) = target_spec[:3] - - self.solve_args = solve_args - - linear_part = query.affine_con.linear_part - offset = query.affine_con.offset - - opt_linear = query.opt_linear - observed_score = query.observed_score_state + query.observed_subgrad - - result, inverse_info, log_ref = query.selective_MLE(target_spec) - - cond_cov = query.cond_cov - self.cond_precision = np.linalg.inv(cond_cov) - self.cond_cov = cond_cov + self.observed_target = observed_target self.cov_target = cov_target self.prec_target = np.linalg.inv(cov_target) - - self.observed_target = observed_target self.regress_target_score = regress_target_score - self.opt_linear = opt_linear - self.observed_score = observed_score + + self.cond_mean = query.cond_mean + self.cond_cov = query.cond_cov + self.cond_precision = np.linalg.inv(self.cond_cov) + self.opt_linear = query.opt_linear + + self.linear_part = query.affine_con.linear_part + self.offset = query.affine_con.offset self.M1 = query.M1 self.M2 = query.M2 self.M3 = query.M3 - self.feasible_point = query.observed_opt_state + self.observed_soln = query.observed_opt_state - self.cond_mean = query.cond_mean - self.linear_part = linear_part - self.offset = offset + self.observed_score = query.observed_score_state + query.observed_subgrad - self.feasible_point = query.observed_opt_state + result, inverse_info, log_ref = query.selective_MLE(target_spec) self.ntarget = ntarget = cov_target.shape[0] _scale = 4 * np.sqrt(np.diag(inverse_info)) @@ -88,7 +79,7 @@ def __init__(self, observed_target[j] + 1.5 * _scale[j], num=ngrid) - self.opt_linear = query.opt_linear + self.useIP = useIP self.inverse_info = inverse_info @@ -158,7 +149,7 @@ def _approx_log_reference(self, val, _, _ = solver(conjugate_arg, self.cond_precision, - self.feasible_point, + self.observed_soln, self.linear_part, self.offset, **self.solve_args) diff --git a/selectinf/randomized/exact_reference.py b/selectinf/randomized/exact_reference.py index 00702a6b7..291b640fd 100644 --- a/selectinf/randomized/exact_reference.py +++ b/selectinf/randomized/exact_reference.py @@ -26,8 +26,6 @@ def __init__(self, Estimated covaraince of target. cov_target_score : ndarray Estimated covariance of target and score of randomized query. - level : float, optional - Confidence level. solve_args : dict, optional Arguments passed to solver. 
""" From eb0d16e05e4d75290c21d8fbe91ca58f8c4a9454 Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Fri, 5 Nov 2021 21:22:19 -0400 Subject: [PATCH 162/187] removed regress_opt from return list; some more consistency fixes --- selectinf/randomized/lasso.py | 11 +++++------ selectinf/randomized/query.py | 2 +- selectinf/randomized/selective_MLE.py | 8 ++++---- .../randomized/tests/test_selective_MLE_high.py | 13 +++++++------ 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/selectinf/randomized/lasso.py b/selectinf/randomized/lasso.py index 1cca12f32..26fecf91e 100644 --- a/selectinf/randomized/lasso.py +++ b/selectinf/randomized/lasso.py @@ -693,7 +693,7 @@ def __init__(self, proportion_select, ridge_term=0, perturb=None, - estimate_dispersion=False): + estimate_dispersion=True): (self.loglike, self.feature_weights, @@ -731,7 +731,7 @@ def fit(self, 'func') / (n - df_fit)) - self.dispersion = dispersion + self.dispersion_ = dispersion # run setup again after # estimating dispersion @@ -741,13 +741,13 @@ def fit(self, def setup_inference(self, - dispersion=None): + dispersion): if self.df_fit > 0: if dispersion is None: self._setup_sampler(*self._setup_sampler_data, - dispersion=self.dispersion) + dispersion=self.dispersion_) else: self._setup_sampler(*self._setup_sampler_data, @@ -805,7 +805,6 @@ def _setup_implied_gaussian(self, return (cond_mean, cond_cov, cond_precision, - regress_opt, M1, M2, M3) @@ -1086,4 +1085,4 @@ def poisson(X, np.asarray(feature_weights), proportion) - + diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index adf218825..38ff6e957 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -185,7 +185,7 @@ def selective_MLE(self, target_spec, solve_args=solve_args) - return G.mle_inference(level=level) + return G.solve_estimating_eqn(level=level) def posterior(self, target_spec, diff --git a/selectinf/randomized/selective_MLE.py b/selectinf/randomized/selective_MLE.py index c4ceab085..ed62d60ca 100644 --- a/selectinf/randomized/selective_MLE.py +++ b/selectinf/randomized/selective_MLE.py @@ -28,8 +28,8 @@ def __init__(self, self.cond_precision = np.linalg.inv(self.cond_cov) self.opt_linear = query.opt_linear - self.linear_part = query.sampler.affine_con.linear_part - self.offset = query.sampler.affine_con.offset + self.linear_part = query.affine_con.linear_part + self.offset = query.affine_con.offset self.M1 = query.M1 self.M2 = query.M2 @@ -40,7 +40,7 @@ def __init__(self, self._setup_estimating_eqn() - def mle_inference(self, useC= False, level=0.90): + def solve_estimating_eqn(self, useC= False, level=0.90): conjugate_arg = self.cond_precision.dot(self.cond_mean) if useC: @@ -59,7 +59,7 @@ def mle_inference(self, useC= False, level=0.90): + self.regress_target_score.dot(self.M1.dot(self.opt_linear)).dot(self.cond_mean - soln) \ - self.bias_target - observed_info_natural = self.prec_target_nosel + self.T3 - self.T5.dot(self.hess.dot(self.T5.T)) + observed_info_natural = self.prec_target_nosel + self.T3 - self.T5.dot(hess.dot(self.T5.T)) unbiased_estimator = self.cov_target.dot(self.prec_target_nosel).dot(self.observed_target) - self.bias_target diff --git a/selectinf/randomized/tests/test_selective_MLE_high.py b/selectinf/randomized/tests/test_selective_MLE_high.py index 444748b8d..aeb2571fb 100644 --- a/selectinf/randomized/tests/test_selective_MLE_high.py +++ b/selectinf/randomized/tests/test_selective_MLE_high.py @@ -347,12 +347,12 @@ def test_logistic_split(n=2000, if nonzero.sum() > 0: - 
conv.setup_inference(dispersion=1) - target_spec = selected_targets(conv.loglike, conv.observed_soln, dispersion=1) + conv.setup_inference(dispersion=None) + result = conv.selective_MLE(target_spec)[0] estimate = result['MLE'] pval = result['pvalue'] @@ -360,7 +360,7 @@ def test_logistic_split(n=2000, 'upper_confidence']]) return pval[beta[nonzero] == 0], pval[beta[nonzero] != 0], intervals - + def test_poisson(n=2000, p=200, signal_fac=10., @@ -398,12 +398,13 @@ def test_poisson(n=2000, print("dimensions", n, p, nonzero.sum()) if nonzero.sum() > 0: - conv.setup_inference(dispersion=1) target_spec = selected_targets(conv.loglike, conv.observed_soln, dispersion=1) + conv.setup_inference(dispersion=1) + result = conv.selective_MLE(target_spec)[0] estimate = result['MLE'] pval = result['pvalue'] @@ -450,12 +451,12 @@ def test_poisson_split(n=2000, if nonzero.sum() > 0: - conv.setup_inference(dispersion=1) - target_spec = selected_targets(conv.loglike, conv.observed_soln, dispersion=1) + conv.setup_inference(dispersion=1) + result = conv.selective_MLE(target_spec)[0] estimate = result['MLE'] pval = result['pvalue'] From 4418bf7c8aff68fa1f82bc76b940fb86ce9eb328 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Tue, 16 Nov 2021 22:50:19 -0800 Subject: [PATCH 163/187] a little reorg -- one method for inference --- selectinf/randomized/approx_reference.py | 7 +- selectinf/randomized/exact_reference.py | 6 +- selectinf/randomized/posterior_inference.py | 4 +- selectinf/randomized/query.py | 184 +++++++++++------- .../tests/test_selective_MLE_high.py | 22 +-- 5 files changed, 136 insertions(+), 87 deletions(-) diff --git a/selectinf/randomized/approx_reference.py b/selectinf/randomized/approx_reference.py index 2e9bac78b..0491700e1 100644 --- a/selectinf/randomized/approx_reference.py +++ b/selectinf/randomized/approx_reference.py @@ -59,7 +59,8 @@ def __init__(self, self.observed_score = query.observed_score_state + query.observed_subgrad - result, inverse_info, log_ref = query.selective_MLE(target_spec) + result, inverse_info, log_ref = query._selective_MLE(target_spec, + solve_args=solve_args) self.ntarget = ntarget = cov_target.shape[0] _scale = 4 * np.sqrt(np.diag(inverse_info)) @@ -103,12 +104,12 @@ def summary(self, if parameter is not None: pivots = self.approx_pivots(parameter, - alternatives=alternatives) + alternatives=alternatives)[0] else: pivots = None pvalues = self._approx_pivots(np.zeros_like(self.observed_target), - alternatives=alternatives) + alternatives=alternatives)[0] lower, upper = self._approx_intervals(level=level) result = pd.DataFrame({'target': self.observed_target, diff --git a/selectinf/randomized/exact_reference.py b/selectinf/randomized/exact_reference.py index 291b640fd..0818fba7c 100644 --- a/selectinf/randomized/exact_reference.py +++ b/selectinf/randomized/exact_reference.py @@ -26,11 +26,8 @@ def __init__(self, Estimated covaraince of target. cov_target_score : ndarray Estimated covariance of target and score of randomized query. - solve_args : dict, optional - Arguments passed to solver. 
""" - self.solve_args = solve_args (observed_target, cov_target, @@ -56,7 +53,8 @@ def __init__(self, self.observed_score = query.observed_score_state + query.observed_subgrad - result, inverse_info, log_ref = query.selective_MLE(target_spec) + result, inverse_info, log_ref = query._selective_MLE(target_spec, + solve_args=solve_args) self.ntarget = ntarget = cov_target.shape[0] _scale = 4 * np.sqrt(np.diag(inverse_info)) diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index 63b478dec..f4ab1698b 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -67,8 +67,8 @@ def __init__(self, self.observed_score = query.observed_score_state + query.observed_subgrad - result, self.inverse_info, log_ref = query.selective_MLE(target_spec) - + result, self.inverse_info, log_ref = query._selective_MLE(target_spec, + solve_args=solve_args) self.ntarget = self.cov_target.shape[0] self.nopt = self.cond_precision.shape[0] diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index 38ff6e957..140f68f88 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -1,7 +1,7 @@ -import numpy as np +import numpy as np, pandas as pd from ..constraints.affine import constraints -from .posterior_inference import posterior +from .posterior_inference import (posterior, langevin_sampler) from .approx_reference import approximate_grid_inference from .exact_reference import exact_grid_inference from .selective_MLE import mle_inference @@ -160,55 +160,72 @@ def _setup_implied_gaussian(self, M2, M3) - def selective_MLE(self, - target_spec, - level=0.90, - solve_args={'tol': 1.e-12}): + def inference(self, + target_spec, + method, + level=0.90, + method_args={}): """ Parameters - ---------- - observed_target : ndarray - Observed estimate of target. - cov_target : ndarray - Estimated covaraince of target. - regress_target_score : ndarray - Estimated covariance of target and score of randomized query. - alternatives : [str], optional - Sequence of strings describing the alternatives, - should be values of ['twosided', 'less', 'greater'] - solve_args : dict, optional - Arguments passed to solver. + ---------- + target_spec : TargetSpec + Information needed to specify the target. + method : str + One of ['selective_MLE', 'approx', 'exact', 'posterior'] + level : float + Confidence level or posterior quantiles. + method_args : dict + Dict of arguments to be optionally passed to the methods. + + Returns + ------- + + summary : pd.DataFrame + Statistical summary for specified targets. """ - G = mle_inference(self, - target_spec, - solve_args=solve_args) - - return G.solve_estimating_eqn(level=level) - + if method == 'selective_MLE': + return self._selective_MLE(target_spec, + level=level, + **method_args)[0] + elif method == 'exact': + return self._exact_grid_inference(target_spec, + level=level) # has no additional args + elif method == 'approx': + return self._approx_grid_inference(target_spec, + level=level, + **method_args) + elif method == 'posterior': + return self.posterior(target_spec, + **method_args)[1] + + def posterior(self, target_spec, + level=0.90, dispersion=1, prior=None, - solve_args={'tol': 1.e-12}): + solve_args={'tol': 1.e-12}, + nsample=2000, + nburnin=500): """ + Parameters ---------- - observed_target : ndarray - Observed estimate of target. - cov_target : ndarray - Estimated covaraince of target. 
- regress_target_score : ndarray - Estimated covariance of target and score of randomized query. + target_spec : TargetSpec + Information needed to specify the target. + level : float + Level for credible interval. + dispersion : float, optional + Dispersion parameter for log-likelihood. prior : callable A callable object that takes a single argument `parameter` of the same shape as `observed_target` and returns (value of log prior, gradient of log prior) - dispersion : float, optional - Dispersion parameter for log-likelihood. solve_args : dict, optional Arguments passed to solver. + """ if prior is None: @@ -219,31 +236,67 @@ def prior(target_parameter): log_prior = -0.5 * np.sum(target_parameter ** 2 * Di) return log_prior, grad_prior - return posterior(self, - target_spec, - dispersion, - prior, - solve_args=solve_args) + posterior_repr = posterior(self, + target_spec, + dispersion, + prior, + solve_args=solve_args) + + samples = langevin_sampler(posterior_repr, + nsample=nsample, + nburnin=nburnin) + + delta = 0.5 * (1 - level) * 100 + lower = np.percentile(samples, delta, axis=0) + upper = np.percentile(samples, 100 - delta, axis=0) + mean = np.mean(samples, axis=0) + + return samples, pd.DataFrame({'estimate':mean, + 'lower_credible':lower, + 'upper_credible':upper}) + + # private methods + + def _selective_MLE(self, + target_spec, + level=0.90, + solve_args={'tol': 1.e-12}): + + """ + Parameters + ---------- + target_spec : TargetSpec + Information needed to specify the target. + level : float + Confidence level or posterior quantiles. + solve_args : dict + Dict of arguments to be optionally passed to solver. + """ + + G = mle_inference(self, + target_spec, + solve_args=solve_args) + + return G.solve_estimating_eqn(level=level) + - def approximate_grid_inference(self, - target_spec, - useIP=True, - solve_args={'tol': 1.e-12}): + def _approximate_grid_inference(self, + target_spec, + level=0.90, + solve_args={'tol': 1.e-12}, + useIP=True): """ Parameters ---------- - observed_target : ndarray - Observed estimate of target. - cov_target : ndarray - Estimated covaraince of target. - regress_target_score : ndarray - Estimated covariance of target and score of randomized query. - alternatives : [str], optional - Sequence of strings describing the alternatives, - should be values of ['twosided', 'less', 'greater'] + target_spec : TargetSpec + Information needed to specify the target. + level : float + Confidence level or posterior quantiles. solve_args : dict, optional Arguments passed to solver. + useIP : bool + Use spline extrapolation. """ G = approximate_grid_inference(self, @@ -251,33 +304,30 @@ def approximate_grid_inference(self, solve_args=solve_args, useIP=useIP) - return G.summary(alternatives=target_spec.alternatives) + return G.summary(alternatives=target_spec.alternatives, + level=level) - def exact_grid_inference(self, - target_spec, - solve_args={'tol': 1.e-12}): + def _exact_grid_inference(self, + target_spec, + level=0.90, + solve_args={'tol': 1.e-12}): """ Parameters ---------- - observed_target : ndarray - Observed estimate of target. - cov_target : ndarray - Estimated covaraince of target. - regress_target_score : ndarray - Estimated covariance of target and score of randomized query. - alternatives : [str], optional - Sequence of strings describing the alternatives, - should be values of ['twosided', 'less', 'greater'] + target_spec : TargetSpec + Information needed to specify the target. + level : float + Confidence level or posterior quantiles. 
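The credible limits returned by `posterior` above come straight from percentiles of the sampler output; a self-contained sketch with stand-in draws in place of `langevin_sampler`.

    import numpy as np

    rng = np.random.default_rng(0)
    samples = rng.standard_normal((2000, 3))  # stand-in for langevin_sampler output
    level = 0.90
    delta = 0.5 * (1 - level) * 100           # 5.0
    lower = np.percentile(samples, delta, axis=0)
    upper = np.percentile(samples, 100 - delta, axis=0)
    estimate = samples.mean(axis=0)           # reported as 'estimate'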
solve_args : dict, optional Arguments passed to solver. """ G = exact_grid_inference(self, - target_spec, - solve_args=solve_args) + target_spec) - return G.summary(alternatives=target_spec.alternatives) + return G.summary(alternatives=target_spec.alternatives, + level=level) diff --git a/selectinf/randomized/tests/test_selective_MLE_high.py b/selectinf/randomized/tests/test_selective_MLE_high.py index aeb2571fb..d7aca0e34 100644 --- a/selectinf/randomized/tests/test_selective_MLE_high.py +++ b/selectinf/randomized/tests/test_selective_MLE_high.py @@ -78,7 +78,7 @@ def test_full_targets(n=200, conv.setup_inference(dispersion=dispersion) - result = conv.selective_MLE(target_spec)[0] + result = conv._selective_MLE(target_spec)[0] pval = result['pvalue'] estimate = result['MLE'] @@ -147,7 +147,7 @@ def test_selected_targets(n=2000, conv.observed_soln, dispersion=dispersion) - result = conv.selective_MLE(target_spec)[0] + result = conv._selective_MLE(target_spec)[0] pval = result['pvalue'] intervals = np.asarray(result[['lower_confidence', 'upper_confidence']]) @@ -245,7 +245,7 @@ def test_selected_targets_disperse(n=500, conv.observed_soln, dispersion=dispersion) - result = conv.selective_MLE(target_spec)[0] + result = conv._selective_MLE(target_spec)[0] pval = result['pvalue'] intervals = np.asarray(result[['lower_confidence', 'upper_confidence']]) @@ -301,7 +301,7 @@ def test_logistic(n=2000, conv.observed_soln, dispersion=1) - result = conv.selective_MLE(target_spec)[0] + result = conv._selective_MLE(target_spec)[0] estimate = result['MLE'] pval = result['pvalue'] intervals = np.asarray(result[['lower_confidence', @@ -353,7 +353,7 @@ def test_logistic_split(n=2000, conv.setup_inference(dispersion=None) - result = conv.selective_MLE(target_spec)[0] + result = conv._selective_MLE(target_spec)[0] estimate = result['MLE'] pval = result['pvalue'] intervals = np.asarray(result[['lower_confidence', @@ -405,7 +405,7 @@ def test_poisson(n=2000, conv.setup_inference(dispersion=1) - result = conv.selective_MLE(target_spec)[0] + result = conv._selective_MLE(target_spec)[0] estimate = result['MLE'] pval = result['pvalue'] intervals = np.asarray(result[['lower_confidence', @@ -457,7 +457,7 @@ def test_poisson_split(n=2000, conv.setup_inference(dispersion=1) - result = conv.selective_MLE(target_spec)[0] + result = conv._selective_MLE(target_spec)[0] estimate = result['MLE'] pval = result['pvalue'] intervals = np.asarray(result[['lower_confidence', @@ -512,7 +512,7 @@ def test_cox(n=2000, conv.observed_soln, dispersion=1) - result = conv.selective_MLE(target_spec)[0] + result = conv._selective_MLE(target_spec)[0] estimate = result['MLE'] pval = result['pvalue'] intervals = np.asarray(result[['lower_confidence', @@ -567,7 +567,7 @@ def test_cox_split(n=2000, conv.observed_soln, dispersion=1) - result = conv.selective_MLE(target_spec)[0] + result = conv._selective_MLE(target_spec)[0] estimate = result['MLE'] pval = result['pvalue'] intervals = np.asarray(result[['lower_confidence', @@ -635,7 +635,7 @@ def test_scale_invariant_split(n=200, print('regress_target_score', target_spec.regress_target_score[0,0]/scale**2) - result = conv.selective_MLE(target_spec)[0] + result = conv._selective_MLE(target_spec)[0] print(result['MLE'] / scale) results.append(result) @@ -713,7 +713,7 @@ def test_scale_invariant(n=200, print('cov_target', target_spec.cov_target[0,0]/scale**2) print('regress_target_score', target_spec.regress_target_score[0,0]/scale**2) - result = conv.selective_MLE(target_spec)[0] + result = 
conv._selective_MLE(target_spec)[0] print(result['MLE'] / scale) results.append(result) From ffd89dda7479f0f1b88e504a97488eb627abb05a Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 17 Nov 2021 00:05:54 -0800 Subject: [PATCH 164/187] BF: fixing method name --- selectinf/randomized/query.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index 140f68f88..f40c6e4cf 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -193,9 +193,9 @@ def inference(self, return self._exact_grid_inference(target_spec, level=level) # has no additional args elif method == 'approx': - return self._approx_grid_inference(target_spec, - level=level, - **method_args) + return self._approximate_grid_inference(target_spec, + level=level, + **method_args) elif method == 'posterior': return self.posterior(target_spec, **method_args)[1] From e7a1c4a7a22e19256c8e11d7f305ecf8dc4b4f0d Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 17 Nov 2021 08:27:12 -0800 Subject: [PATCH 165/187] more cleanup, added QuerySpec named tuple --- selectinf/randomized/approx_reference.py | 14 +- selectinf/randomized/exact_reference.py | 15 +- selectinf/randomized/posterior_inference.py | 14 +- selectinf/randomized/query.py | 143 +++++++++----------- selectinf/randomized/selective_MLE.py | 69 +++++----- 5 files changed, 130 insertions(+), 125 deletions(-) diff --git a/selectinf/randomized/approx_reference.py b/selectinf/randomized/approx_reference.py index 0491700e1..bb069396e 100644 --- a/selectinf/randomized/approx_reference.py +++ b/selectinf/randomized/approx_reference.py @@ -5,7 +5,7 @@ from ..distributions.discrete_family import discrete_family from ..algorithms.barrier_affine import solve_barrier_affine_py - +from .selective_MLE import mle_inference class approximate_grid_inference(object): @@ -49,8 +49,8 @@ def __init__(self, self.cond_precision = np.linalg.inv(self.cond_cov) self.opt_linear = query.opt_linear - self.linear_part = query.affine_con.linear_part - self.offset = query.affine_con.offset + self.linear_part = query.linear_part + self.offset = query.offset self.M1 = query.M1 self.M2 = query.M2 @@ -59,8 +59,11 @@ def __init__(self, self.observed_score = query.observed_score_state + query.observed_subgrad - result, inverse_info, log_ref = query._selective_MLE(target_spec, - solve_args=solve_args) + G = mle_inference(query, + target_spec, + solve_args=solve_args) + + _, inverse_info, log_ref = G.solve_estimating_eqn() self.ntarget = ntarget = cov_target.shape[0] _scale = 4 * np.sqrt(np.diag(inverse_info)) @@ -114,6 +117,7 @@ def summary(self, result = pd.DataFrame({'target': self.observed_target, 'pvalue': pvalues, + 'alternative': alternatives, 'lower_confidence': lower, 'upper_confidence': upper}) diff --git a/selectinf/randomized/exact_reference.py b/selectinf/randomized/exact_reference.py index 0818fba7c..5d9ba19a6 100644 --- a/selectinf/randomized/exact_reference.py +++ b/selectinf/randomized/exact_reference.py @@ -4,6 +4,7 @@ from scipy.stats import norm as ndist from ..distributions.discrete_family import discrete_family +from .selective_MLE import mle_inference class exact_grid_inference(object): @@ -43,8 +44,8 @@ def __init__(self, self.cond_precision = np.linalg.inv(self.cond_cov) self.opt_linear = query.opt_linear - self.linear_part = query.affine_con.linear_part - self.offset = query.affine_con.offset + self.linear_part = query.linear_part + self.offset = query.offset self.M1 = query.M1 
self.M2 = query.M2 @@ -53,8 +54,11 @@ def __init__(self, self.observed_score = query.observed_score_state + query.observed_subgrad - result, inverse_info, log_ref = query._selective_MLE(target_spec, - solve_args=solve_args) + G = mle_inference(query, + target_spec, + solve_args=solve_args) + + _, inverse_info, log_ref = G.solve_estimating_eqn() self.ntarget = ntarget = cov_target.shape[0] _scale = 4 * np.sqrt(np.diag(inverse_info)) @@ -88,7 +92,7 @@ def summary(self, if parameter is not None: pivots = self._pivots(parameter, - alternatives=alternatives) + alternatives=alternatives) else: pivots = None @@ -98,6 +102,7 @@ def summary(self, result = pd.DataFrame({'target': self.observed_target, 'pvalue': pvalues, + 'alternative': alternatives, 'lower_confidence': lower, 'upper_confidence': upper}) diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index f4ab1698b..0c33f3b96 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -7,7 +7,7 @@ from scipy.linalg import fractional_matrix_power from ..algorithms.barrier_affine import solve_barrier_affine_py - +from .selective_MLE import mle_inference class PosteriorAtt(typing.NamedTuple): @@ -57,8 +57,8 @@ def __init__(self, self.cond_precision = np.linalg.inv(self.cond_cov) self.opt_linear = query.opt_linear - self.linear_part = query.affine_con.linear_part - self.offset = query.affine_con.offset + self.linear_part = query.linear_part + self.offset = query.offset self.M1 = query.M1 self.M2 = query.M2 @@ -67,8 +67,11 @@ def __init__(self, self.observed_score = query.observed_score_state + query.observed_subgrad - result, self.inverse_info, log_ref = query._selective_MLE(target_spec, - solve_args=solve_args) + G = mle_inference(query, + target_spec, + solve_args=solve_args) + + result, self.inverse_info, self.log_ref = G.solve_estimating_eqn() self.ntarget = self.cov_target.shape[0] self.nopt = self.cond_precision.shape[0] @@ -76,7 +79,6 @@ def __init__(self, self.initial_estimate = np.asarray(result['MLE']) self.dispersion = dispersion - self.log_ref = log_ref ### Note for an informative prior we might want to change this... self.prior = prior diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index f40c6e4cf..594b08312 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -1,3 +1,4 @@ +from typing import NamedTuple import numpy as np, pandas as pd from ..constraints.affine import constraints @@ -6,6 +7,36 @@ from .exact_reference import exact_grid_inference from .selective_MLE import mle_inference +class QuerySpec(NamedTuple): + + # law of o|S,u + + cond_mean : np.ndarray + cond_cov : np.ndarray + + # how S enters into E[o|S,u] + + opt_linear : np.ndarray + + # constraints + + linear_part : np.ndarray + offset : np.ndarray + + # score / randomization relationship + + M1 : np.ndarray + M2 : np.ndarray + M3 : np.ndarray + + # observed values + + observed_opt_state : np.ndarray + observed_score_state : np.ndarray + observed_subgrad : np.ndarray + observed_soln : np.ndarray + observed_score : np.ndarray + class query(object): r""" This class is the base of randomized selective inference @@ -185,17 +216,43 @@ def inference(self, Statistical summary for specified targets. 
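A rough calling sequence for this single entry point; a hedged sketch only: the constructor and the `selected_targets` helper follow the tests elsewhere in this series, and `X`, `y`, `feature_weights` are placeholders.

    # conv = lasso.gaussian(X, y, feature_weights)   # hypothetical setup
    # conv.fit()
    # conv.setup_inference(dispersion=None)
    # target_spec = selected_targets(conv.loglike, conv.observed_soln, dispersion=1)
    # summary = conv.inference(target_spec, method='selective_MLE', level=0.90)
    # posterior_df = conv.inference(target_spec, method='posterior')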
""" + query_spec = QuerySpec(cond_mean=self.cond_mean, + cond_cov=self.cond_cov, + opt_linear=self.opt_linear, + linear_part=self.affine_con.linear_part, + offset=self.affine_con.offset, + M1=self.M1, + M2=self.M2, + M3=self.M3, + observed_opt_state=self.observed_opt_state, + observed_score_state=self.observed_score_state, + observed_subgrad=self.observed_subgrad, + observed_soln=self.observed_opt_state, + observed_score=self.observed_score_state + self.observed_subgrad) + if method == 'selective_MLE': - return self._selective_MLE(target_spec, - level=level, - **method_args)[0] + G = mle_inference(query_spec, + target_spec, + **method_args) + + return G.solve_estimating_eqn(alternatives=target_spec.alternatives, + level=level)[0] + elif method == 'exact': - return self._exact_grid_inference(target_spec, - level=level) # has no additional args + G = exact_grid_inference(query_spec, + target_spec) + + return G.summary(alternatives=target_spec.alternatives, + level=level) + elif method == 'approx': - return self._approximate_grid_inference(target_spec, - level=level, - **method_args) + G = approximate_grid_inference(query_spec, + target_spec, + **method_args) + + return G.summary(alternatives=target_spec.alternatives, + level=level) + elif method == 'posterior': return self.posterior(target_spec, **method_args)[1] @@ -255,79 +312,9 @@ def prior(target_parameter): 'lower_credible':lower, 'upper_credible':upper}) - # private methods - - def _selective_MLE(self, - target_spec, - level=0.90, - solve_args={'tol': 1.e-12}): - - """ - Parameters - ---------- - target_spec : TargetSpec - Information needed to specify the target. - level : float - Confidence level or posterior quantiles. - solve_args : dict - Dict of arguments to be optionally passed to solver. - """ - - G = mle_inference(self, - target_spec, - solve_args=solve_args) - return G.solve_estimating_eqn(level=level) - - - def _approximate_grid_inference(self, - target_spec, - level=0.90, - solve_args={'tol': 1.e-12}, - useIP=True): - - """ - Parameters - ---------- - target_spec : TargetSpec - Information needed to specify the target. - level : float - Confidence level or posterior quantiles. - solve_args : dict, optional - Arguments passed to solver. - useIP : bool - Use spline extrapolation. - """ - - G = approximate_grid_inference(self, - target_spec, - solve_args=solve_args, - useIP=useIP) - - return G.summary(alternatives=target_spec.alternatives, - level=level) - - def _exact_grid_inference(self, - target_spec, - level=0.90, - solve_args={'tol': 1.e-12}): - - """ - Parameters - ---------- - target_spec : TargetSpec - Information needed to specify the target. - level : float - Confidence level or posterior quantiles. - solve_args : dict, optional - Arguments passed to solver. 
- """ - G = exact_grid_inference(self, - target_spec) - return G.summary(alternatives=target_spec.alternatives, - level=level) diff --git a/selectinf/randomized/selective_MLE.py b/selectinf/randomized/selective_MLE.py index ed62d60ca..757d55efe 100644 --- a/selectinf/randomized/selective_MLE.py +++ b/selectinf/randomized/selective_MLE.py @@ -8,7 +8,7 @@ class mle_inference(object): def __init__(self, - query, + query_spec, target_spec, solve_args={'tol': 1.e-12}): @@ -23,40 +23,33 @@ def __init__(self, self.prec_target = np.linalg.inv(cov_target) self.regress_target_score = regress_target_score - self.cond_mean = query.cond_mean - self.cond_cov = query.cond_cov - self.cond_precision = np.linalg.inv(self.cond_cov) - self.opt_linear = query.opt_linear - - self.linear_part = query.affine_con.linear_part - self.offset = query.affine_con.offset - - self.M1 = query.M1 - self.M2 = query.M2 - self.M3 = query.M3 - self.observed_soln = query.observed_opt_state - - self.observed_score = query.observed_score_state + query.observed_subgrad + self.query_spec = query_spec self._setup_estimating_eqn() - def solve_estimating_eqn(self, useC= False, level=0.90): + def solve_estimating_eqn(self, + alternatives=None, + useC=False, + level=0.90): + + Q = self.query_spec + cond_precision = np.linalg.inv(Q.cond_cov) + conjugate_arg = cond_precision.dot(Q.cond_mean) - conjugate_arg = self.cond_precision.dot(self.cond_mean) if useC: solver = solve_barrier_affine_C else: solver = solve_barrier_affine_py val, soln, hess = solver(conjugate_arg, - self.cond_precision, - self.observed_soln, - self.linear_part, - self.offset, + cond_precision, + Q.observed_soln, + Q.linear_part, + Q.offset, **self.solve_args) final_estimator = self.cov_target.dot(self.prec_target_nosel).dot(self.observed_target) \ - + self.regress_target_score.dot(self.M1.dot(self.opt_linear)).dot(self.cond_mean - soln) \ + + self.regress_target_score.dot(Q.M1.dot(Q.opt_linear)).dot(Q.cond_mean - soln) \ - self.bias_target observed_info_natural = self.prec_target_nosel + self.T3 - self.T5.dot(hess.dot(self.T5.T)) @@ -67,9 +60,21 @@ def solve_estimating_eqn(self, useC= False, level=0.90): Z_scores = final_estimator / np.sqrt(np.diag(observed_info_mean)) - pvalues = ndist.cdf(Z_scores) + cdf_vals = ndist.cdf(Z_scores) + pvalues = [] + + if alternatives is None: + alternatives = ['twosided'] * len(cdf_vals) - pvalues = 2 * np.minimum(pvalues, 1 - pvalues) + for m, _cdf in enumerate(cdf_vals): + if alternatives[m] == 'twosided': + pvalues.append(2 * min(_cdf, 1 - _cdf)) + elif alternatives[m] == 'greater': + pvalues.append(1 - _cdf) + elif alternatives[m] == 'less': + pvalues.append(_cdf) + else: + raise ValueError('alternative should be in ["twosided", "less", "greater"]') alpha = 1. - level @@ -78,12 +83,13 @@ def solve_estimating_eqn(self, useC= False, level=0.90): intervals = np.vstack([final_estimator - quantile * np.sqrt(np.diag(observed_info_mean)), final_estimator + quantile * np.sqrt(np.diag(observed_info_mean))]).T - log_ref = val + conjugate_arg.T.dot(self.cond_cov).dot(conjugate_arg) / 2. + log_ref = val + conjugate_arg.T.dot(Q.cond_cov).dot(conjugate_arg) / 2. 
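The alternative handling above maps each Z-score to a p-value in the usual way; a standalone numeric check with toy Z-scores.

    import numpy as np
    from scipy.stats import norm as ndist

    Z_scores = np.array([2.0, -1.0])
    cdf_vals = ndist.cdf(Z_scores)
    twosided = 2 * np.minimum(cdf_vals, 1 - cdf_vals)  # approx [0.0455, 0.3173]
    greater = 1 - cdf_vals                             # P(Z >= z)
    less = cdf_vals                                    # P(Z <= z)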
result = pd.DataFrame({'MLE': final_estimator, 'SE': np.sqrt(np.diag(observed_info_mean)), 'Zvalue': Z_scores, 'pvalue': pvalues, + 'alternative': alternatives, 'lower_confidence': intervals[:, 0], 'upper_confidence': intervals[:, 1], 'unbiased': unbiased_estimator}) @@ -92,18 +98,19 @@ def solve_estimating_eqn(self, useC= False, level=0.90): def _setup_estimating_eqn(self): + Q = self.query_spec T1 = self.regress_target_score.T.dot(self.prec_target) - T2 = T1.T.dot(self.M2.dot(T1)) - T3 = T1.T.dot(self.M3.dot(T1)) - T4 = self.M1.dot(self.opt_linear).dot(self.cond_cov).dot(self.opt_linear.T.dot(self.M1.T.dot(T1))) - T5 = T1.T.dot(self.M1.dot(self.opt_linear)) + T2 = T1.T.dot(Q.M2.dot(T1)) + T3 = T1.T.dot(Q.M3.dot(T1)) + T4 = Q.M1.dot(Q.opt_linear).dot(Q.cond_cov).dot(Q.opt_linear.T.dot(Q.M1.T.dot(T1))) + T5 = T1.T.dot(Q.M1.dot(Q.opt_linear)) self.prec_target_nosel = self.prec_target + T2 - T3 - _P = -(T1.T.dot(self.M1.dot(self.observed_score)) + T2.dot(self.observed_target)) + _P = -(T1.T.dot(Q.M1.dot(Q.observed_score)) + T2.dot(self.observed_target)) self.bias_target = self.cov_target.dot(T1.T.dot(-T4.dot(self.observed_target) - + self.M1.dot(self.opt_linear.dot(self.cond_mean))) - _P) + + Q.M1.dot(Q.opt_linear.dot(Q.cond_mean))) - _P) self.T3 = T3 self.T5 = T5 From a7704a4bf9265ee0007632faeb90d22d65af0922 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 17 Nov 2021 08:38:28 -0800 Subject: [PATCH 166/187] more cleanup of selective_MLE --- selectinf/randomized/selective_MLE.py | 61 +++++++++++++-------------- 1 file changed, 29 insertions(+), 32 deletions(-) diff --git a/selectinf/randomized/selective_MLE.py b/selectinf/randomized/selective_MLE.py index 757d55efe..9fc302b25 100644 --- a/selectinf/randomized/selective_MLE.py +++ b/selectinf/randomized/selective_MLE.py @@ -12,27 +12,21 @@ def __init__(self, target_spec, solve_args={'tol': 1.e-12}): - self.solve_args = solve_args - - (observed_target, - cov_target, - regress_target_score) = target_spec[:3] - - self.observed_target = observed_target - self.cov_target = cov_target - self.prec_target = np.linalg.inv(cov_target) - self.regress_target_score = regress_target_score - self.query_spec = query_spec - - self._setup_estimating_eqn() - + self.target_spec = target_spec + self.solve_args = solve_args + def solve_estimating_eqn(self, alternatives=None, useC=False, level=0.90): + prec_target_nosel, bias_target, U3, U5 = _setup_estimating_eqn(self.query_spec, + self.target_spec) + Q = self.query_spec + TS = self.target_spec + cond_precision = np.linalg.inv(Q.cond_cov) conjugate_arg = cond_precision.dot(Q.cond_mean) @@ -48,15 +42,15 @@ def solve_estimating_eqn(self, Q.offset, **self.solve_args) - final_estimator = self.cov_target.dot(self.prec_target_nosel).dot(self.observed_target) \ - + self.regress_target_score.dot(Q.M1.dot(Q.opt_linear)).dot(Q.cond_mean - soln) \ - - self.bias_target + final_estimator = TS.cov_target.dot(prec_target_nosel).dot(TS.observed_target) \ + + TS.regress_target_score.dot(Q.M1.dot(Q.opt_linear)).dot(Q.cond_mean - soln) \ + - bias_target - observed_info_natural = self.prec_target_nosel + self.T3 - self.T5.dot(hess.dot(self.T5.T)) + observed_info_natural = prec_target_nosel + U3 - U5.dot(hess.dot(U5.T)) - unbiased_estimator = self.cov_target.dot(self.prec_target_nosel).dot(self.observed_target) - self.bias_target + unbiased_estimator = TS.cov_target.dot(prec_target_nosel).dot(TS.observed_target) - bias_target - observed_info_mean = self.cov_target.dot(observed_info_natural.dot(self.cov_target)) + 
observed_info_mean = TS.cov_target.dot(observed_info_natural.dot(TS.cov_target)) Z_scores = final_estimator / np.sqrt(np.diag(observed_info_mean)) @@ -96,24 +90,27 @@ def solve_estimating_eqn(self, return result, observed_info_mean, log_ref - def _setup_estimating_eqn(self): +def _setup_estimating_eqn(query_spec, + target_spec): - Q = self.query_spec - T1 = self.regress_target_score.T.dot(self.prec_target) - T2 = T1.T.dot(Q.M2.dot(T1)) - T3 = T1.T.dot(Q.M3.dot(T1)) - T4 = Q.M1.dot(Q.opt_linear).dot(Q.cond_cov).dot(Q.opt_linear.T.dot(Q.M1.T.dot(T1))) - T5 = T1.T.dot(Q.M1.dot(Q.opt_linear)) + Q = query_spec + TS = target_spec + + prec_target = np.linalg.inv(TS.cov_target) + U1 = TS.regress_target_score.T.dot(prec_target) + U2 = U1.T.dot(Q.M2.dot(U1)) + U3 = U1.T.dot(Q.M3.dot(U1)) + U4 = Q.M1.dot(Q.opt_linear).dot(Q.cond_cov).dot(Q.opt_linear.T.dot(Q.M1.T.dot(U1))) + U5 = U1.T.dot(Q.M1.dot(Q.opt_linear)) - self.prec_target_nosel = self.prec_target + T2 - T3 + prec_target_nosel = prec_target + U2 - U3 - _P = -(T1.T.dot(Q.M1.dot(Q.observed_score)) + T2.dot(self.observed_target)) + _P = -(U1.T.dot(Q.M1.dot(Q.observed_score)) + U2.dot(TS.observed_target)) - self.bias_target = self.cov_target.dot(T1.T.dot(-T4.dot(self.observed_target) + bias_target = TS.cov_target.dot(U1.T.dot(-U4.dot(TS.observed_target) + Q.M1.dot(Q.opt_linear.dot(Q.cond_mean))) - _P) - self.T3 = T3 - self.T5 = T5 + return prec_target_nosel, bias_target, U3, U5 From 7bbcf9012bd50bb0aeeea394d44a6606af372085 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 17 Nov 2021 12:20:22 -0800 Subject: [PATCH 167/187] specification as a property; use QS instead of Q; standardizing grid_inference --- selectinf/randomized/approx_reference.py | 345 ++++++++++++----------- selectinf/randomized/base.py | 294 +++++++++++++++++++ selectinf/randomized/exact_reference.py | 5 +- selectinf/randomized/query.py | 31 +- 4 files changed, 491 insertions(+), 184 deletions(-) create mode 100644 selectinf/randomized/base.py diff --git a/selectinf/randomized/approx_reference.py b/selectinf/randomized/approx_reference.py index bb069396e..588838cfc 100644 --- a/selectinf/randomized/approx_reference.py +++ b/selectinf/randomized/approx_reference.py @@ -5,12 +5,12 @@ from ..distributions.discrete_family import discrete_family from ..algorithms.barrier_affine import solve_barrier_affine_py -from .selective_MLE import mle_inference +from .base import grid_inference -class approximate_grid_inference(object): +class approximate_grid_inference(grid_inference): def __init__(self, - query, + query_spec, target_spec, solve_args={'tol': 1.e-12}, useIP=False): @@ -33,49 +33,12 @@ def __init__(self, Arguments passed to solver. 
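Pulling `_setup_estimating_eqn` out as a module-level function makes it a pure computation on `(query_spec, target_spec)`. The U1–U5 products are ordinary matrix algebra, and the main thing to keep straight is their shapes; a shape-only sketch with random placeholders (dimensions and the M1/M2/M3 matrices below are arbitrary illustrations, not taken from a fitted model):

```python
import numpy as np

rng = np.random.default_rng(0)
ntarget, p, nopt = 3, 5, 4                      # arbitrary illustration sizes

cov_target = np.identity(ntarget)
regress_target_score = rng.standard_normal((ntarget, p))
M1 = rng.standard_normal((p, p))                # placeholders for the query's score matrices
M2 = rng.standard_normal((p, p))
M3 = rng.standard_normal((p, p))
opt_linear = rng.standard_normal((p, nopt))
cond_cov = np.identity(nopt)

prec_target = np.linalg.inv(cov_target)
U1 = regress_target_score.T.dot(prec_target)    # (p, ntarget)
U2 = U1.T.dot(M2.dot(U1))                       # (ntarget, ntarget)
U3 = U1.T.dot(M3.dot(U1))                       # (ntarget, ntarget)
U4 = M1.dot(opt_linear).dot(cond_cov).dot(opt_linear.T.dot(M1.T.dot(U1)))  # (p, ntarget)
U5 = U1.T.dot(M1.dot(opt_linear))               # (ntarget, nopt)

prec_target_nosel = prec_target + U2 - U3
print(prec_target_nosel.shape, U4.shape, U5.shape)
```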
""" - self.solve_args = solve_args + grid_inference.__init__(self, + query_spec, + target_spec, + solve_args=solve_args) - (observed_target, - cov_target, - regress_target_score) = target_spec[:3] - - self.observed_target = observed_target - self.cov_target = cov_target - self.prec_target = np.linalg.inv(cov_target) - self.regress_target_score = regress_target_score - - self.cond_mean = query.cond_mean - self.cond_cov = query.cond_cov - self.cond_precision = np.linalg.inv(self.cond_cov) - self.opt_linear = query.opt_linear - - self.linear_part = query.linear_part - self.offset = query.offset - - self.M1 = query.M1 - self.M2 = query.M2 - self.M3 = query.M3 - self.observed_soln = query.observed_opt_state - - self.observed_score = query.observed_score_state + query.observed_subgrad - - G = mle_inference(query, - target_spec, - solve_args=solve_args) - - _, inverse_info, log_ref = G.solve_estimating_eqn() - - self.ntarget = ntarget = cov_target.shape[0] - _scale = 4 * np.sqrt(np.diag(inverse_info)) - - if useIP == False: - ngrid = 1000 - self.stat_grid = np.zeros((ntarget, ngrid)) - for j in range(ntarget): - self.stat_grid[j, :] = np.linspace(observed_target[j] - 1.5 * _scale[j], - observed_target[j] + 1.5 * _scale[j], - num=ngrid) - else: + if useIP: ngrid = 60 self.stat_grid = np.zeros((ntarget, ngrid)) for j in range(ntarget): @@ -85,47 +48,105 @@ def __init__(self, self.useIP = useIP - self.inverse_info = inverse_info - - def summary(self, - alternatives=None, - parameter=None, - level=0.9): - """ - Produce p-values and confidence intervals for targets - of model including selected features - Parameters - ---------- - alternatives : [str], optional - Sequence of strings describing the alternatives, - should be values of ['twosided', 'less', 'greater'] - parameter : np.array - Hypothesized value for parameter -- defaults to 0. - level : float - Confidence level. 
- """ - - if parameter is not None: - pivots = self.approx_pivots(parameter, - alternatives=alternatives)[0] - else: - pivots = None - - pvalues = self._approx_pivots(np.zeros_like(self.observed_target), - alternatives=alternatives)[0] - lower, upper = self._approx_intervals(level=level) - result = pd.DataFrame({'target': self.observed_target, - 'pvalue': pvalues, - 'alternative': alternatives, - 'lower_confidence': lower, - 'upper_confidence': upper}) - - if not np.all(parameter == 0): - result.insert(4, 'pivot', pivots) - result.insert(5, 'parameter', parameter) - - return result + # self.useIP = useIP + # self.query_spec = query_spec + # self.target_spec = target_spec + # query = query_spec + + # self.solve_args = solve_args + + # (observed_target, + # cov_target, + # regress_target_score) = target_spec[:3] + + # self.observed_target = observed_target + # self.cov_target = cov_target + # self.prec_target = np.linalg.inv(cov_target) + # self.regress_target_score = regress_target_score + + # self.cond_mean = query.cond_mean + # self.cond_cov = query.cond_cov + # self.cond_precision = np.linalg.inv(self.cond_cov) + # self.opt_linear = query.opt_linear + + # self.linear_part = query.linear_part + # self.offset = query.offset + + # self.M1 = query.M1 + # self.M2 = query.M2 + # self.M3 = query.M3 + # self.observed_soln = query.observed_opt_state + + # self.observed_score = query.observed_score_state + query.observed_subgrad + + # G = mle_inference(query, + # target_spec, + # solve_args=solve_args) + + # _, inverse_info, log_ref = G.solve_estimating_eqn() + + # self.ntarget = ntarget = cov_target.shape[0] + # _scale = 4 * np.sqrt(np.diag(inverse_info)) + + # if useIP == False: + # ngrid = 1000 + # self.stat_grid = np.zeros((ntarget, ngrid)) + # for j in range(ntarget): + # self.stat_grid[j, :] = np.linspace(observed_target[j] - 1.5 * _scale[j], + # observed_target[j] + 1.5 * _scale[j], + # num=ngrid) + # else: + # ngrid = 60 + # self.stat_grid = np.zeros((ntarget, ngrid)) + # for j in range(ntarget): + # self.stat_grid[j, :] = np.linspace(observed_target[j] - 1.5 * _scale[j], + # observed_target[j] + 1.5 * _scale[j], + # num=ngrid) + + + # self.useIP = useIP + + + # def summary(self, + # alternatives=None, + # parameter=None, + # level=0.9): + # """ + # Produce p-values and confidence intervals for targets + # of model including selected features + # Parameters + # ---------- + # alternatives : [str], optional + # Sequence of strings describing the alternatives, + # should be values of ['twosided', 'less', 'greater'] + # parameter : np.array + # Hypothesized value for parameter -- defaults to 0. + # level : float + # Confidence level. 
+ # """ + + # if parameter is not None: + # pivots = self._pivots(parameter, + # alternatives=alternatives) + # else: + # pivots = None + + # pvalues = self._pivots(np.zeros_like(self.observed_target), + # alternatives=alternatives) + # lower, upper = self._intervals(level=level) + + # result = pd.DataFrame({'target': self.observed_target, + # 'pvalue': pvalues, + # 'alternative': alternatives, + # 'lower_confidence': lower, + # 'upper_confidence': upper}) + + # if not np.all(parameter == 0): + # result.insert(4, 'pivot', pivots) + # result.insert(5, 'parameter', parameter) + + # return result def _approx_log_reference(self, observed_target, @@ -206,116 +227,102 @@ def _construct_families(self): np.exp(logW))) self._log_ref = _log_ref - # construction of families follows `selectinf.learning.core` - - # logG = - 0.5 * grid**2 / var_target - # logG -= logG.max() - # import matplotlib.pyplot as plt - - # plt.plot(self.stat_grid[m][10:30], approx_log_ref[10:30]) - # plt.plot(self.stat_grid[m][:10], approx_log_ref[:10], 'r', linewidth=4) - # plt.plot(self.stat_grid[m][30:], approx_log_ref[30:], 'r', linewidth=4) - # plt.plot(self.stat_grid[m]*1.5, fapprox(self.stat_grid[m]*1.5), 'k--') - # plt.show() - # plt.plot(grid, logW) - # plt.plot(grid, logG) + # def _pivots(self, + # mean_parameter, + # alternatives=None): - def _approx_pivots(self, - mean_parameter, - alternatives=None): + # if not hasattr(self, "_families"): + # self._construct_families() - if not hasattr(self, "_families"): - self._construct_families() + # if alternatives is None: + # alternatives = ['twosided'] * self.ntarget - if alternatives is None: - alternatives = ['twosided'] * self.ntarget + # pivot = [] - pivot = [] + # for m in range(self.ntarget): - for m in range(self.ntarget): + # family = self._families[m] + # var_target = 1. / ((self.precs[m])[0, 0]) - family = self._families[m] - var_target = 1. 
/ ((self.precs[m])[0, 0]) + # mean = self.S[m].dot(mean_parameter[m].reshape((1,))) + self.r[m] + # # construction of pivot from families follows `selectinf.learning.core` - mean = self.S[m].dot(mean_parameter[m].reshape((1,))) + self.r[m] - # construction of pivot from families follows `selectinf.learning.core` + # _cdf = family.cdf((mean[0] - self.observed_target[m]) / var_target, x=self.observed_target[m]) - _cdf = family.cdf((mean[0] - self.observed_target[m]) / var_target, x=self.observed_target[m]) + # if alternatives[m] == 'twosided': + # pivot.append(2 * min(_cdf, 1 - _cdf)) + # elif alternatives[m] == 'greater': + # pivot.append(1 - _cdf) + # elif alternatives[m] == 'less': + # pivot.append(_cdf) + # else: + # raise ValueError('alternative should be in ["twosided", "less", "greater"]') + # return pivot # , self._log_ref - if alternatives[m] == 'twosided': - pivot.append(2 * min(_cdf, 1 - _cdf)) - elif alternatives[m] == 'greater': - pivot.append(1 - _cdf) - elif alternatives[m] == 'less': - pivot.append(_cdf) - else: - raise ValueError('alternative should be in ["twosided", "less", "greater"]') - return pivot, self._log_ref + # def _intervals(self, + # level=0.9): - def _approx_intervals(self, - level=0.9): + # if not hasattr(self, "_families"): + # self._construct_families() - if not hasattr(self, "_families"): - self._construct_families() + # lower, upper = [], [] - lower, upper = [], [] + # for m in range(self.ntarget): + # # construction of intervals from families follows `selectinf.learning.core` + # family = self._families[m] + # observed_target = self.observed_target[m] - for m in range(self.ntarget): - # construction of intervals from families follows `selectinf.learning.core` - family = self._families[m] - observed_target = self.observed_target[m] + # l, u = family.equal_tailed_interval(observed_target, + # alpha=1 - level) - l, u = family.equal_tailed_interval(observed_target, - alpha=1 - level) + # var_target = 1. / ((self.precs[m])[0, 0]) - var_target = 1. / ((self.precs[m])[0, 0]) + # lower.append(l * var_target + observed_target) + # upper.append(u * var_target + observed_target) - lower.append(l * var_target + observed_target) - upper.append(u * var_target + observed_target) + # return np.asarray(lower), np.asarray(upper) - return np.asarray(lower), np.asarray(upper) + # ### Private method + # def _construct_density(self): - ### Private method - def _construct_density(self): + # precs = {} + # S = {} + # r = {} + # T = {} - precs = {} - S = {} - r = {} - T = {} + # p = self.regress_target_score.shape[1] - p = self.regress_target_score.shape[1] - - for m in range(self.ntarget): - observed_target_uni = (self.observed_target[m]).reshape((1,)) - cov_target_uni = (np.diag(self.cov_target)[m]).reshape((1, 1)) - prec_target = 1. / cov_target_uni - regress_target_score_uni = self.regress_target_score[m, :].reshape((1, p)) + # for m in range(self.ntarget): + # observed_target_uni = (self.observed_target[m]).reshape((1,)) + # cov_target_uni = (np.diag(self.cov_target)[m]).reshape((1, 1)) + # prec_target = 1. 
/ cov_target_uni + # regress_target_score_uni = self.regress_target_score[m, :].reshape((1, p)) - T1 = regress_target_score_uni.T.dot(prec_target) - T2 = T1.T.dot(self.M2.dot(T1)) - T3 = T1.T.dot(self.M3.dot(T1)) - T4 = self.M1.dot(self.opt_linear).dot(self.cond_cov).dot(self.opt_linear.T.dot(self.M1.T.dot(T1))) - T5 = T1.T.dot(self.M1.dot(self.opt_linear)) + # T1 = regress_target_score_uni.T.dot(prec_target) + # T2 = T1.T.dot(self.M2.dot(T1)) + # T3 = T1.T.dot(self.M3.dot(T1)) + # T4 = self.M1.dot(self.opt_linear).dot(self.cond_cov).dot(self.opt_linear.T.dot(self.M1.T.dot(T1))) + # T5 = T1.T.dot(self.M1.dot(self.opt_linear)) - _T = self.cond_cov.dot(T5.T) + # _T = self.cond_cov.dot(T5.T) - prec_target_nosel = prec_target + T2 - T3 + # prec_target_nosel = prec_target + T2 - T3 - _P = -(T1.T.dot(self.M1.dot(self.observed_score)) + T2.dot(observed_target_uni)) + # _P = -(T1.T.dot(self.M1.dot(self.observed_score)) + T2.dot(observed_target_uni)) - bias_target = cov_target_uni.dot( - T1.T.dot(-T4.dot(observed_target_uni) + self.M1.dot(self.opt_linear.dot(self.cond_mean))) - _P) + # bias_target = cov_target_uni.dot( + # T1.T.dot(-T4.dot(observed_target_uni) + self.M1.dot(self.opt_linear.dot(self.cond_mean))) - _P) - _r = np.linalg.inv(prec_target_nosel).dot(prec_target.dot(bias_target)) - _S = np.linalg.inv(prec_target_nosel).dot(prec_target) + # _r = np.linalg.inv(prec_target_nosel).dot(prec_target.dot(bias_target)) + # _S = np.linalg.inv(prec_target_nosel).dot(prec_target) - S[m] = _S - r[m] = _r - precs[m] = prec_target_nosel - T[m] = _T + # S[m] = _S + # r[m] = _r + # precs[m] = prec_target_nosel + # T[m] = _T - self.precs = precs - self.S = S - self.r = r - self.T = T + # self.precs = precs + # self.S = S + # self.r = r + # self.T = T diff --git a/selectinf/randomized/base.py b/selectinf/randomized/base.py new file mode 100644 index 000000000..3f19eaeb2 --- /dev/null +++ b/selectinf/randomized/base.py @@ -0,0 +1,294 @@ +import numpy as np, pandas as pd + +from .selective_MLE import mle_inference + +class grid_inference(object): + + def __init__(self, + query_spec, + target_spec, + solve_args={'tol': 1.e-12}): + + """ + Produce p-values and confidence intervals for targets + of model including selected features + Parameters + ---------- + query : `gaussian_query` + A Gaussian query which has information + to describe implied Gaussian. + observed_target : ndarray + Observed estimate of target. + cov_target : ndarray + Estimated covaraince of target. + cov_target_score : ndarray + Estimated covariance of target and score of randomized query. + solve_args : dict, optional + Arguments passed to solver. 
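The new `base.py` centralizes what the two grid-based methods share — the grid, the family construction scaffolding, and the pivot/interval code — so subclasses only differ in how the log reference density is evaluated. A toy version of that split (class names here are illustrative, not selectinf classes):

```python
import numpy as np

class ToyGridInference:
    # shared machinery: build a grid, turn log-weights into a two-sided pivot
    def __init__(self, observed, scale, ngrid=201):
        self.observed = observed
        self.grid = np.linspace(observed - 3 * scale, observed + 3 * scale, ngrid)

    def pvalue(self):
        logW = self._log_reference(self.grid)      # supplied by subclasses
        W = np.exp(logW - logW.max())
        cdf = np.cumsum(W) / W.sum()
        _cdf = float(np.interp(self.observed, self.grid, cdf))
        return 2 * min(_cdf, 1 - _cdf)

class ToyGaussianReference(ToyGridInference):
    def _log_reference(self, grid):
        # a standard-normal reference; the real subclasses use the selective law
        return -0.5 * grid ** 2

print(ToyGaussianReference(observed=1.0, scale=1.0).pvalue())
```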
+ """ + + self.query_spec = query_spec + self.target_spec = target_spec + query = query_spec + + self.solve_args = solve_args + + (observed_target, + cov_target, + regress_target_score) = target_spec[:3] + + self.observed_target = observed_target + self.cov_target = cov_target + self.prec_target = np.linalg.inv(cov_target) + self.regress_target_score = regress_target_score + + self.cond_mean = query.cond_mean + self.cond_cov = query.cond_cov + self.cond_precision = np.linalg.inv(self.cond_cov) + self.opt_linear = query.opt_linear + + self.linear_part = query.linear_part + self.offset = query.offset + + self.M1 = query.M1 + self.M2 = query.M2 + self.M3 = query.M3 + self.observed_soln = query.observed_opt_state + + self.observed_score = query.observed_score_state + query.observed_subgrad + + G = mle_inference(query, + target_spec, + solve_args=solve_args) + + _, inverse_info, log_ref = G.solve_estimating_eqn() + + self.ntarget = ntarget = cov_target.shape[0] + _scale = 4 * np.sqrt(np.diag(inverse_info)) + self.inverse_info = inverse_info + + ngrid = 1000 + self.stat_grid = np.zeros((ntarget, ngrid)) + for j in range(ntarget): + self.stat_grid[j, :] = np.linspace(observed_target[j] - 1.5 * _scale[j], + observed_target[j] + 1.5 * _scale[j], + num=ngrid) + + def summary(self, + alternatives=None, + parameter=None, + level=0.9): + """ + Produce p-values and confidence intervals for targets + of model including selected features + Parameters + ---------- + alternatives : [str], optional + Sequence of strings describing the alternatives, + should be values of ['twosided', 'less', 'greater'] + parameter : np.array + Hypothesized value for parameter -- defaults to 0. + level : float + Confidence level. + """ + + if parameter is not None: + pivots = self._pivots(parameter, + alternatives=alternatives) + else: + pivots = None + + pvalues = self._pivots(np.zeros_like(self.observed_target), + alternatives=alternatives) + lower, upper = self._intervals(level=level) + + result = pd.DataFrame({'target': self.observed_target, + 'pvalue': pvalues, + 'alternative': alternatives, + 'lower_confidence': lower, + 'upper_confidence': upper}) + + if not np.all(parameter == 0): + result.insert(4, 'pivot', pivots) + result.insert(5, 'parameter', parameter) + + return result + + def _approx_log_reference(self, + observed_target, + cov_target, + linear_coef, + grid): + + """ + Approximate the log of the reference density on a grid. 
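`summary` reports p-values (pivots evaluated at zero) alongside pivots at any hypothesized `parameter`, and only inserts the extra columns when a nonzero parameter is supplied. A small pandas sketch of that column handling (toy values only):

```python
import numpy as np, pandas as pd

target = np.array([1.1, -0.3])
pvalues = np.array([0.03, 0.60])          # pivots evaluated at parameter = 0
pivots = np.array([0.45, 0.52])           # pivots evaluated at the hypothesized parameter
parameter = np.array([1.0, 0.0])

result = pd.DataFrame({'target': target,
                       'pvalue': pvalues,
                       'alternative': ['twosided', 'twosided'],
                       'lower_confidence': [0.2, -1.1],
                       'upper_confidence': [2.0, 0.5]})

if not np.all(parameter == 0):
    result.insert(4, 'pivot', pivots)
    result.insert(5, 'parameter', parameter)
print(result)
```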
+ """ + if np.asarray(observed_target).shape in [(), (0,)]: + raise ValueError('no target specified') + + ref_hat = [] + solver = solve_barrier_affine_py + + for k in range(grid.shape[0]): + # in the usual D = N + Gamma theta.hat, + # regress_opt_target is "something" times Gamma, + # where "something" comes from implied Gaussian + # cond_mean is "something" times D + # Gamma is cov_target_score.T.dot(prec_target) + + cond_mean_grid = (linear_coef.dot(np.atleast_1d(grid[k] - observed_target)) + self.cond_mean) + conjugate_arg = self.cond_precision.dot(cond_mean_grid) + + val, _, _ = solver(conjugate_arg, + self.cond_precision, + self.observed_soln, + self.linear_part, + self.offset, + **self.solve_args) + + ref_hat.append(-val - (conjugate_arg.T.dot(self.cond_cov).dot(conjugate_arg) / 2.)) + + return np.asarray(ref_hat) + + def _construct_families(self): + + self._construct_density() + + self._families = [] + _log_ref = np.zeros((self.ntarget, 1000)) + for m in range(self.ntarget): + + observed_target_uni = (self.observed_target[m]).reshape((1,)) + cov_target_uni = (np.diag(self.cov_target)[m]).reshape((1, 1)) + + var_target = 1. / ((self.precs[m])[0, 0]) + + approx_log_ref = self._approx_log_reference(observed_target_uni, + cov_target_uni, + self.T[m], + self.stat_grid[m]) + + if self.useIP == False: + + logW = (approx_log_ref - 0.5 * (self.stat_grid[m] - self.observed_target[m]) ** 2 / var_target) + logW -= logW.max() + _log_ref[m,:] = logW + self._families.append(discrete_family(self.stat_grid[m], + np.exp(logW))) + else: + + approx_fn = interp1d(self.stat_grid[m], + approx_log_ref, + kind='quadratic', + bounds_error=False, + fill_value='extrapolate') + + grid = np.linspace(self.stat_grid[m].min(), self.stat_grid[m].max(), 1000) + logW = (approx_fn(grid) - + 0.5 * (grid - self.observed_target[m]) ** 2 / var_target) + + logW -= logW.max() + _log_ref[m, :] = logW + self._families.append(discrete_family(grid, + np.exp(logW))) + + self._log_ref = _log_ref + + def _pivots(self, + mean_parameter, + alternatives=None): + + if not hasattr(self, "_families"): + self._construct_families() + + if alternatives is None: + alternatives = ['twosided'] * self.ntarget + + pivot = [] + + for m in range(self.ntarget): + + family = self._families[m] + var_target = 1. / ((self.precs[m])[0, 0]) + + mean = self.S[m].dot(mean_parameter[m].reshape((1,))) + self.r[m] + # construction of pivot from families follows `selectinf.learning.core` + + _cdf = family.cdf((mean[0] - self.observed_target[m]) / var_target, x=self.observed_target[m]) + + if alternatives[m] == 'twosided': + pivot.append(2 * min(_cdf, 1 - _cdf)) + elif alternatives[m] == 'greater': + pivot.append(1 - _cdf) + elif alternatives[m] == 'less': + pivot.append(_cdf) + else: + raise ValueError('alternative should be in ["twosided", "less", "greater"]') + return pivot # , self._log_ref + + def _intervals(self, + level=0.9): + + if not hasattr(self, "_families"): + self._construct_families() + + lower, upper = [], [] + + for m in range(self.ntarget): + # construction of intervals from families follows `selectinf.learning.core` + family = self._families[m] + observed_target = self.observed_target[m] + + l, u = family.equal_tailed_interval(observed_target, + alpha=1 - level) + + var_target = 1. 
/ ((self.precs[m])[0, 0]) + + lower.append(l * var_target + observed_target) + upper.append(u * var_target + observed_target) + + return np.asarray(lower), np.asarray(upper) + + ### Private method + def _construct_density(self): + + precs = {} + S = {} + r = {} + T = {} + + p = self.regress_target_score.shape[1] + + for m in range(self.ntarget): + observed_target_uni = (self.observed_target[m]).reshape((1,)) + cov_target_uni = (np.diag(self.cov_target)[m]).reshape((1, 1)) + prec_target = 1. / cov_target_uni + regress_target_score_uni = self.regress_target_score[m, :].reshape((1, p)) + + T1 = regress_target_score_uni.T.dot(prec_target) + T2 = T1.T.dot(self.M2.dot(T1)) + T3 = T1.T.dot(self.M3.dot(T1)) + T4 = self.M1.dot(self.opt_linear).dot(self.cond_cov).dot(self.opt_linear.T.dot(self.M1.T.dot(T1))) + T5 = T1.T.dot(self.M1.dot(self.opt_linear)) + + _T = self.cond_cov.dot(T5.T) + + prec_target_nosel = prec_target + T2 - T3 + + _P = -(T1.T.dot(self.M1.dot(self.observed_score)) + T2.dot(observed_target_uni)) + + bias_target = cov_target_uni.dot( + T1.T.dot(-T4.dot(observed_target_uni) + self.M1.dot(self.opt_linear.dot(self.cond_mean))) - _P) + + _r = np.linalg.inv(prec_target_nosel).dot(prec_target.dot(bias_target)) + _S = np.linalg.inv(prec_target_nosel).dot(prec_target) + + S[m] = _S + r[m] = _r + precs[m] = prec_target_nosel + T[m] = _T + + self.precs = precs + self.S = S + self.r = r + self.T = T diff --git a/selectinf/randomized/exact_reference.py b/selectinf/randomized/exact_reference.py index 5d9ba19a6..ccc76e37f 100644 --- a/selectinf/randomized/exact_reference.py +++ b/selectinf/randomized/exact_reference.py @@ -5,8 +5,9 @@ from ..distributions.discrete_family import discrete_family from .selective_MLE import mle_inference +from .base import grid_inference -class exact_grid_inference(object): +class exact_grid_inference(grid_inference): def __init__(self, query, @@ -97,7 +98,7 @@ def summary(self, pivots = None pvalues = self._pivots(np.zeros_like(self.observed_target), - alternatives=alternatives) + alternatives=alternatives) lower, upper = self._intervals(level=level) result = pd.DataFrame({'target': self.observed_target, diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index 594b08312..bd3a1bcd1 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -67,6 +67,23 @@ def __init__(self, randomization, perturb=None): self._randomized = False self._setup = False + @property + def specification(self): + return QuerySpec(cond_mean=self.cond_mean, + cond_cov=self.cond_cov, + opt_linear=self.opt_linear, + linear_part=self.affine_con.linear_part, + offset=self.affine_con.offset, + M1=self.M1, + M2=self.M2, + M3=self.M3, + observed_opt_state=self.observed_opt_state, + observed_score_state=self.observed_score_state, + observed_subgrad=self.observed_subgrad, + observed_soln=self.observed_opt_state, + observed_score=self.observed_score_state + self.observed_subgrad) + + # Methods reused by subclasses def randomize(self, perturb=None): @@ -216,19 +233,7 @@ def inference(self, Statistical summary for specified targets. 
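Exposing the bundle as a `specification` property lets any consumer grab a consistent snapshot of the fitted query without reaching into its attributes one by one. A stripped-down sketch of the pattern (toy class, not the real `gaussian_query`):

```python
from typing import NamedTuple
import numpy as np

class ToySpec(NamedTuple):
    cond_mean: np.ndarray
    cond_cov: np.ndarray

class ToyQuery:
    def __init__(self):
        self.cond_mean = np.zeros(3)
        self.cond_cov = np.identity(3)

    @property
    def specification(self):
        # assembled on demand, so it always reflects the current fitted state
        return ToySpec(cond_mean=self.cond_mean, cond_cov=self.cond_cov)

query_spec = ToyQuery().specification
print(query_spec.cond_cov.shape)
```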
""" - query_spec = QuerySpec(cond_mean=self.cond_mean, - cond_cov=self.cond_cov, - opt_linear=self.opt_linear, - linear_part=self.affine_con.linear_part, - offset=self.affine_con.offset, - M1=self.M1, - M2=self.M2, - M3=self.M3, - observed_opt_state=self.observed_opt_state, - observed_score_state=self.observed_score_state, - observed_subgrad=self.observed_subgrad, - observed_soln=self.observed_opt_state, - observed_score=self.observed_score_state + self.observed_subgrad) + query_spec = self.specification if method == 'selective_MLE': G = mle_inference(query_spec, From 559bc96bee16942564eaec98f8d0748db53d0996 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 17 Nov 2021 12:41:00 -0800 Subject: [PATCH 168/187] simplifying grid methods --- selectinf/randomized/approx_reference.py | 225 ++-------------------- selectinf/randomized/base.py | 74 +++---- selectinf/randomized/exact_reference.py | 234 ++--------------------- selectinf/randomized/selective_MLE.py | 30 +-- 4 files changed, 82 insertions(+), 481 deletions(-) diff --git a/selectinf/randomized/approx_reference.py b/selectinf/randomized/approx_reference.py index 588838cfc..e7d2df42f 100644 --- a/selectinf/randomized/approx_reference.py +++ b/selectinf/randomized/approx_reference.py @@ -49,105 +49,6 @@ def __init__(self, self.useIP = useIP - # self.useIP = useIP - # self.query_spec = query_spec - # self.target_spec = target_spec - # query = query_spec - - # self.solve_args = solve_args - - # (observed_target, - # cov_target, - # regress_target_score) = target_spec[:3] - - # self.observed_target = observed_target - # self.cov_target = cov_target - # self.prec_target = np.linalg.inv(cov_target) - # self.regress_target_score = regress_target_score - - # self.cond_mean = query.cond_mean - # self.cond_cov = query.cond_cov - # self.cond_precision = np.linalg.inv(self.cond_cov) - # self.opt_linear = query.opt_linear - - # self.linear_part = query.linear_part - # self.offset = query.offset - - # self.M1 = query.M1 - # self.M2 = query.M2 - # self.M3 = query.M3 - # self.observed_soln = query.observed_opt_state - - # self.observed_score = query.observed_score_state + query.observed_subgrad - - # G = mle_inference(query, - # target_spec, - # solve_args=solve_args) - - # _, inverse_info, log_ref = G.solve_estimating_eqn() - - # self.ntarget = ntarget = cov_target.shape[0] - # _scale = 4 * np.sqrt(np.diag(inverse_info)) - - # if useIP == False: - # ngrid = 1000 - # self.stat_grid = np.zeros((ntarget, ngrid)) - # for j in range(ntarget): - # self.stat_grid[j, :] = np.linspace(observed_target[j] - 1.5 * _scale[j], - # observed_target[j] + 1.5 * _scale[j], - # num=ngrid) - # else: - # ngrid = 60 - # self.stat_grid = np.zeros((ntarget, ngrid)) - # for j in range(ntarget): - # self.stat_grid[j, :] = np.linspace(observed_target[j] - 1.5 * _scale[j], - # observed_target[j] + 1.5 * _scale[j], - # num=ngrid) - - - # self.useIP = useIP - - - # def summary(self, - # alternatives=None, - # parameter=None, - # level=0.9): - # """ - # Produce p-values and confidence intervals for targets - # of model including selected features - # Parameters - # ---------- - # alternatives : [str], optional - # Sequence of strings describing the alternatives, - # should be values of ['twosided', 'less', 'greater'] - # parameter : np.array - # Hypothesized value for parameter -- defaults to 0. - # level : float - # Confidence level. 
- # """ - - # if parameter is not None: - # pivots = self._pivots(parameter, - # alternatives=alternatives) - # else: - # pivots = None - - # pvalues = self._pivots(np.zeros_like(self.observed_target), - # alternatives=alternatives) - # lower, upper = self._intervals(level=level) - - # result = pd.DataFrame({'target': self.observed_target, - # 'pvalue': pvalues, - # 'alternative': alternatives, - # 'lower_confidence': lower, - # 'upper_confidence': upper}) - - # if not np.all(parameter == 0): - # result.insert(4, 'pivot', pivots) - # result.insert(5, 'parameter', parameter) - - # return result - def _approx_log_reference(self, observed_target, cov_target, @@ -157,6 +58,11 @@ def _approx_log_reference(self, """ Approximate the log of the reference density on a grid. """ + + TS = self.target_spec + QS = self.query_spec + cond_precision = np.linalg.inv(QS.cond_cov) + if np.asarray(observed_target).shape in [(), (0,)]: raise ValueError('no target specified') @@ -170,30 +76,33 @@ def _approx_log_reference(self, # cond_mean is "something" times D # Gamma is cov_target_score.T.dot(prec_target) - cond_mean_grid = (linear_coef.dot(np.atleast_1d(grid[k] - observed_target)) + self.cond_mean) - conjugate_arg = self.cond_precision.dot(cond_mean_grid) + cond_mean_grid = (linear_coef.dot(np.atleast_1d(grid[k] - observed_target)) + QS.cond_mean) + conjugate_arg = cond_precision.dot(cond_mean_grid) val, _, _ = solver(conjugate_arg, - self.cond_precision, - self.observed_soln, - self.linear_part, - self.offset, + cond_precision, + QS.observed_soln, + QS.linear_part, + QS.offset, **self.solve_args) - ref_hat.append(-val - (conjugate_arg.T.dot(self.cond_cov).dot(conjugate_arg) / 2.)) + ref_hat.append(-val - (conjugate_arg.T.dot(QS.cond_cov).dot(conjugate_arg) / 2.)) return np.asarray(ref_hat) def _construct_families(self): + TS = self.target_spec + QS = self.query_spec + self._construct_density() self._families = [] _log_ref = np.zeros((self.ntarget, 1000)) for m in range(self.ntarget): - observed_target_uni = (self.observed_target[m]).reshape((1,)) - cov_target_uni = (np.diag(self.cov_target)[m]).reshape((1, 1)) + observed_target_uni = (TS.observed_target[m]).reshape((1,)) + cov_target_uni = (np.diag(TS.cov_target)[m]).reshape((1, 1)) var_target = 1. / ((self.precs[m])[0, 0]) @@ -204,7 +113,7 @@ def _construct_families(self): if self.useIP == False: - logW = (approx_log_ref - 0.5 * (self.stat_grid[m] - self.observed_target[m]) ** 2 / var_target) + logW = (approx_log_ref - 0.5 * (self.stat_grid[m] - TS.observed_target[m]) ** 2 / var_target) logW -= logW.max() _log_ref[m,:] = logW self._families.append(discrete_family(self.stat_grid[m], @@ -228,101 +137,3 @@ def _construct_families(self): self._log_ref = _log_ref - # def _pivots(self, - # mean_parameter, - # alternatives=None): - - # if not hasattr(self, "_families"): - # self._construct_families() - - # if alternatives is None: - # alternatives = ['twosided'] * self.ntarget - - # pivot = [] - - # for m in range(self.ntarget): - - # family = self._families[m] - # var_target = 1. 
/ ((self.precs[m])[0, 0]) - - # mean = self.S[m].dot(mean_parameter[m].reshape((1,))) + self.r[m] - # # construction of pivot from families follows `selectinf.learning.core` - - # _cdf = family.cdf((mean[0] - self.observed_target[m]) / var_target, x=self.observed_target[m]) - - # if alternatives[m] == 'twosided': - # pivot.append(2 * min(_cdf, 1 - _cdf)) - # elif alternatives[m] == 'greater': - # pivot.append(1 - _cdf) - # elif alternatives[m] == 'less': - # pivot.append(_cdf) - # else: - # raise ValueError('alternative should be in ["twosided", "less", "greater"]') - # return pivot # , self._log_ref - - # def _intervals(self, - # level=0.9): - - # if not hasattr(self, "_families"): - # self._construct_families() - - # lower, upper = [], [] - - # for m in range(self.ntarget): - # # construction of intervals from families follows `selectinf.learning.core` - # family = self._families[m] - # observed_target = self.observed_target[m] - - # l, u = family.equal_tailed_interval(observed_target, - # alpha=1 - level) - - # var_target = 1. / ((self.precs[m])[0, 0]) - - # lower.append(l * var_target + observed_target) - # upper.append(u * var_target + observed_target) - - # return np.asarray(lower), np.asarray(upper) - - # ### Private method - # def _construct_density(self): - - # precs = {} - # S = {} - # r = {} - # T = {} - - # p = self.regress_target_score.shape[1] - - # for m in range(self.ntarget): - # observed_target_uni = (self.observed_target[m]).reshape((1,)) - # cov_target_uni = (np.diag(self.cov_target)[m]).reshape((1, 1)) - # prec_target = 1. / cov_target_uni - # regress_target_score_uni = self.regress_target_score[m, :].reshape((1, p)) - - # T1 = regress_target_score_uni.T.dot(prec_target) - # T2 = T1.T.dot(self.M2.dot(T1)) - # T3 = T1.T.dot(self.M3.dot(T1)) - # T4 = self.M1.dot(self.opt_linear).dot(self.cond_cov).dot(self.opt_linear.T.dot(self.M1.T.dot(T1))) - # T5 = T1.T.dot(self.M1.dot(self.opt_linear)) - - # _T = self.cond_cov.dot(T5.T) - - # prec_target_nosel = prec_target + T2 - T3 - - # _P = -(T1.T.dot(self.M1.dot(self.observed_score)) + T2.dot(observed_target_uni)) - - # bias_target = cov_target_uni.dot( - # T1.T.dot(-T4.dot(observed_target_uni) + self.M1.dot(self.opt_linear.dot(self.cond_mean))) - _P) - - # _r = np.linalg.inv(prec_target_nosel).dot(prec_target.dot(bias_target)) - # _S = np.linalg.inv(prec_target_nosel).dot(prec_target) - - # S[m] = _S - # r[m] = _r - # precs[m] = prec_target_nosel - # T[m] = _T - - # self.precs = precs - # self.S = S - # self.r = r - # self.T = T diff --git a/selectinf/randomized/base.py b/selectinf/randomized/base.py index 3f19eaeb2..66db3d955 100644 --- a/selectinf/randomized/base.py +++ b/selectinf/randomized/base.py @@ -29,49 +29,24 @@ def __init__(self, self.query_spec = query_spec self.target_spec = target_spec - query = query_spec - self.solve_args = solve_args - (observed_target, - cov_target, - regress_target_score) = target_spec[:3] - - self.observed_target = observed_target - self.cov_target = cov_target - self.prec_target = np.linalg.inv(cov_target) - self.regress_target_score = regress_target_score - - self.cond_mean = query.cond_mean - self.cond_cov = query.cond_cov - self.cond_precision = np.linalg.inv(self.cond_cov) - self.opt_linear = query.opt_linear - - self.linear_part = query.linear_part - self.offset = query.offset - - self.M1 = query.M1 - self.M2 = query.M2 - self.M3 = query.M3 - self.observed_soln = query.observed_opt_state - - self.observed_score = query.observed_score_state + query.observed_subgrad - - G = 
mle_inference(query, + G = mle_inference(query_spec, target_spec, solve_args=solve_args) _, inverse_info, log_ref = G.solve_estimating_eqn() - self.ntarget = ntarget = cov_target.shape[0] + TS = target_spec + self.ntarget = ntarget = TS.cov_target.shape[0] _scale = 4 * np.sqrt(np.diag(inverse_info)) self.inverse_info = inverse_info ngrid = 1000 self.stat_grid = np.zeros((ntarget, ngrid)) for j in range(ntarget): - self.stat_grid[j, :] = np.linspace(observed_target[j] - 1.5 * _scale[j], - observed_target[j] + 1.5 * _scale[j], + self.stat_grid[j, :] = np.linspace(TS.observed_target[j] - 1.5 * _scale[j], + TS.observed_target[j] + 1.5 * _scale[j], num=ngrid) def summary(self, @@ -92,17 +67,19 @@ def summary(self, Confidence level. """ + TS = self.target_spec + if parameter is not None: pivots = self._pivots(parameter, alternatives=alternatives) else: pivots = None - pvalues = self._pivots(np.zeros_like(self.observed_target), + pvalues = self._pivots(np.zeros_like(TS.observed_target), alternatives=alternatives) lower, upper = self._intervals(level=level) - result = pd.DataFrame({'target': self.observed_target, + result = pd.DataFrame({'target': TS.observed_target, 'pvalue': pvalues, 'alternative': alternatives, 'lower_confidence': lower, @@ -198,6 +175,8 @@ def _pivots(self, mean_parameter, alternatives=None): + TS = self.target_spec + if not hasattr(self, "_families"): self._construct_families() @@ -214,7 +193,7 @@ def _pivots(self, mean = self.S[m].dot(mean_parameter[m].reshape((1,))) + self.r[m] # construction of pivot from families follows `selectinf.learning.core` - _cdf = family.cdf((mean[0] - self.observed_target[m]) / var_target, x=self.observed_target[m]) + _cdf = family.cdf((mean[0] - TS.observed_target[m]) / var_target, x=TS.observed_target[m]) if alternatives[m] == 'twosided': pivot.append(2 * min(_cdf, 1 - _cdf)) @@ -229,6 +208,8 @@ def _pivots(self, def _intervals(self, level=0.9): + TS = self.target_spec + if not hasattr(self, "_families"): self._construct_families() @@ -237,7 +218,7 @@ def _intervals(self, for m in range(self.ntarget): # construction of intervals from families follows `selectinf.learning.core` family = self._families[m] - observed_target = self.observed_target[m] + observed_target = TS.observed_target[m] l, u = family.equal_tailed_interval(observed_target, alpha=1 - level) @@ -252,33 +233,36 @@ def _intervals(self, ### Private method def _construct_density(self): + TS = self.target_spec + QS = self.query_spec + precs = {} S = {} r = {} T = {} - p = self.regress_target_score.shape[1] + p = TS.regress_target_score.shape[1] for m in range(self.ntarget): - observed_target_uni = (self.observed_target[m]).reshape((1,)) - cov_target_uni = (np.diag(self.cov_target)[m]).reshape((1, 1)) + observed_target_uni = (TS.observed_target[m]).reshape((1,)) + cov_target_uni = (np.diag(TS.cov_target)[m]).reshape((1, 1)) prec_target = 1. 
/ cov_target_uni - regress_target_score_uni = self.regress_target_score[m, :].reshape((1, p)) + regress_target_score_uni = TS.regress_target_score[m, :].reshape((1, p)) T1 = regress_target_score_uni.T.dot(prec_target) - T2 = T1.T.dot(self.M2.dot(T1)) - T3 = T1.T.dot(self.M3.dot(T1)) - T4 = self.M1.dot(self.opt_linear).dot(self.cond_cov).dot(self.opt_linear.T.dot(self.M1.T.dot(T1))) - T5 = T1.T.dot(self.M1.dot(self.opt_linear)) + T2 = T1.T.dot(QS.M2.dot(T1)) + T3 = T1.T.dot(QS.M3.dot(T1)) + T4 = QS.M1.dot(QS.opt_linear).dot(QS.cond_cov).dot(QS.opt_linear.T.dot(QS.M1.T.dot(T1))) + T5 = T1.T.dot(QS.M1.dot(QS.opt_linear)) - _T = self.cond_cov.dot(T5.T) + _T = QS.cond_cov.dot(T5.T) prec_target_nosel = prec_target + T2 - T3 - _P = -(T1.T.dot(self.M1.dot(self.observed_score)) + T2.dot(observed_target_uni)) + _P = -(T1.T.dot(QS.M1.dot(QS.observed_score)) + T2.dot(observed_target_uni)) bias_target = cov_target_uni.dot( - T1.T.dot(-T4.dot(observed_target_uni) + self.M1.dot(self.opt_linear.dot(self.cond_mean))) - _P) + T1.T.dot(-T4.dot(observed_target_uni) + QS.M1.dot(QS.opt_linear.dot(QS.cond_mean))) - _P) _r = np.linalg.inv(prec_target_nosel).dot(prec_target.dot(bias_target)) _S = np.linalg.inv(prec_target_nosel).dot(prec_target) diff --git a/selectinf/randomized/exact_reference.py b/selectinf/randomized/exact_reference.py index ccc76e37f..d37a99b3e 100644 --- a/selectinf/randomized/exact_reference.py +++ b/selectinf/randomized/exact_reference.py @@ -9,121 +9,24 @@ class exact_grid_inference(grid_inference): - def __init__(self, - query, - target_spec, - solve_args={'tol': 1.e-12}): - - """ - Produce p-values and confidence intervals for targets - of model including selected features - Parameters - ---------- - query : `gaussian_query` - A Gaussian query which has information - to describe implied Gaussian. - observed_target : ndarray - Observed estimate of target. - cov_target : ndarray - Estimated covaraince of target. - cov_target_score : ndarray - Estimated covariance of target and score of randomized query. 
- """ - - - (observed_target, - cov_target, - regress_target_score) = target_spec[:3] - - self.observed_target = observed_target - self.cov_target = cov_target - self.prec_target = np.linalg.inv(cov_target) - self.regress_target_score = regress_target_score - - self.cond_mean = query.cond_mean - self.cond_cov = query.cond_cov - self.cond_precision = np.linalg.inv(self.cond_cov) - self.opt_linear = query.opt_linear - - self.linear_part = query.linear_part - self.offset = query.offset - - self.M1 = query.M1 - self.M2 = query.M2 - self.M3 = query.M3 - self.observed_soln = query.observed_opt_state - - self.observed_score = query.observed_score_state + query.observed_subgrad - - G = mle_inference(query, - target_spec, - solve_args=solve_args) - - _, inverse_info, log_ref = G.solve_estimating_eqn() - - self.ntarget = ntarget = cov_target.shape[0] - _scale = 4 * np.sqrt(np.diag(inverse_info)) - - ngrid = 1000 - self.stat_grid = np.zeros((ntarget, ngrid)) - for j in range(ntarget): - self.stat_grid[j, :] = np.linspace(observed_target[j] - 1.5 * _scale[j], - observed_target[j] + 1.5 * _scale[j], - num=ngrid) - - self.inverse_info = inverse_info - - def summary(self, - alternatives=None, - parameter=None, - level=0.90): - """ - Produce p-values and confidence intervals for targets - of model including selected features - Parameters - ---------- - alternatives : [str], optional - Sequence of strings describing the alternatives, - should be values of ['twosided', 'less', 'greater'] - parameter : np.array - Hypothesized value for parameter -- defaults to 0. - level : float - Confidence level. - """ - - if parameter is not None: - pivots = self._pivots(parameter, - alternatives=alternatives) - else: - pivots = None - - pvalues = self._pivots(np.zeros_like(self.observed_target), - alternatives=alternatives) - lower, upper = self._intervals(level=level) - - result = pd.DataFrame({'target': self.observed_target, - 'pvalue': pvalues, - 'alternative': alternatives, - 'lower_confidence': lower, - 'upper_confidence': upper}) - - if not np.all(parameter == 0): - result.insert(4, 'pivot', pivots) - result.insert(5, 'parameter', parameter) - - return result - def log_reference(self, observed_target, cov_target, linear_coef, grid): + QS = self.query_spec + TS = self.target_spec + if np.asarray(observed_target).shape in [(), (0,)]: raise ValueError('no target specified') ref_hat = [] + cond_precision = np.linalg.inv(QS.cond_cov) + num_opt = cond_precision.shape[0] + num_con = QS.linear_part.shape[0] + for k in range(grid.shape[0]): # in the usual D = N + Gamma theta.hat, # regress_opt_target is "something" times Gamma, @@ -131,27 +34,24 @@ def log_reference(self, # cond_mean is "something" times D # Gamma is cov_target_score.T.dot(prec_target) - num_opt = self.cond_precision.shape[0] - num_con = self.linear_part.shape[0] - cond_mean_grid = (linear_coef.dot(np.atleast_1d(grid[k] - observed_target)) + - self.cond_mean) + QS.cond_mean) #direction for decomposing o - eta = self.cond_precision.dot(linear_coef).dot(cov_target) + eta = cond_precision.dot(linear_coef).dot(cov_target) implied_mean = np.asscalar(eta.T.dot(cond_mean_grid)) - implied_cov = np.asscalar(eta.T.dot(self.cond_cov).dot(eta)) + implied_cov = np.asscalar(eta.T.dot(QS.cond_cov).dot(eta)) implied_prec = 1./implied_cov - _A = self.cond_cov.dot(eta) * implied_prec + _A = QS.cond_cov.dot(eta) * implied_prec R = np.identity(num_opt) - _A.dot(eta.T) - A = self.linear_part.dot(_A).reshape((-1,)) - b = -self.linear_part.dot(R).dot(self.observed_soln) + A = 
QS.linear_part.dot(_A).reshape((-1,)) + b = -QS.linear_part.dot(R).dot(QS.observed_soln) - trunc_ = np.true_divide((self.offset + b), A) + trunc_ = np.true_divide((QS.offset + b), A) neg_indx = np.asarray([j for j in range(num_con) if A[j] < 0.]) pos_indx = np.asarray([j for j in range(num_con) if A[j] > 0.]) @@ -186,14 +86,17 @@ def log_reference(self, def _construct_families(self): + QS = self.query_spec + TS = self.target_spec + self._construct_density() self._families = [] for m in range(self.ntarget): - observed_target_uni = (self.observed_target[m]).reshape((1,)) - cov_target_uni = (np.diag(self.cov_target)[m]).reshape((1, 1)) + observed_target_uni = (TS.observed_target[m]).reshape((1,)) + cov_target_uni = (np.diag(TS.cov_target)[m]).reshape((1, 1)) var_target = 1. / ((self.precs[m])[0, 0]) @@ -202,108 +105,11 @@ def _construct_families(self): self.T[m], self.stat_grid[m]) - logW = (log_ref - 0.5 * (self.stat_grid[m] - self.observed_target[m]) ** 2 / var_target) + logW = (log_ref - 0.5 * (self.stat_grid[m] - TS.observed_target[m]) ** 2 / var_target) logW -= logW.max() self._families.append(discrete_family(self.stat_grid[m], np.exp(logW))) - def _pivots(self, - mean_parameter, - alternatives=None): - - if not hasattr(self, "_families"): - self._construct_families() - - if alternatives is None: - alternatives = ['twosided'] * self.ntarget - - pivot = [] - - for m in range(self.ntarget): - - family = self._families[m] - var_target = 1. / ((self.precs[m])[0, 0]) - - mean = self.S[m].dot(mean_parameter[m].reshape((1,))) + self.r[m] - - _cdf = family.cdf((mean[0] - self.observed_target[m]) / var_target, x=self.observed_target[m]) - - if alternatives[m] == 'twosided': - pivot.append(2 * min(_cdf, 1 - _cdf)) - elif alternatives[m] == 'greater': - pivot.append(1 - _cdf) - elif alternatives[m] == 'less': - pivot.append(_cdf) - else: - raise ValueError('alternative should be in ["twosided", "less", "greater"]') - return pivot - - def _intervals(self, - level=0.9): - - if not hasattr(self, "_families"): - self._construct_families() - - lower, upper = [], [] - - for m in range(self.ntarget): - # construction of intervals from families follows `selectinf.learning.core` - family = self._families[m] - observed_target = self.observed_target[m] - - l, u = family.equal_tailed_interval(observed_target, - alpha=1 - level) - - var_target = 1. / ((self.precs[m])[0, 0]) - - lower.append(l * var_target + observed_target) - upper.append(u * var_target + observed_target) - - return np.asarray(lower), np.asarray(upper) - - ### Private method - def _construct_density(self): - - precs = {} - S = {} - r = {} - T = {} - - p = self.regress_target_score.shape[1] - - for m in range(self.ntarget): - observed_target_uni = (self.observed_target[m]).reshape((1,)) - cov_target_uni = (np.diag(self.cov_target)[m]).reshape((1, 1)) - prec_target = 1. 
/ cov_target_uni - regress_target_score_uni = self.regress_target_score[m, :].reshape((1, p)) - - T1 = regress_target_score_uni.T.dot(prec_target) - T2 = T1.T.dot(self.M2.dot(T1)) - T3 = T1.T.dot(self.M3.dot(T1)) - T4 = self.M1.dot(self.opt_linear).dot(self.cond_cov).dot(self.opt_linear.T.dot(self.M1.T.dot(T1))) - T5 = T1.T.dot(self.M1.dot(self.opt_linear)) - - _T = self.cond_cov.dot(T5.T) - - prec_target_nosel = prec_target + T2 - T3 - - _P = -(T1.T.dot(self.M1.dot(self.observed_score)) + T2.dot(observed_target_uni)) - - bias_target = cov_target_uni.dot(T1.T.dot(-T4.dot(observed_target_uni) + self.M1.dot(self.opt_linear.dot(self.cond_mean))) - _P) - - _r = np.linalg.inv(prec_target_nosel).dot(prec_target.dot(bias_target)) - _S = np.linalg.inv(prec_target_nosel).dot(prec_target) - - S[m] = _S - r[m] = _r - precs[m] = prec_target_nosel - T[m] = _T - - self.precs = precs - self.S = S - self.r = r - self.T = T - diff --git a/selectinf/randomized/selective_MLE.py b/selectinf/randomized/selective_MLE.py index 9fc302b25..76bd8907b 100644 --- a/selectinf/randomized/selective_MLE.py +++ b/selectinf/randomized/selective_MLE.py @@ -24,11 +24,11 @@ def solve_estimating_eqn(self, prec_target_nosel, bias_target, U3, U5 = _setup_estimating_eqn(self.query_spec, self.target_spec) - Q = self.query_spec + QS = self.query_spec TS = self.target_spec - cond_precision = np.linalg.inv(Q.cond_cov) - conjugate_arg = cond_precision.dot(Q.cond_mean) + cond_precision = np.linalg.inv(QS.cond_cov) + conjugate_arg = cond_precision.dot(QS.cond_mean) if useC: solver = solve_barrier_affine_C @@ -37,13 +37,13 @@ def solve_estimating_eqn(self, val, soln, hess = solver(conjugate_arg, cond_precision, - Q.observed_soln, - Q.linear_part, - Q.offset, + QS.observed_soln, + QS.linear_part, + QS.offset, **self.solve_args) final_estimator = TS.cov_target.dot(prec_target_nosel).dot(TS.observed_target) \ - + TS.regress_target_score.dot(Q.M1.dot(Q.opt_linear)).dot(Q.cond_mean - soln) \ + + TS.regress_target_score.dot(QS.M1.dot(QS.opt_linear)).dot(QS.cond_mean - soln) \ - bias_target observed_info_natural = prec_target_nosel + U3 - U5.dot(hess.dot(U5.T)) @@ -77,7 +77,7 @@ def solve_estimating_eqn(self, intervals = np.vstack([final_estimator - quantile * np.sqrt(np.diag(observed_info_mean)), final_estimator + quantile * np.sqrt(np.diag(observed_info_mean))]).T - log_ref = val + conjugate_arg.T.dot(Q.cond_cov).dot(conjugate_arg) / 2. + log_ref = val + conjugate_arg.T.dot(QS.cond_cov).dot(conjugate_arg) / 2. 
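In `exact_reference`, each grid point reduces to a one-dimensional Gaussian restricted to an interval determined by the affine constraints, so the reference is assembled from truncated-normal probabilities. Schematically, the building block is the log-mass of an interval under a Gaussian (this shows the shape of the computation, not the package's exact code):

```python
import numpy as np
from scipy.stats import norm

def log_gaussian_interval_mass(lower, upper, mean, sd):
    # log P(lower < X < upper) for X ~ N(mean, sd^2)
    a, b = (lower - mean) / sd, (upper - mean) / sd
    return np.log(norm.cdf(b) - norm.cdf(a))

# e.g. mass of (0, inf) under N(1, 4)
print(log_gaussian_interval_mass(0., np.inf, mean=1., sd=2.))
```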
result = pd.DataFrame({'MLE': final_estimator, 'SE': np.sqrt(np.diag(observed_info_mean)), @@ -93,22 +93,22 @@ def solve_estimating_eqn(self, def _setup_estimating_eqn(query_spec, target_spec): - Q = query_spec + QS = query_spec TS = target_spec prec_target = np.linalg.inv(TS.cov_target) U1 = TS.regress_target_score.T.dot(prec_target) - U2 = U1.T.dot(Q.M2.dot(U1)) - U3 = U1.T.dot(Q.M3.dot(U1)) - U4 = Q.M1.dot(Q.opt_linear).dot(Q.cond_cov).dot(Q.opt_linear.T.dot(Q.M1.T.dot(U1))) - U5 = U1.T.dot(Q.M1.dot(Q.opt_linear)) + U2 = U1.T.dot(QS.M2.dot(U1)) + U3 = U1.T.dot(QS.M3.dot(U1)) + U4 = QS.M1.dot(QS.opt_linear).dot(QS.cond_cov).dot(QS.opt_linear.T.dot(QS.M1.T.dot(U1))) + U5 = U1.T.dot(QS.M1.dot(QS.opt_linear)) prec_target_nosel = prec_target + U2 - U3 - _P = -(U1.T.dot(Q.M1.dot(Q.observed_score)) + U2.dot(TS.observed_target)) + _P = -(U1.T.dot(QS.M1.dot(QS.observed_score)) + U2.dot(TS.observed_target)) bias_target = TS.cov_target.dot(U1.T.dot(-U4.dot(TS.observed_target) - + Q.M1.dot(Q.opt_linear.dot(Q.cond_mean))) - _P) + + QS.M1.dot(QS.opt_linear.dot(QS.cond_mean))) - _P) return prec_target_nosel, bias_target, U3, U5 From aea5df80cb5c8d00db6ae4f5de6892cf0470a8b7 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 17 Nov 2021 12:42:36 -0800 Subject: [PATCH 169/187] renaming temporary matrices --- selectinf/randomized/base.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/selectinf/randomized/base.py b/selectinf/randomized/base.py index 66db3d955..12f3b7bbf 100644 --- a/selectinf/randomized/base.py +++ b/selectinf/randomized/base.py @@ -249,20 +249,20 @@ def _construct_density(self): prec_target = 1. / cov_target_uni regress_target_score_uni = TS.regress_target_score[m, :].reshape((1, p)) - T1 = regress_target_score_uni.T.dot(prec_target) - T2 = T1.T.dot(QS.M2.dot(T1)) - T3 = T1.T.dot(QS.M3.dot(T1)) - T4 = QS.M1.dot(QS.opt_linear).dot(QS.cond_cov).dot(QS.opt_linear.T.dot(QS.M1.T.dot(T1))) - T5 = T1.T.dot(QS.M1.dot(QS.opt_linear)) + U1 = regress_target_score_uni.T.dot(prec_target) + U2 = U1.T.dot(QS.M2.dot(U1)) + U3 = U1.T.dot(QS.M3.dot(U1)) + U4 = QS.M1.dot(QS.opt_linear).dot(QS.cond_cov).dot(QS.opt_linear.T.dot(QS.M1.T.dot(U1))) + U5 = U1.T.dot(QS.M1.dot(QS.opt_linear)) - _T = QS.cond_cov.dot(T5.T) + _T = QS.cond_cov.dot(U5.T) - prec_target_nosel = prec_target + T2 - T3 + prec_target_nosel = prec_target + U2 - U3 - _P = -(T1.T.dot(QS.M1.dot(QS.observed_score)) + T2.dot(observed_target_uni)) + _P = -(U1.T.dot(QS.M1.dot(QS.observed_score)) + U2.dot(observed_target_uni)) bias_target = cov_target_uni.dot( - T1.T.dot(-T4.dot(observed_target_uni) + QS.M1.dot(QS.opt_linear.dot(QS.cond_mean))) - _P) + U1.T.dot(-U4.dot(observed_target_uni) + QS.M1.dot(QS.opt_linear.dot(QS.cond_mean))) - _P) _r = np.linalg.inv(prec_target_nosel).dot(prec_target.dot(bias_target)) _S = np.linalg.inv(prec_target_nosel).dot(prec_target) From 8fffbb203f455c9c73e10348fc8412d53741d505 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 17 Nov 2021 22:09:35 -0800 Subject: [PATCH 170/187] more cleanup; remains to unify U1,U2,U3... 
calculation across 4 methods --- selectinf/base.py | 5 +- selectinf/randomized/approx_reference.py | 6 +- selectinf/randomized/base.py | 111 ++++++++--------- selectinf/randomized/exact_reference.py | 6 +- selectinf/randomized/posterior_inference.py | 110 +++++++++-------- selectinf/randomized/query.py | 125 ++++++++++---------- 6 files changed, 173 insertions(+), 190 deletions(-) diff --git a/selectinf/base.py b/selectinf/base.py index 3c8100cf5..51c09ba85 100644 --- a/selectinf/base.py +++ b/selectinf/base.py @@ -1,4 +1,4 @@ -import typing +from typing import NamedTuple import numpy as np @@ -47,13 +47,12 @@ def restricted_estimator(loss, active, solve_args={'min_its':50, 'tol':1.e-10}): # functions construct targets of inference # and covariance with score representation -class TargetSpec(typing.NamedTuple): +class TargetSpec(NamedTuple): observed_target : np.ndarray cov_target : np.ndarray regress_target_score : np.ndarray alternatives : list - #dispersion : float = 1 def selected_targets(loglike, solution, diff --git a/selectinf/randomized/approx_reference.py b/selectinf/randomized/approx_reference.py index e7d2df42f..0e70b80d0 100644 --- a/selectinf/randomized/approx_reference.py +++ b/selectinf/randomized/approx_reference.py @@ -95,7 +95,7 @@ def _construct_families(self): TS = self.target_spec QS = self.query_spec - self._construct_density() + precs, S, r, T = self.conditional_spec self._families = [] _log_ref = np.zeros((self.ntarget, 1000)) @@ -104,11 +104,11 @@ def _construct_families(self): observed_target_uni = (TS.observed_target[m]).reshape((1,)) cov_target_uni = (np.diag(TS.cov_target)[m]).reshape((1, 1)) - var_target = 1. / ((self.precs[m])[0, 0]) + var_target = 1. / (precs[m][0, 0]) approx_log_ref = self._approx_log_reference(observed_target_uni, cov_target_uni, - self.T[m], + T[m], self.stat_grid[m]) if self.useIP == False: diff --git a/selectinf/randomized/base.py b/selectinf/randomized/base.py index 12f3b7bbf..b3aa8332b 100644 --- a/selectinf/randomized/base.py +++ b/selectinf/randomized/base.py @@ -1,7 +1,22 @@ +from typing import NamedTuple import numpy as np, pandas as pd from .selective_MLE import mle_inference +class ConditionalSpec(NamedTuple): + + # description of (preselection) conditional law of + # targets \hat{\theta} | u, N + # if they were unbiased, then: + # 1) precision will agree with marginal variance + # 2) scalings will all be 1 + # 3) shifts will be 0 + + precision : np.ndarray + scalings : np.ndarray + shifts : np.ndarray + T : np.ndarray # what is T? + class grid_inference(object): def __init__(self, @@ -127,58 +142,16 @@ def _approx_log_reference(self, return np.asarray(ref_hat) - def _construct_families(self): - - self._construct_density() - - self._families = [] - _log_ref = np.zeros((self.ntarget, 1000)) - for m in range(self.ntarget): - - observed_target_uni = (self.observed_target[m]).reshape((1,)) - cov_target_uni = (np.diag(self.cov_target)[m]).reshape((1, 1)) - - var_target = 1. 
/ ((self.precs[m])[0, 0]) - - approx_log_ref = self._approx_log_reference(observed_target_uni, - cov_target_uni, - self.T[m], - self.stat_grid[m]) - - if self.useIP == False: - - logW = (approx_log_ref - 0.5 * (self.stat_grid[m] - self.observed_target[m]) ** 2 / var_target) - logW -= logW.max() - _log_ref[m,:] = logW - self._families.append(discrete_family(self.stat_grid[m], - np.exp(logW))) - else: - - approx_fn = interp1d(self.stat_grid[m], - approx_log_ref, - kind='quadratic', - bounds_error=False, - fill_value='extrapolate') - - grid = np.linspace(self.stat_grid[m].min(), self.stat_grid[m].max(), 1000) - logW = (approx_fn(grid) - - 0.5 * (grid - self.observed_target[m]) ** 2 / var_target) - - logW -= logW.max() - _log_ref[m, :] = logW - self._families.append(discrete_family(grid, - np.exp(logW))) - - self._log_ref = _log_ref - def _pivots(self, mean_parameter, alternatives=None): TS = self.target_spec - + if not hasattr(self, "_families"): - self._construct_families() + self._construct_density() # generic + self._construct_families() # specific to the method + precs, S, r = self.conditional_spec if alternatives is None: alternatives = ['twosided'] * self.ntarget @@ -188,9 +161,9 @@ def _pivots(self, for m in range(self.ntarget): family = self._families[m] - var_target = 1. / ((self.precs[m])[0, 0]) + var_target = 1. / (precs[m][0, 0]) - mean = self.S[m].dot(mean_parameter[m].reshape((1,))) + self.r[m] + mean = S[m].dot(mean_parameter[m].reshape((1,))) + r[m] # construction of pivot from families follows `selectinf.learning.core` _cdf = family.cdf((mean[0] - TS.observed_target[m]) / var_target, x=TS.observed_target[m]) @@ -211,7 +184,10 @@ def _intervals(self, TS = self.target_spec if not hasattr(self, "_families"): - self._construct_families() + self._construct_density() # generic + self._construct_families() # specific to the method + + precs, S, r, _ = self.conditional_spec lower, upper = [], [] @@ -223,7 +199,9 @@ def _intervals(self, l, u = family.equal_tailed_interval(observed_target, alpha=1 - level) - var_target = 1. / ((self.precs[m])[0, 0]) + var_target = 1. / (precs[m][0, 0]) + + # JT: I think these should cover S \theta^* + r not theta^* lower.append(l * var_target + observed_target) upper.append(u * var_target + observed_target) @@ -231,15 +209,19 @@ def _intervals(self, return np.asarray(lower), np.asarray(upper) ### Private method + def _construct_density(self): + """ + What is this method doing? + """ TS = self.target_spec QS = self.query_spec - precs = {} - S = {} - r = {} - T = {} + precs = [] + S = [] + r = [] + T = [] p = TS.regress_target_score.shape[1] @@ -255,6 +237,7 @@ def _construct_density(self): U4 = QS.M1.dot(QS.opt_linear).dot(QS.cond_cov).dot(QS.opt_linear.T.dot(QS.M1.T.dot(U1))) U5 = U1.T.dot(QS.M1.dot(QS.opt_linear)) + # JT: what is _T? _T = QS.cond_cov.dot(U5.T) prec_target_nosel = prec_target + U2 - U3 @@ -267,12 +250,16 @@ def _construct_density(self): _r = np.linalg.inv(prec_target_nosel).dot(prec_target.dot(bias_target)) _S = np.linalg.inv(prec_target_nosel).dot(prec_target) - S[m] = _S - r[m] = _r - precs[m] = prec_target_nosel - T[m] = _T + S.append(_S) + r.append(_r) + precs.append(prec_target_nosel) + T.append(_T) + + self.conditional_spec = ConditionalSpec(np.array(precs), + np.array(S), + np.array(r), + np.array(T) # what is T here? 
+ ) + + return self.conditional_spec - self.precs = precs - self.S = S - self.r = r - self.T = T diff --git a/selectinf/randomized/exact_reference.py b/selectinf/randomized/exact_reference.py index d37a99b3e..8a3b51b6e 100644 --- a/selectinf/randomized/exact_reference.py +++ b/selectinf/randomized/exact_reference.py @@ -89,7 +89,7 @@ def _construct_families(self): QS = self.query_spec TS = self.target_spec - self._construct_density() + precs, S, r, T = self.conditional_spec self._families = [] @@ -98,11 +98,11 @@ def _construct_families(self): observed_target_uni = (TS.observed_target[m]).reshape((1,)) cov_target_uni = (np.diag(TS.cov_target)[m]).reshape((1, 1)) - var_target = 1. / ((self.precs[m])[0, 0]) + var_target = 1. / (precs[m][0, 0]) log_ref = self.log_reference(observed_target_uni, cov_target_uni, - self.T[m], + T[m], self.stat_grid[m]) logW = (log_ref - 0.5 * (self.stat_grid[m] - TS.observed_target[m]) ** 2 / var_target) diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index 0c33f3b96..7ab09195b 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -35,47 +35,24 @@ class posterior(object): """ def __init__(self, - query, + query_spec, target_spec, dispersion, prior, solve_args={'tol': 1.e-12}): + self.query_spec = QS = query_spec + self.target_spec = TS = target_spec self.solve_args = solve_args - (observed_target, - cov_target, - regress_target_score) = target_spec[:3] - - self.observed_target = observed_target - self.cov_target = cov_target - self.prec_target = np.linalg.inv(cov_target) - self.regress_target_score = regress_target_score - - self.cond_mean = query.cond_mean - self.cond_cov = query.cond_cov - self.cond_precision = np.linalg.inv(self.cond_cov) - self.opt_linear = query.opt_linear - - self.linear_part = query.linear_part - self.offset = query.offset - - self.M1 = query.M1 - self.M2 = query.M2 - self.M3 = query.M3 - self.observed_soln = query.observed_opt_state - - self.observed_score = query.observed_score_state + query.observed_subgrad - - G = mle_inference(query, + G = mle_inference(query_spec, target_spec, solve_args=solve_args) result, self.inverse_info, self.log_ref = G.solve_estimating_eqn() - self.ntarget = self.cov_target.shape[0] - self.nopt = self.cond_precision.shape[0] - + self.ntarget = TS.cov_target.shape[0] + self.nopt = QS.cond_cov.shape[0] self.initial_estimate = np.asarray(result['MLE']) self.dispersion = dispersion @@ -83,7 +60,7 @@ def __init__(self, ### Note for an informative prior we might want to change this... self.prior = prior - self._set_marginal_parameters() + self._get_marginal_parameters() def log_posterior(self, target_parameter, @@ -99,30 +76,39 @@ def log_posterior(self, Noise standard deviation. 
""" + QS = self.query_spec + TS = self.target_spec + + (prec_marginal, + linear_coef, + offset_coef, + r, + S, + prec_target_nosel) = self._get_marginal_parameters() + sigmasq = sigma ** 2 - target = self.S.dot(target_parameter) + self.r + target = S.dot(target_parameter) + r - mean_marginal = self.linear_coef.dot(target) + self.offset_coef - prec_marginal = self.prec_marginal + mean_marginal = linear_coef.dot(target) + offset_coef conjugate_marginal = prec_marginal.dot(mean_marginal) solver = solve_barrier_affine_py val, soln, hess = solver(conjugate_marginal, prec_marginal, - self.observed_soln, - self.linear_part, - self.offset, + QS.observed_soln, + QS.linear_part, + QS.offset, **self.solve_args) log_normalizer = -val - mean_marginal.T.dot(prec_marginal).dot(mean_marginal) / 2. - log_lik = -((self.observed_target - target).T.dot(self.prec_target_nosel).dot(self.observed_target - target)) / 2. \ + log_lik = -((TS.observed_target - target).T.dot(prec_target_nosel).dot(TS.observed_target - target)) / 2. \ - log_normalizer - grad_lik = self.S.T.dot(self.prec_target_nosel.dot(self.observed_target) - self.prec_target_nosel.dot(target) - - self.linear_coef.T.dot(prec_marginal.dot(soln) - conjugate_marginal)) + grad_lik = S.T.dot(prec_target_nosel.dot(TS.observed_target) - prec_target_nosel.dot(target) + - linear_coef.T.dot(prec_marginal.dot(soln) - conjugate_marginal)) log_prior, grad_prior = self.prior(target_parameter) @@ -134,7 +120,7 @@ def log_posterior(self, ### Private method - def _set_marginal_parameters(self): + def _get_marginal_parameters(self): """ This works out the implied covariance of optimization varibles as a function @@ -142,33 +128,43 @@ def _set_marginal_parameters(self): implied mean as a function of the true parameters. """ - T1 = self.regress_target_score.T.dot(self.prec_target) - T2 = T1.T.dot(self.M2.dot(T1)) - T3 = T1.T.dot(self.M3.dot(T1)) - T4 = self.M1.dot(self.opt_linear).dot(self.cond_cov).dot(self.opt_linear.T.dot(self.M1.T.dot(T1))) - T5 = T1.T.dot(self.M1.dot(self.opt_linear)) + QS = self.query_spec + TS = self.target_spec + + prec_target = np.linalg.inv(TS.cov_target) + cond_precision = np.linalg.inv(QS.cond_cov) + + U1 = TS.regress_target_score.T.dot(prec_target) + U2 = U1.T.dot(QS.M2.dot(U1)) + U3 = U1.T.dot(QS.M3.dot(U1)) + U4 = QS.M1.dot(QS.opt_linear).dot(QS.cond_cov).dot(QS.opt_linear.T.dot(QS.M1.T.dot(U1))) + U5 = U1.T.dot(QS.M1.dot(QS.opt_linear)) - prec_target_nosel = self.prec_target + T2 - T3 + prec_target_nosel = prec_target + U2 - U3 - _P = -(T1.T.dot(self.M1.dot(self.observed_score)) + T2.dot(self.observed_target)) + _P = -(U1.T.dot(QS.M1.dot(QS.observed_score)) + U2.dot(TS.observed_target)) - bias_target = self.cov_target.dot(T1.T.dot(-T4.dot(self.observed_target) + self.M1.dot(self.opt_linear.dot(self.cond_mean))) - _P) + bias_target = TS.cov_target.dot(U1.T.dot(-U4.dot(TS.observed_target) + + QS.M1.dot(QS.opt_linear.dot(QS.cond_mean))) - _P) ###set parameters for the marginal distribution of optimization variables - _Q = np.linalg.inv(prec_target_nosel + T3) - self.prec_marginal = self.cond_precision - T5.T.dot(_Q).dot(T5) - self.linear_coef = self.cond_cov.dot(T5.T) - self.offset_coef = self.cond_mean - self.linear_coef.dot(self.observed_target) + _Q = np.linalg.inv(prec_target_nosel + U3) + prec_marginal = cond_precision - U5.T.dot(_Q).dot(U5) + linear_coef = QS.cond_cov.dot(U5.T) + offset_coef = QS.cond_mean - linear_coef.dot(TS.observed_target) ###set parameters for the marginal distribution of target - r = 
np.linalg.inv(prec_target_nosel).dot(self.prec_target.dot(bias_target)) - S = np.linalg.inv(prec_target_nosel).dot(self.prec_target) + r = np.linalg.inv(prec_target_nosel).dot(prec_target.dot(bias_target)) + S = np.linalg.inv(prec_target_nosel).dot(prec_target) - self.r = r - self.S = S - self.prec_target_nosel = prec_target_nosel + return (prec_marginal, + linear_coef, + offset_coef, + r, + S, + prec_target_nosel) ### sampling methods diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index bd3a1bcd1..610d177ff 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -37,7 +37,7 @@ class QuerySpec(NamedTuple): observed_soln : np.ndarray observed_score : np.ndarray -class query(object): +class gaussian_query(object): r""" This class is the base of randomized selective inference based on convex programs. @@ -112,19 +112,19 @@ def set_sampler(self, sampler): sampler = property(get_sampler, set_sampler, doc='Sampler of optimization (augmented) variables.') - # implemented by subclasses + # # implemented by subclasses - def solve(self): + # def solve(self): - raise NotImplementedError('abstract method') + # raise NotImplementedError('abstract method') -class gaussian_query(query): +# class gaussian_query(query): - """ - A class with Gaussian perturbation to the objective -- - easy to apply CLT to such things - """ +# """ +# A class with Gaussian perturbation to the objective -- +# easy to apply CLT to such things +# """ def fit(self, perturb=None): @@ -259,64 +259,65 @@ def inference(self, level=level) elif method == 'posterior': - return self.posterior(target_spec, - **method_args)[1] + return _posterior(query_spec, + target_spec, + **method_args)[1] - def posterior(self, - target_spec, - level=0.90, - dispersion=1, - prior=None, - solve_args={'tol': 1.e-12}, - nsample=2000, - nburnin=500): - """ +def _posterior(query_spec, + target_spec, + level=0.90, + dispersion=1, + prior=None, + solve_args={'tol': 1.e-12}, + nsample=2000, + nburnin=500): + """ - Parameters - ---------- - target_spec : TargetSpec - Information needed to specify the target. - level : float - Level for credible interval. - dispersion : float, optional - Dispersion parameter for log-likelihood. - prior : callable - A callable object that takes a single argument - `parameter` of the same shape as `observed_target` - and returns (value of log prior, gradient of log prior) - solve_args : dict, optional - Arguments passed to solver. + Parameters + ---------- + target_spec : TargetSpec + Information needed to specify the target. + level : float + Level for credible interval. + dispersion : float, optional + Dispersion parameter for log-likelihood. + prior : callable + A callable object that takes a single argument + `parameter` of the same shape as `observed_target` + and returns (value of log prior, gradient of log prior) + solve_args : dict, optional + Arguments passed to solver. - """ + """ + + if prior is None: + Di = 1. 
/ (200 * np.diag(target_spec.cov_target)) + + def prior(target_parameter): + grad_prior = -target_parameter * Di + log_prior = -0.5 * np.sum(target_parameter ** 2 * Di) + return log_prior, grad_prior + + posterior_repr = posterior(query_spec, + target_spec, + dispersion, + prior, + solve_args=solve_args) + + samples = langevin_sampler(posterior_repr, + nsample=nsample, + nburnin=nburnin) + + delta = 0.5 * (1 - level) * 100 + lower = np.percentile(samples, delta, axis=0) + upper = np.percentile(samples, 100 - delta, axis=0) + mean = np.mean(samples, axis=0) + + return samples, pd.DataFrame({'estimate':mean, + 'lower_credible':lower, + 'upper_credible':upper}) - if prior is None: - Di = 1. / (200 * np.diag(target_spec.cov_target)) - - def prior(target_parameter): - grad_prior = -target_parameter * Di - log_prior = -0.5 * np.sum(target_parameter ** 2 * Di) - return log_prior, grad_prior - - posterior_repr = posterior(self, - target_spec, - dispersion, - prior, - solve_args=solve_args) - - samples = langevin_sampler(posterior_repr, - nsample=nsample, - nburnin=nburnin) - - delta = 0.5 * (1 - level) * 100 - lower = np.percentile(samples, delta, axis=0) - upper = np.percentile(samples, 100 - delta, axis=0) - mean = np.mean(samples, axis=0) - - return samples, pd.DataFrame({'estimate':mean, - 'lower_credible':lower, - 'upper_credible':upper}) - From 53645b7a2281532197e18f8ea3ae184e932cec57 Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Tue, 23 Nov 2021 07:45:28 -0500 Subject: [PATCH 171/187] adjusted cov in split_lasso; adjusted intervals for actual target in exact_ref --- selectinf/randomized/base.py | 13 +++++++++---- selectinf/randomized/exact_reference.py | 5 ++--- selectinf/randomized/lasso.py | 10 ++++++---- selectinf/randomized/query.py | 14 -------------- 4 files changed, 17 insertions(+), 25 deletions(-) diff --git a/selectinf/randomized/base.py b/selectinf/randomized/base.py index b3aa8332b..16773807b 100644 --- a/selectinf/randomized/base.py +++ b/selectinf/randomized/base.py @@ -195,16 +195,21 @@ def _intervals(self, # construction of intervals from families follows `selectinf.learning.core` family = self._families[m] observed_target = TS.observed_target[m] + unbiased_est = (observed_target - r[m][0]) * (1./(S[m][0,0])) - l, u = family.equal_tailed_interval(observed_target, - alpha=1 - level) + _l, _u = family.equal_tailed_interval(observed_target, + alpha=1 - level) + l = _l * (1./(S[m][0,0])) + u = _u * (1./(S[m][0,0])) var_target = 1. 
/ (precs[m][0, 0]) # JT: I think these should cover S \theta^* + r not theta^* - lower.append(l * var_target + observed_target) - upper.append(u * var_target + observed_target) + #lower.append(l * var_target + observed_target) + #upper.append(u * var_target + observed_target) + lower.append(l * var_target + unbiased_est) + upper.append(u * var_target + unbiased_est) return np.asarray(lower), np.asarray(upper) diff --git a/selectinf/randomized/exact_reference.py b/selectinf/randomized/exact_reference.py index 8a3b51b6e..dbc7711da 100644 --- a/selectinf/randomized/exact_reference.py +++ b/selectinf/randomized/exact_reference.py @@ -1,10 +1,9 @@ from __future__ import division, print_function -import numpy as np, pandas as pd +import numpy as np from scipy.stats import norm as ndist from ..distributions.discrete_family import discrete_family -from .selective_MLE import mle_inference from .base import grid_inference class exact_grid_inference(grid_inference): @@ -16,7 +15,7 @@ def log_reference(self, grid): QS = self.query_spec - TS = self.target_spec + TS = self.target_spec ## we don't use this; it seems that we have already formed the target_specific elements which we input as arguments for this functions if np.asarray(observed_target).shape in [(), (0,)]: raise ValueError('no target specified') diff --git a/selectinf/randomized/lasso.py b/selectinf/randomized/lasso.py index 26fecf91e..6f71819d0 100644 --- a/selectinf/randomized/lasso.py +++ b/selectinf/randomized/lasso.py @@ -788,11 +788,13 @@ def _setup_implied_gaussian(self, regress_opt[:, ordered_vars] = -cond_cov * signs[None, :] / (dispersion * ratio) cond_mean = regress_opt.dot(self.observed_score_state + observed_subgrad) - prod_score_prec = np.identity(self.nfeature) / ratio - - cov_rand = self._unscaled_cov_score * dispersion + ## probably missing a dispersion in the denominator + prod_score_prec_unnorm = np.identity(self.nfeature) / (dispersion * ratio) + + ## probably missing a multiplicative factor of ratio + cov_rand = self._unscaled_cov_score * (dispersion * ratio) - M1 = prod_score_prec * dispersion + M1 = prod_score_prec_unnorm * dispersion M2 = M1.dot(cov_rand).dot(M1.T) M3 = M1.dot(opt_linear.dot(cond_cov).dot(opt_linear.T)).dot(M1.T) diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index 610d177ff..f67ba3ec1 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -112,20 +112,6 @@ def set_sampler(self, sampler): sampler = property(get_sampler, set_sampler, doc='Sampler of optimization (augmented) variables.') - # # implemented by subclasses - - # def solve(self): - - # raise NotImplementedError('abstract method') - - -# class gaussian_query(query): - -# """ -# A class with Gaussian perturbation to the objective -- -# easy to apply CLT to such things -# """ - def fit(self, perturb=None): # take a new perturbation if supplied From dad81a2674c2e0c94111ec17399dab5181502d5a Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Thu, 25 Nov 2021 16:03:13 -0500 Subject: [PATCH 172/187] minor fix in return list --- selectinf/randomized/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/selectinf/randomized/base.py b/selectinf/randomized/base.py index 16773807b..9c05ca7b4 100644 --- a/selectinf/randomized/base.py +++ b/selectinf/randomized/base.py @@ -151,7 +151,7 @@ def _pivots(self, if not hasattr(self, "_families"): self._construct_density() # generic self._construct_families() # specific to the method - precs, S, r = self.conditional_spec + precs, S, r, 
_ = self.conditional_spec if alternatives is None: alternatives = ['twosided'] * self.ntarget From 40cd77b2d70e433dacc4755560407aafac051c25 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Mon, 29 Nov 2021 22:41:31 -0800 Subject: [PATCH 173/187] fixing approx_reference --- selectinf/randomized/approx_reference.py | 65 +++++++++++++++++------- selectinf/randomized/base.py | 9 ++-- 2 files changed, 50 insertions(+), 24 deletions(-) diff --git a/selectinf/randomized/approx_reference.py b/selectinf/randomized/approx_reference.py index 0e70b80d0..dd27e98b3 100644 --- a/selectinf/randomized/approx_reference.py +++ b/selectinf/randomized/approx_reference.py @@ -1,3 +1,4 @@ + from __future__ import division, print_function import numpy as np, pandas as pd @@ -13,7 +14,8 @@ def __init__(self, query_spec, target_spec, solve_args={'tol': 1.e-12}, - useIP=False): + ngrid=1000, + ncoarse=40): """ Produce p-values and confidence intervals for targets @@ -38,16 +40,7 @@ def __init__(self, target_spec, solve_args=solve_args) - if useIP: - ngrid = 60 - self.stat_grid = np.zeros((ntarget, ngrid)) - for j in range(ntarget): - self.stat_grid[j, :] = np.linspace(observed_target[j] - 1.5 * _scale[j], - observed_target[j] + 1.5 * _scale[j], - num=ngrid) - - - self.useIP = useIP + self.ncoarse = ncoarse def _approx_log_reference(self, observed_target, @@ -98,7 +91,19 @@ def _construct_families(self): precs, S, r, T = self.conditional_spec self._families = [] - _log_ref = np.zeros((self.ntarget, 1000)) + + if self.ncoarse is not None: + coarse_grid = np.zeros((self.stat_grid.shape[0], self.ncoarse)) + for j in range(coarse_grid.shape[0]): + coarse_grid[j,:] = np.linspace(self.stat_grid[j].min(), + self.stat_grid[j].max(), + self.ncoarse) + eval_grid = coarse_grid + else: + eval_grid = self.stat_grid + + _log_ref = np.zeros((self.ntarget, self.stat_grid[0].shape[0])) + for m in range(self.ntarget): observed_target_uni = (TS.observed_target[m]).reshape((1,)) @@ -109,9 +114,9 @@ def _construct_families(self): approx_log_ref = self._approx_log_reference(observed_target_uni, cov_target_uni, T[m], - self.stat_grid[m]) - - if self.useIP == False: + eval_grid[m]) + + if self.ncoarse is None: logW = (approx_log_ref - 0.5 * (self.stat_grid[m] - TS.observed_target[m]) ** 2 / var_target) logW -= logW.max() @@ -120,17 +125,41 @@ def _construct_families(self): np.exp(logW))) else: - approx_fn = interp1d(self.stat_grid[m], + approx_fn = interp1d(eval_grid[m], approx_log_ref, kind='quadratic', bounds_error=False, fill_value='extrapolate') - grid = np.linspace(self.stat_grid[m].min(), self.stat_grid[m].max(), 1000) + grid = self.stat_grid[m] logW = (approx_fn(grid) - - 0.5 * (grid - self.observed_target[m]) ** 2 / var_target) + 0.5 * (grid - TS.observed_target[m]) ** 2 / var_target) logW -= logW.max() + + DEBUG = False # JT: this can be removed + if DEBUG: + approx_log_ref2 = self._approx_log_reference(observed_target_uni, + cov_target_uni, + T[m], + grid) + logW2 = (approx_log_ref2 - 0.5 * (grid - TS.observed_target[m]) ** 2 / var_target) + logW2 -= logW2.max() + import matplotlib.pyplot as plt + plt.plot(grid, logW, label='extrapolated') + + plt.plot(grid, logW2, label='fine grid') + plt.legend() + + plt.figure(num=2) + plt.plot(eval_grid[m], approx_fn(eval_grid[m]), label='extrapolated coarse') + plt.plot(grid, approx_fn(grid), label='extrapolated fine') + plt.plot(grid, approx_log_ref2, label='fine grid') + plt.legend() + + plt.show() + stop + _log_ref[m, :] = logW self._families.append(discrete_family(grid, np.exp(logW))) 
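
The ncoarse branch added above evaluates the expensive reference density on a short coarse grid and then carries it onto the full stat_grid by quadratic interpolation before forming the discrete family. A minimal self-contained sketch of that pattern is given below; `log_reference`, the grid endpoints and `observed_target`/`var_target` are illustrative stand-ins, not part of the package API.

    import numpy as np
    from scipy.interpolate import interp1d

    def log_reference(grid):
        # stand-in for the barrier-solve based _approx_log_reference
        return -0.5 * grid ** 2 + 0.1 * np.cos(grid)

    stat_grid = np.linspace(-4, 4, 1000)            # fine grid (ngrid=1000)
    coarse_grid = np.linspace(stat_grid.min(),
                              stat_grid.max(), 40)  # coarse grid (ncoarse=40)

    # fit a quadratic interpolant to the coarse evaluations, allow extrapolation
    approx_fn = interp1d(coarse_grid,
                         log_reference(coarse_grid),
                         kind='quadratic',
                         bounds_error=False,
                         fill_value='extrapolate')

    observed_target, var_target = 0.5, 1.0
    # combine interpolated reference with the Gaussian factor on the fine grid
    logW = approx_fn(stat_grid) - 0.5 * (stat_grid - observed_target) ** 2 / var_target
    logW -= logW.max()                              # stabilize before exponentiating
    weights = np.exp(logW)                          # unnormalized selective law on the grid
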
diff --git a/selectinf/randomized/base.py b/selectinf/randomized/base.py index 9c05ca7b4..279e94dd7 100644 --- a/selectinf/randomized/base.py +++ b/selectinf/randomized/base.py @@ -22,7 +22,8 @@ class grid_inference(object): def __init__(self, query_spec, target_spec, - solve_args={'tol': 1.e-12}): + solve_args={'tol': 1.e-12}, + ngrid=1000): """ Produce p-values and confidence intervals for targets @@ -45,6 +46,7 @@ def __init__(self, self.query_spec = query_spec self.target_spec = target_spec self.solve_args = solve_args + self.ngrid = ngrid G = mle_inference(query_spec, target_spec, @@ -57,7 +59,6 @@ def __init__(self, _scale = 4 * np.sqrt(np.diag(inverse_info)) self.inverse_info = inverse_info - ngrid = 1000 self.stat_grid = np.zeros((ntarget, ngrid)) for j in range(ntarget): self.stat_grid[j, :] = np.linspace(TS.observed_target[j] - 1.5 * _scale[j], @@ -204,10 +205,6 @@ def _intervals(self, var_target = 1. / (precs[m][0, 0]) - # JT: I think these should cover S \theta^* + r not theta^* - - #lower.append(l * var_target + observed_target) - #upper.append(u * var_target + observed_target) lower.append(l * var_target + unbiased_est) upper.append(u * var_target + unbiased_est) From 1508e06e57815c5add6f6ffe25149ea111dfc14d Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Mon, 6 Dec 2021 22:38:33 -0800 Subject: [PATCH 174/187] refactor SLOPE to new form; added split_slope --- selectinf/randomized/slope.py | 422 +++++++++++++++++++++++++++++++++- 1 file changed, 412 insertions(+), 10 deletions(-) diff --git a/selectinf/randomized/slope.py b/selectinf/randomized/slope.py index b7ede0954..3015c8259 100644 --- a/selectinf/randomized/slope.py +++ b/selectinf/randomized/slope.py @@ -104,10 +104,10 @@ def fit(self, _active_signs = active_signs.copy() self.selection_variable = {'sign': _active_signs, - 'variables': self._overall} + 'variables': np.nonzero(self._overall)[0]} - indices = np.argsort(-np.fabs(self.observed_soln)) + indices = self.selection_variable['indices'] = np.argsort(-np.fabs(self.observed_soln)) sorted_soln = self.observed_soln[indices] initial_scalings = np.sort(np.unique(np.fabs(self.observed_soln[active])))[::-1] self.observed_opt_state = initial_scalings @@ -150,6 +150,7 @@ def fit(self, break signs_cluster = np.asarray(signs_cluster).T + self.selection_variable['signs_cluster'] = signs_cluster if signs_cluster.size == 0: return active_signs @@ -158,8 +159,6 @@ def fit(self, X_clustered = X[:, indices].dot(signs_cluster) _opt_linear_term = X.T.dot(X_clustered) - _, prec = self.randomizer.cov_prec - # now make the constraints self._setup = True @@ -170,13 +169,21 @@ def fit(self, A_scaling = np.vstack([A_scaling_0, A_scaling_1]) b_scaling = np.zeros(2 * self.num_opt_var - 1) - self._setup_sampler(A_scaling, - b_scaling, - _opt_linear_term, - self.observed_subgrad) - + self._setup_sampler_data = (A_scaling, + b_scaling, + _opt_linear_term, + self.observed_subgrad) + self.opt_linear = _opt_linear_term return active_signs + def setup_inference(self, + dispersion): + + if self.num_opt_var > 0: + self._setup_sampler(*self._setup_sampler_data, + dispersion=dispersion) + + # Targets of inference # and covariance with score representation # are same as LASSO @@ -187,7 +194,7 @@ def gaussian(X, slope_weights, sigma=1., quadratic=None, - ridge_term=0., + ridge_term=None, randomizer_scale=None): loglike = rr.glm.gaussian(X, Y, coef=1. 
/ sigma ** 2, quadratic=quadratic) @@ -207,6 +214,401 @@ def gaussian(X, ridge_term, randomizer) +# split SLOPE + +class split_slope(lasso): + + """ + Data split, then LASSO (i.e. data carving) + """ + + def __init__(self, + loglike, + slope_weights, + proportion_select, + ridge_term=0, + perturb=None, + estimate_dispersion=True): + + (self.loglike, + self.slope_weights, + self.proportion_select, + self.ridge_term) = (loglike, + slope_weights, + proportion_select, + ridge_term) + + self.nfeature = p = self.loglike.shape[0] + self.penalty = rr.slope(slope_weights, lagrange=1.) + self._initial_omega = perturb # random perturbation + self.estimate_dispersion = estimate_dispersion + + def fit(self, + solve_args={'tol': 1.e-12, 'min_its': 50}, + perturb=None): + + signs = slope.fit(self, + solve_args=solve_args, + perturb=perturb) + + # for data splitting randomization, + # we need to estimate a dispersion parameter + + # we then setup up the sampler again + df_fit = len(self.selection_variable['variables']) + + if self.estimate_dispersion: + + X, y = self.loglike.data + n, p = X.shape + + dispersion = 2 * (self.loglike.smooth_objective(self._beta_full, + 'func') / + (n - df_fit)) + + self.dispersion_ = dispersion + # run setup again after + # estimating dispersion + + self.df_fit = df_fit + + return signs + + + def setup_inference(self, + dispersion): + + if self.df_fit > 0: + + if dispersion is None: + self._setup_sampler(*self._setup_sampler_data, + dispersion=self.dispersion_) + + else: + self._setup_sampler(*self._setup_sampler_data, + dispersion=dispersion) + + def _setup_implied_gaussian(self, + opt_linear, + observed_subgrad, + dispersion=1): + + # key observation is that the covariance of the added noise is + # roughly dispersion * (1 - pi) / pi * X^TX (in OLS regression, similar for other + # models), so the precision is (X^TX)^{-1} * (pi / ((1 - pi) * dispersion)) + # and prec.dot(opt_linear) = S_E / (dispersion * (1 - pi) / pi) + # because opt_linear has shape p x E with the columns + # being those non-zero columns of the solution. Above S_E = np.diag(signs) + # the conditional precision is S_E Q[E][:,E] * pi / ((1 - pi) * dispersion) S_E + # and regress_opt is -Q[E][:,E]^{-1} S_E + # padded with zeros + # to be E x p + + pi_s = self.proportion_select + ratio = (1 - pi_s) / pi_s + + ordered_vars = self.selection_variable['variables'] + indices = self.selection_variable['indices'] + signs_cluster = self.selection_variable['signs_cluster'] + + # JT: this may be expensive to form -- not pxp but large + cond_precision = signs_cluster.T.dot(self.opt_linear[indices] / (dispersion * ratio)) + + assert(np.linalg.norm(cond_precision - cond_precision.T) / + np.linalg.norm(cond_precision) < 1.e-6) + cond_cov = np.linalg.inv(cond_precision) + regress_opt = np.zeros((len(ordered_vars), + self.nfeature)) + # JT: not sure this is right -- had to remove signs + regress_opt[:, ordered_vars] = -cond_cov / (dispersion * ratio) + cond_mean = regress_opt.dot(self.observed_score_state + observed_subgrad) + + ## probably missing a dispersion in the denominator + prod_score_prec_unnorm = np.identity(self.nfeature) / (dispersion * ratio) + + ## probably missing a multiplicative factor of ratio + cov_rand = self._unscaled_cov_score * (dispersion * ratio) + + M1 = prod_score_prec_unnorm * dispersion + M2 = M1.dot(cov_rand).dot(M1.T) + M3 = M1.dot(opt_linear.dot(cond_cov).dot(opt_linear.T)).dot(M1.T) + + # would be nice to not store these? 
+ + self.M1 = M1 + self.M2 = M2 + self.M3 = M3 + + return (cond_mean, + cond_cov, + cond_precision, + M1, + M2, + M3) + + def _solve_randomized_problem(self, + # optional binary vector + # indicating selection data + perturb=None, + solve_args={'tol': 1.e-12, 'min_its': 50}): + + # take a new perturbation if none supplied + if perturb is not None: + self._selection_idx = perturb + if not hasattr(self, "_selection_idx"): + X, y = self.loglike.data + total_size = n = X.shape[0] + pi_s = self.proportion_select + self._selection_idx = np.zeros(n, np.bool) + self._selection_idx[:int(pi_s*n)] = True + np.random.shuffle(self._selection_idx) + + inv_frac = 1 / self.proportion_select + quad = rr.identity_quadratic(self.ridge_term, + 0, + 0, + 0) + + randomized_loss = self.loglike.subsample(self._selection_idx) + randomized_loss.coef *= inv_frac + + problem = rr.simple_problem(randomized_loss, self.penalty) + observed_soln = problem.solve(quad, **solve_args) + observed_subgrad = -(randomized_loss.smooth_objective(observed_soln, + 'grad') + + quad.objective(observed_soln, 'grad')) + + return observed_soln, observed_subgrad + + @staticmethod + def gaussian(X, + Y, + slope_weights, + proportion, + sigma=1., + quadratic=None, + estimate_dispersion=True): + r""" + Squared-error LASSO with feature weights. + Objective function is (before randomization) + + .. math:: + + \beta \mapsto \frac{1}{2} \|Y-X\beta\|^2_2 + + \sum_{i=1}^p \lambda_i |\beta_i| + + where $\lambda$ is `slope_weights`. The ridge term + is determined by the Hessian and `np.std(Y)` by default. + + Parameters + ---------- + + X : ndarray + Shape (n,p) -- the design matrix. + + Y : ndarray + Shape (n,) -- the response. + + slope_weights: [float, sequence] + + proportion: float + What proportion of data to use for selection. + + sigma : float (optional) + Noise variance. Set to 1 if `covariance_estimator` is not None. + This scales the loglikelihood by `sigma**(-2)`. + + quadratic : `regreg.identity_quadratic.identity_quadratic` (optional) + An optional quadratic term to be added to the objective. + Can also be a linear term by setting quadratic + coefficient to 0. + + Returns + ------- + + L : `selection.randomized.slope.slope` + + """ + + loglike = rr.glm.gaussian(X, + Y, + coef=1. / sigma ** 2, + quadratic=quadratic) + + return split_slope(loglike, + np.asarray(slope_weights)/sigma**2, + proportion, + estimate_dispersion=estimate_dispersion) + + + @staticmethod + def logistic(X, + successes, + slope_weights, + proportion, + trials=None, + quadratic=None): + r""" + Logistic LASSO with feature weights (before randomization) + + .. math:: + + \beta \mapsto \ell(X\beta) + \sum_{i=1}^p \lambda_i |\beta_i| + + where $\ell$ is the negative of the logistic + log-likelihood (half the logistic deviance) + and $\lambda$ is `slope_weights`. + + Parameters + ---------- + + X : ndarray + Shape (n,p) -- the design matrix. + + successes : ndarray + Shape (n,) -- response vector. An integer number of successes. + For data that is proportions, multiply the proportions + by the number of trials first. + + slope_weights: [float, sequence] + + proportion: float + What proportion of data to use for selection. + + trials : ndarray (optional) + Number of trials per response, defaults to + ones the same shape as Y. + + quadratic : `regreg.identity_quadratic.identity_quadratic` (optional) + An optional quadratic term to be added to the objective. + Can also be a linear term by setting quadratic + coefficient to 0. 
+ + Returns + ------- + + L : `selection.randomized.slope.slope` + + """ + + loglike = rr.glm.logistic(X, + successes, + trials=trials, + quadratic=quadratic) + + return split_slope(loglike, + np.asarray(slope_weights), + proportion) + + @staticmethod + def coxph(X, + times, + status, + slope_weights, + proportion, + quadratic=None): + r""" + Cox proportional hazards LASSO with feature weights. + Objective function is (before randomization) + + .. math:: + + \beta \mapsto \ell^{\text{Cox}}(\beta) + + \sum_{i=1}^p \lambda_i |\beta_i| + + where $\ell^{\text{Cox}}$ is the + negative of the log of the Cox partial + likelihood and $\lambda$ is `slope_weights`. + Uses Efron's tie breaking method. + + Parameters + ---------- + + X : ndarray + Shape (n,p) -- the design matrix. + + times : ndarray + Shape (n,) -- the survival times. + + status : ndarray + Shape (n,) -- the censoring status. + + slope_weights: [float, sequence] + + + proportion: float + What proportion of data to use for selection. + + quadratic : `regreg.identity_quadratic.identity_quadratic` (optional) + An optional quadratic term to be added to the objective. + Can also be a linear term by setting quadratic + coefficient to 0. + + Returns + ------- + + L : `selection.randomized.slope.slope` + + """ + n, p = X.shape + loglike = rr.glm.cox(X, times, status, quadratic=quadratic) + + return split_slope(loglike, + np.asarray(slope_weights), + proportion) + + @staticmethod + def poisson(X, + counts, + slope_weights, + proportion, + quadratic=None, + ridge_term=0): + r""" + Poisson log-linear LASSO with feature weights. + Objective function is (before randomization) + + .. math:: + + \beta \mapsto \ell^{\text{Poisson}}(\beta) + \sum_{i=1}^p \lambda_i |\beta_i| + + where $\ell^{\text{Poisson}}$ is the negative + of the log of the Poisson likelihood (half the deviance) + and $\lambda$ is `slope_weights`. + + Parameters + ---------- + + X : ndarray + Shape (n,p) -- the design matrix. + + counts : ndarray + Shape (n,) -- the response. + + slope_weights: [float, sequence] + + proportion: float + What proportion of data to use for selection. + + quadratic : `regreg.identity_quadratic.identity_quadratic` (optional) + An optional quadratic term to be added to the objective. + Can also be a linear term by setting quadratic + coefficient to 0. 
+ + Returns + ------- + + L : `selection.randomized.slope.slope` + + """ + loglike = rr.glm.poisson(X, counts, quadratic=quadratic) + + return split_slope(loglike, + np.asarray(slope_weights), + proportion) + + + # Projection onto selected subgradients of SLOPE def _projection_onto_selected_subgradients(prox_arg, From 8d05d911845bfec4d2e0032ff3f93e62f99e8625 Mon Sep 17 00:00:00 2001 From: Snigdha Panigrahi Date: Mon, 13 Dec 2021 21:27:49 -0500 Subject: [PATCH 175/187] moved U1-5 calculations from methods to base --- selectinf/base.py | 15 ++++++ selectinf/randomized/base.py | 16 +++--- selectinf/randomized/posterior_inference.py | 27 +++++++--- selectinf/randomized/selective_MLE.py | 55 ++++++++++----------- 4 files changed, 71 insertions(+), 42 deletions(-) diff --git a/selectinf/base.py b/selectinf/base.py index 51c09ba85..9371d62d3 100644 --- a/selectinf/base.py +++ b/selectinf/base.py @@ -269,3 +269,18 @@ def _pearsonX2(y, n = y.shape[0] resid = y - loglike.saturated_loss.mean_function(linpred) return (resid ** 2 / W).sum() / (n - df_fit) + +def target_query_Interactspec(query_spec, + regress_target_score, + cov_target): + + QS = query_spec + prec_target = np.linalg.inv(cov_target) + + U1 = regress_target_score.T.dot(prec_target) + U2 = U1.T.dot(QS.M2.dot(U1)) + U3 = U1.T.dot(QS.M3.dot(U1)) + U4 = QS.M1.dot(QS.opt_linear).dot(QS.cond_cov).dot(QS.opt_linear.T.dot(QS.M1.T.dot(U1))) + U5 = U1.T.dot(QS.M1.dot(QS.opt_linear)) + + return U1, U2, U3, U4, U5 \ No newline at end of file diff --git a/selectinf/randomized/base.py b/selectinf/randomized/base.py index 279e94dd7..cdaf21ca5 100644 --- a/selectinf/randomized/base.py +++ b/selectinf/randomized/base.py @@ -2,6 +2,7 @@ import numpy as np, pandas as pd from .selective_MLE import mle_inference +from ..base import target_query_Interactspec class ConditionalSpec(NamedTuple): @@ -230,14 +231,13 @@ def _construct_density(self): for m in range(self.ntarget): observed_target_uni = (TS.observed_target[m]).reshape((1,)) cov_target_uni = (np.diag(TS.cov_target)[m]).reshape((1, 1)) - prec_target = 1. / cov_target_uni regress_target_score_uni = TS.regress_target_score[m, :].reshape((1, p)) - U1 = regress_target_score_uni.T.dot(prec_target) - U2 = U1.T.dot(QS.M2.dot(U1)) - U3 = U1.T.dot(QS.M3.dot(U1)) - U4 = QS.M1.dot(QS.opt_linear).dot(QS.cond_cov).dot(QS.opt_linear.T.dot(QS.M1.T.dot(U1))) - U5 = U1.T.dot(QS.M1.dot(QS.opt_linear)) + U1, U2, U3, U4, U5 = target_query_Interactspec(QS, + regress_target_score_uni, + cov_target_uni) + + prec_target = 1. / cov_target_uni # JT: what is _T? 
_T = QS.cond_cov.dot(U5.T) @@ -265,3 +265,7 @@ def _construct_density(self): return self.conditional_spec + + + + diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index 7ab09195b..256a5ae78 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -8,6 +8,7 @@ from ..algorithms.barrier_affine import solve_barrier_affine_py from .selective_MLE import mle_inference +from ..base import target_query_Interactspec class PosteriorAtt(typing.NamedTuple): @@ -131,14 +132,12 @@ def _get_marginal_parameters(self): QS = self.query_spec TS = self.target_spec + U1, U2, U3, U4, U5 = target_query_Interactspec(QS, + TS.regress_target_score, + TS.cov_target) + prec_target = np.linalg.inv(TS.cov_target) cond_precision = np.linalg.inv(QS.cond_cov) - - U1 = TS.regress_target_score.T.dot(prec_target) - U2 = U1.T.dot(QS.M2.dot(U1)) - U3 = U1.T.dot(QS.M3.dot(U1)) - U4 = QS.M1.dot(QS.opt_linear).dot(QS.cond_cov).dot(QS.opt_linear.T.dot(QS.M1.T.dot(U1))) - U5 = U1.T.dot(QS.M1.dot(QS.opt_linear)) prec_target_nosel = prec_target + U2 - U3 @@ -282,3 +281,19 @@ def __next__(self): self.state[:] = candidate break return self.state + + +def target_query_Interactspec(query_spec, + regress_target_score, + cov_target): + + QS = query_spec + prec_target = np.linalg.inv(cov_target) + + U1 = regress_target_score.T.dot(prec_target) + U2 = U1.T.dot(QS.M2.dot(U1)) + U3 = U1.T.dot(QS.M3.dot(U1)) + U4 = QS.M1.dot(QS.opt_linear).dot(QS.cond_cov).dot(QS.opt_linear.T.dot(QS.M1.T.dot(U1))) + U5 = U1.T.dot(QS.M1.dot(QS.opt_linear)) + + return U1, U2, U3, U4, U5 diff --git a/selectinf/randomized/selective_MLE.py b/selectinf/randomized/selective_MLE.py index 76bd8907b..0fff47de6 100644 --- a/selectinf/randomized/selective_MLE.py +++ b/selectinf/randomized/selective_MLE.py @@ -4,6 +4,7 @@ from scipy.stats import norm as ndist from .selective_MLE_utils import solve_barrier_affine as solve_barrier_affine_C from ..algorithms.barrier_affine import solve_barrier_affine_py +from ..base import target_query_Interactspec class mle_inference(object): @@ -21,11 +22,21 @@ def solve_estimating_eqn(self, useC=False, level=0.90): - prec_target_nosel, bias_target, U3, U5 = _setup_estimating_eqn(self.query_spec, - self.target_spec) - QS = self.query_spec TS = self.target_spec + + U1, U2, U3, U4, U5= target_query_Interactspec(QS, + TS.regress_target_score, + TS.cov_target) + + prec_target = np.linalg.inv(TS.cov_target) + + prec_target_nosel = prec_target + U2 - U3 + + _P = -(U1.T.dot(QS.M1.dot(QS.observed_score)) + U2.dot(TS.observed_target)) + + bias_target = TS.cov_target.dot(U1.T.dot(-U4.dot(TS.observed_target) + + QS.M1.dot(QS.opt_linear.dot(QS.cond_mean))) - _P) cond_precision = np.linalg.inv(QS.cond_cov) conjugate_arg = cond_precision.dot(QS.cond_mean) @@ -90,33 +101,17 @@ def solve_estimating_eqn(self, return result, observed_info_mean, log_ref -def _setup_estimating_eqn(query_spec, - target_spec): - - QS = query_spec - TS = target_spec - - prec_target = np.linalg.inv(TS.cov_target) - U1 = TS.regress_target_score.T.dot(prec_target) - U2 = U1.T.dot(QS.M2.dot(U1)) - U3 = U1.T.dot(QS.M3.dot(U1)) - U4 = QS.M1.dot(QS.opt_linear).dot(QS.cond_cov).dot(QS.opt_linear.T.dot(QS.M1.T.dot(U1))) - U5 = U1.T.dot(QS.M1.dot(QS.opt_linear)) - - prec_target_nosel = prec_target + U2 - U3 - - _P = -(U1.T.dot(QS.M1.dot(QS.observed_score)) + U2.dot(TS.observed_target)) - - bias_target = TS.cov_target.dot(U1.T.dot(-U4.dot(TS.observed_target) - + 
QS.M1.dot(QS.opt_linear.dot(QS.cond_mean))) - _P) - - return prec_target_nosel, bias_target, U3, U5 - - - - - - +def target_query_Interactspec(query_spec, + regress_target_score, + cov_target): + QS = query_spec + prec_target = np.linalg.inv(cov_target) + U1 = regress_target_score.T.dot(prec_target) + U2 = U1.T.dot(QS.M2.dot(U1)) + U3 = U1.T.dot(QS.M3.dot(U1)) + U4 = QS.M1.dot(QS.opt_linear).dot(QS.cond_cov).dot(QS.opt_linear.T.dot(QS.M1.T.dot(U1))) + U5 = U1.T.dot(QS.M1.dot(QS.opt_linear)) + return U1, U2, U3, U4, U5 From a63dae6daa1cfd04babb08ec81a161d84d2c4ac4 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Mon, 10 Jan 2022 16:18:36 -0800 Subject: [PATCH 176/187] WIP: screeening --- selectinf/randomized/screening.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/selectinf/randomized/screening.py b/selectinf/randomized/screening.py index 0b61626b0..1e24c73bf 100644 --- a/selectinf/randomized/screening.py +++ b/selectinf/randomized/screening.py @@ -134,10 +134,10 @@ def fit(self, perturb=None): A_scaling = -np.identity(len(active_signs)) b_scaling = np.zeros(self.num_opt_var) - self._setup_sampler(A_scaling, - b_scaling, - opt_linear, - observed_subgrad) + self._setup_sampler_data = (A_scaling, + b_scaling, + opt_linear, + observed_subgrad) return self._selected @@ -237,10 +237,10 @@ def fit(self, perturb=None): A_scaling = -np.identity(self.num_opt_var) b_scaling = np.zeros(self.num_opt_var) - self._setup_sampler(A_scaling, - b_scaling, - opt_linear, - observed_subgrad) + self._setup_sampler_data = (A_scaling, + b_scaling, + opt_linear, + observed_subgrad) else: self._selected = np.zeros(p, np.bool) return self._selected @@ -374,10 +374,10 @@ def fit(self, perturb=None): A_scaling = -np.identity(self.num_opt_var) b_scaling = -np.ones(self.num_opt_var) * lower_bound - self._setup_sampler(A_scaling, - b_scaling, - opt_linear, - observed_subgrad) + self._setup_sampler_data = (A_scaling, + b_scaling, + opt_linear, + observed_subgrad) return self._selected From a4818570252dde5d7a82e4a10fc023f1e179346b Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Mon, 10 Jan 2022 17:29:09 -0800 Subject: [PATCH 177/187] removed redundant interaction functions; have the U quantities computed in a class method --- selectinf/base.py | 22 +- selectinf/randomized/approx_reference.py | 278 +++++++++++++++++++- selectinf/randomized/base.py | 276 +------------------ selectinf/randomized/exact_reference.py | 2 +- selectinf/randomized/lasso.py | 12 +- selectinf/randomized/posterior_inference.py | 34 +-- selectinf/randomized/query.py | 26 +- selectinf/randomized/selective_MLE.py | 36 ++- selectinf/randomized/slope.py | 9 +- 9 files changed, 355 insertions(+), 340 deletions(-) diff --git a/selectinf/base.py b/selectinf/base.py index 9371d62d3..d2b9d9a1b 100644 --- a/selectinf/base.py +++ b/selectinf/base.py @@ -243,14 +243,14 @@ def _compute_hessian(loglike, _right = np.zeros((n, bool_idx.sum())) for i, j in enumerate(np.nonzero(bool_idx)[0]): _right[:,i] = loglike.saturated_loss.hessian_mult(linpred, - X[:,j], - case_weights=loglike.saturated_loss.case_weights) + X[:,j], + case_weights=loglike.saturated_loss.case_weights) parts.append(X.T.dot(_right)) _hessian = np.zeros_like(X) for i in range(X.shape[1]): _hessian[:,i] = loglike.saturated_loss.hessian_mult(linpred, - X[:,i], - case_weights=loglike.saturated_loss.case_weights) + X[:,i], + case_weights=loglike.saturated_loss.case_weights) _hessian = X.T.dot(_hessian) else: raise ValueError('saturated_loss has no 
hessian or hessian_mult method') @@ -270,17 +270,3 @@ def _pearsonX2(y, resid = y - loglike.saturated_loss.mean_function(linpred) return (resid ** 2 / W).sum() / (n - df_fit) -def target_query_Interactspec(query_spec, - regress_target_score, - cov_target): - - QS = query_spec - prec_target = np.linalg.inv(cov_target) - - U1 = regress_target_score.T.dot(prec_target) - U2 = U1.T.dot(QS.M2.dot(U1)) - U3 = U1.T.dot(QS.M3.dot(U1)) - U4 = QS.M1.dot(QS.opt_linear).dot(QS.cond_cov).dot(QS.opt_linear.T.dot(QS.M1.T.dot(U1))) - U5 = U1.T.dot(QS.M1.dot(QS.opt_linear)) - - return U1, U2, U3, U4, U5 \ No newline at end of file diff --git a/selectinf/randomized/approx_reference.py b/selectinf/randomized/approx_reference.py index dd27e98b3..6feaca6db 100644 --- a/selectinf/randomized/approx_reference.py +++ b/selectinf/randomized/approx_reference.py @@ -1,12 +1,286 @@ - from __future__ import division, print_function +from typing import NamedTuple import numpy as np, pandas as pd from scipy.interpolate import interp1d from ..distributions.discrete_family import discrete_family from ..algorithms.barrier_affine import solve_barrier_affine_py -from .base import grid_inference +from .selective_MLE import mle_inference +from .base import target_query_Interactspec + +class ConditionalSpec(NamedTuple): + + # description of (preselection) conditional law of + # targets \hat{\theta} | u, N + # if they were unbiased, then: + # 1) precision will agree with marginal variance + # 2) scalings will all be 1 + # 3) shifts will be 0 + + precision : np.ndarray + scalings : np.ndarray + shifts : np.ndarray + T : np.ndarray # what is T? + +class grid_inference(object): + + def __init__(self, + query_spec, + target_spec, + solve_args={'tol': 1.e-12}, + ngrid=1000): + + """ + Produce p-values and confidence intervals for targets + of model including selected features + Parameters + ---------- + query : `gaussian_query` + A Gaussian query which has information + to describe implied Gaussian. + observed_target : ndarray + Observed estimate of target. + cov_target : ndarray + Estimated covaraince of target. + cov_target_score : ndarray + Estimated covariance of target and score of randomized query. + solve_args : dict, optional + Arguments passed to solver. + """ + + self.query_spec = query_spec + self.target_spec = target_spec + self.solve_args = solve_args + self.ngrid = ngrid + + G = mle_inference(query_spec, + target_spec, + solve_args=solve_args) + + _, inverse_info, log_ref = G.solve_estimating_eqn() + + TS = target_spec + self.ntarget = ntarget = TS.cov_target.shape[0] + _scale = 4 * np.sqrt(np.diag(inverse_info)) + self.inverse_info = inverse_info + + self.stat_grid = np.zeros((ntarget, ngrid)) + for j in range(ntarget): + self.stat_grid[j, :] = np.linspace(TS.observed_target[j] - 1.5 * _scale[j], + TS.observed_target[j] + 1.5 * _scale[j], + num=ngrid) + + def summary(self, + alternatives=None, + parameter=None, + level=0.9): + """ + Produce p-values and confidence intervals for targets + of model including selected features + Parameters + ---------- + alternatives : [str], optional + Sequence of strings describing the alternatives, + should be values of ['twosided', 'less', 'greater'] + parameter : np.array + Hypothesized value for parameter -- defaults to 0. + level : float + Confidence level. 
+ """ + + TS = self.target_spec + + if parameter is not None: + pivots = self._pivots(parameter, + alternatives=alternatives) + else: + pivots = None + + pvalues = self._pivots(np.zeros_like(TS.observed_target), + alternatives=alternatives) + lower, upper = self._intervals(level=level) + + result = pd.DataFrame({'target': TS.observed_target, + 'pvalue': pvalues, + 'alternative': alternatives, + 'lower_confidence': lower, + 'upper_confidence': upper}) + + if not np.all(parameter == 0): + result.insert(4, 'pivot', pivots) + result.insert(5, 'parameter', parameter) + + return result + + def _approx_log_reference(self, + observed_target, + cov_target, + linear_coef, + grid): + + """ + Approximate the log of the reference density on a grid. + """ + if np.asarray(observed_target).shape in [(), (0,)]: + raise ValueError('no target specified') + + ref_hat = [] + solver = solve_barrier_affine_py + + for k in range(grid.shape[0]): + # in the usual D = N + Gamma theta.hat, + # regress_opt_target is "something" times Gamma, + # where "something" comes from implied Gaussian + # cond_mean is "something" times D + # Gamma is cov_target_score.T.dot(prec_target) + + cond_mean_grid = (linear_coef.dot(np.atleast_1d(grid[k] - observed_target)) + self.cond_mean) + conjugate_arg = self.cond_precision.dot(cond_mean_grid) + + val, _, _ = solver(conjugate_arg, + self.cond_precision, + self.observed_soln, + self.linear_part, + self.offset, + **self.solve_args) + + ref_hat.append(-val - (conjugate_arg.T.dot(self.cond_cov).dot(conjugate_arg) / 2.)) + + return np.asarray(ref_hat) + + def _pivots(self, + mean_parameter, + alternatives=None): + + TS = self.target_spec + + if not hasattr(self, "_families"): + self._construct_density() # generic + self._construct_families() # specific to the method + precs, S, r, _ = self.conditional_spec + + if alternatives is None: + alternatives = ['twosided'] * self.ntarget + + pivot = [] + + for m in range(self.ntarget): + + family = self._families[m] + var_target = 1. / (precs[m][0, 0]) + + mean = S[m].dot(mean_parameter[m].reshape((1,))) + r[m] + # construction of pivot from families follows `selectinf.learning.core` + + _cdf = family.cdf((mean[0] - TS.observed_target[m]) / var_target, x=TS.observed_target[m]) + + if alternatives[m] == 'twosided': + pivot.append(2 * min(_cdf, 1 - _cdf)) + elif alternatives[m] == 'greater': + pivot.append(1 - _cdf) + elif alternatives[m] == 'less': + pivot.append(_cdf) + else: + raise ValueError('alternative should be in ["twosided", "less", "greater"]') + return pivot # , self._log_ref + + def _intervals(self, + level=0.9): + + TS = self.target_spec + + if not hasattr(self, "_families"): + self._construct_density() # generic + self._construct_families() # specific to the method + + precs, S, r, _ = self.conditional_spec + + lower, upper = [], [] + + for m in range(self.ntarget): + # construction of intervals from families follows `selectinf.learning.core` + family = self._families[m] + observed_target = TS.observed_target[m] + unbiased_est = (observed_target - r[m][0]) * (1./(S[m][0,0])) + + _l, _u = family.equal_tailed_interval(observed_target, + alpha=1 - level) + l = _l * (1./(S[m][0,0])) + u = _u * (1./(S[m][0,0])) + + var_target = 1. / (precs[m][0, 0]) + + lower.append(l * var_target + unbiased_est) + upper.append(u * var_target + unbiased_est) + + return np.asarray(lower), np.asarray(upper) + + ### Private method + + def _construct_density(self): + """ + What is this method doing? 
+ """ + + TS = self.target_spec + QS = self.query_spec + + precs = [] + S = [] + r = [] + T = [] + + p = TS.regress_target_score.shape[1] + + for m in range(self.ntarget): + observed_target_uni = (TS.observed_target[m]).reshape((1,)) + cov_target_uni = (np.diag(TS.cov_target)[m]).reshape((1, 1)) + regress_target_score_uni = TS.regress_target_score[m, :].reshape((1, p)) + + U1, U2, U3, U4, U5 = self._form_interaction_pieces(QS, + regress_target_score_uni, + cov_target_uni) + + prec_target = 1. / cov_target_uni + + # JT: what is _T? + _T = QS.cond_cov.dot(U5.T) + + prec_target_nosel = prec_target + U2 - U3 + + _P = -(U1.T.dot(QS.M5) + U2.dot(observed_target_uni)) + + bias_target = cov_target_uni.dot( + U1.T.dot(-U4.dot(observed_target_uni) + QS.M4.dot(QS.cond_mean)) - _P) + + _r = np.linalg.inv(prec_target_nosel).dot(prec_target.dot(bias_target)) + _S = np.linalg.inv(prec_target_nosel).dot(prec_target) + + S.append(_S) + r.append(_r) + precs.append(prec_target_nosel) + T.append(_T) + + self.conditional_spec = ConditionalSpec(np.array(precs), + np.array(S), + np.array(r), + np.array(T) # what is T here? + ) + + return self.conditional_spec + + # Private + + def _form_interaction_pieces(self, + QS, + regress_target_score, + cov_target): + + return target_query_Interactspec(QS, + regress_target_score, + cov_target) + class approximate_grid_inference(grid_inference): diff --git a/selectinf/randomized/base.py b/selectinf/randomized/base.py index cdaf21ca5..5a25ff11e 100644 --- a/selectinf/randomized/base.py +++ b/selectinf/randomized/base.py @@ -1,271 +1,19 @@ -from typing import NamedTuple -import numpy as np, pandas as pd +import numpy as np -from .selective_MLE import mle_inference -from ..base import target_query_Interactspec +def target_query_Interactspec(query_spec, + regress_target_score, + cov_target): -class ConditionalSpec(NamedTuple): - - # description of (preselection) conditional law of - # targets \hat{\theta} | u, N - # if they were unbiased, then: - # 1) precision will agree with marginal variance - # 2) scalings will all be 1 - # 3) shifts will be 0 - - precision : np.ndarray - scalings : np.ndarray - shifts : np.ndarray - T : np.ndarray # what is T? - -class grid_inference(object): - - def __init__(self, - query_spec, - target_spec, - solve_args={'tol': 1.e-12}, - ngrid=1000): - - """ - Produce p-values and confidence intervals for targets - of model including selected features - Parameters - ---------- - query : `gaussian_query` - A Gaussian query which has information - to describe implied Gaussian. - observed_target : ndarray - Observed estimate of target. - cov_target : ndarray - Estimated covaraince of target. - cov_target_score : ndarray - Estimated covariance of target and score of randomized query. - solve_args : dict, optional - Arguments passed to solver. 
- """ - - self.query_spec = query_spec - self.target_spec = target_spec - self.solve_args = solve_args - self.ngrid = ngrid - - G = mle_inference(query_spec, - target_spec, - solve_args=solve_args) - - _, inverse_info, log_ref = G.solve_estimating_eqn() - - TS = target_spec - self.ntarget = ntarget = TS.cov_target.shape[0] - _scale = 4 * np.sqrt(np.diag(inverse_info)) - self.inverse_info = inverse_info - - self.stat_grid = np.zeros((ntarget, ngrid)) - for j in range(ntarget): - self.stat_grid[j, :] = np.linspace(TS.observed_target[j] - 1.5 * _scale[j], - TS.observed_target[j] + 1.5 * _scale[j], - num=ngrid) - - def summary(self, - alternatives=None, - parameter=None, - level=0.9): - """ - Produce p-values and confidence intervals for targets - of model including selected features - Parameters - ---------- - alternatives : [str], optional - Sequence of strings describing the alternatives, - should be values of ['twosided', 'less', 'greater'] - parameter : np.array - Hypothesized value for parameter -- defaults to 0. - level : float - Confidence level. - """ - - TS = self.target_spec - - if parameter is not None: - pivots = self._pivots(parameter, - alternatives=alternatives) - else: - pivots = None - - pvalues = self._pivots(np.zeros_like(TS.observed_target), - alternatives=alternatives) - lower, upper = self._intervals(level=level) - - result = pd.DataFrame({'target': TS.observed_target, - 'pvalue': pvalues, - 'alternative': alternatives, - 'lower_confidence': lower, - 'upper_confidence': upper}) - - if not np.all(parameter == 0): - result.insert(4, 'pivot', pivots) - result.insert(5, 'parameter', parameter) - - return result - - def _approx_log_reference(self, - observed_target, - cov_target, - linear_coef, - grid): - - """ - Approximate the log of the reference density on a grid. - """ - if np.asarray(observed_target).shape in [(), (0,)]: - raise ValueError('no target specified') - - ref_hat = [] - solver = solve_barrier_affine_py - - for k in range(grid.shape[0]): - # in the usual D = N + Gamma theta.hat, - # regress_opt_target is "something" times Gamma, - # where "something" comes from implied Gaussian - # cond_mean is "something" times D - # Gamma is cov_target_score.T.dot(prec_target) - - cond_mean_grid = (linear_coef.dot(np.atleast_1d(grid[k] - observed_target)) + self.cond_mean) - conjugate_arg = self.cond_precision.dot(cond_mean_grid) - - val, _, _ = solver(conjugate_arg, - self.cond_precision, - self.observed_soln, - self.linear_part, - self.offset, - **self.solve_args) - - ref_hat.append(-val - (conjugate_arg.T.dot(self.cond_cov).dot(conjugate_arg) / 2.)) - - return np.asarray(ref_hat) - - def _pivots(self, - mean_parameter, - alternatives=None): - - TS = self.target_spec - - if not hasattr(self, "_families"): - self._construct_density() # generic - self._construct_families() # specific to the method - precs, S, r, _ = self.conditional_spec - - if alternatives is None: - alternatives = ['twosided'] * self.ntarget - - pivot = [] - - for m in range(self.ntarget): - - family = self._families[m] - var_target = 1. 
/ (precs[m][0, 0]) - - mean = S[m].dot(mean_parameter[m].reshape((1,))) + r[m] - # construction of pivot from families follows `selectinf.learning.core` - - _cdf = family.cdf((mean[0] - TS.observed_target[m]) / var_target, x=TS.observed_target[m]) - - if alternatives[m] == 'twosided': - pivot.append(2 * min(_cdf, 1 - _cdf)) - elif alternatives[m] == 'greater': - pivot.append(1 - _cdf) - elif alternatives[m] == 'less': - pivot.append(_cdf) - else: - raise ValueError('alternative should be in ["twosided", "less", "greater"]') - return pivot # , self._log_ref - - def _intervals(self, - level=0.9): - - TS = self.target_spec - - if not hasattr(self, "_families"): - self._construct_density() # generic - self._construct_families() # specific to the method - - precs, S, r, _ = self.conditional_spec - - lower, upper = [], [] - - for m in range(self.ntarget): - # construction of intervals from families follows `selectinf.learning.core` - family = self._families[m] - observed_target = TS.observed_target[m] - unbiased_est = (observed_target - r[m][0]) * (1./(S[m][0,0])) - - _l, _u = family.equal_tailed_interval(observed_target, - alpha=1 - level) - l = _l * (1./(S[m][0,0])) - u = _u * (1./(S[m][0,0])) - - var_target = 1. / (precs[m][0, 0]) - - lower.append(l * var_target + unbiased_est) - upper.append(u * var_target + unbiased_est) - - return np.asarray(lower), np.asarray(upper) - - ### Private method - - def _construct_density(self): - """ - What is this method doing? - """ - - TS = self.target_spec - QS = self.query_spec - - precs = [] - S = [] - r = [] - T = [] - - p = TS.regress_target_score.shape[1] - - for m in range(self.ntarget): - observed_target_uni = (TS.observed_target[m]).reshape((1,)) - cov_target_uni = (np.diag(TS.cov_target)[m]).reshape((1, 1)) - regress_target_score_uni = TS.regress_target_score[m, :].reshape((1, p)) - - U1, U2, U3, U4, U5 = target_query_Interactspec(QS, - regress_target_score_uni, - cov_target_uni) - - prec_target = 1. / cov_target_uni - - # JT: what is _T? - _T = QS.cond_cov.dot(U5.T) - - prec_target_nosel = prec_target + U2 - U3 - - _P = -(U1.T.dot(QS.M1.dot(QS.observed_score)) + U2.dot(observed_target_uni)) - - bias_target = cov_target_uni.dot( - U1.T.dot(-U4.dot(observed_target_uni) + QS.M1.dot(QS.opt_linear.dot(QS.cond_mean))) - _P) - - _r = np.linalg.inv(prec_target_nosel).dot(prec_target.dot(bias_target)) - _S = np.linalg.inv(prec_target_nosel).dot(prec_target) - - S.append(_S) - r.append(_r) - precs.append(prec_target_nosel) - T.append(_T) - - self.conditional_spec = ConditionalSpec(np.array(precs), - np.array(S), - np.array(r), - np.array(T) # what is T here? 
- ) - - return self.conditional_spec + QS = query_spec + prec_target = np.linalg.inv(cov_target) + U1 = regress_target_score.T.dot(prec_target) + U2 = U1.T.dot(QS.M2.dot(U1)) + U3 = U1.T.dot(QS.M3.dot(U1)) + U5 = U1.T.dot(QS.M4) + U4 = QS.M4.dot(QS.cond_cov).dot(U5.T) + return U1, U2, U3, U4, U5 diff --git a/selectinf/randomized/exact_reference.py b/selectinf/randomized/exact_reference.py index dbc7711da..ebc8cbd26 100644 --- a/selectinf/randomized/exact_reference.py +++ b/selectinf/randomized/exact_reference.py @@ -4,7 +4,7 @@ from scipy.stats import norm as ndist from ..distributions.discrete_family import discrete_family -from .base import grid_inference +from .approx_reference import grid_inference class exact_grid_inference(grid_inference): diff --git a/selectinf/randomized/lasso.py b/selectinf/randomized/lasso.py index 6f71819d0..e37023a9b 100644 --- a/selectinf/randomized/lasso.py +++ b/selectinf/randomized/lasso.py @@ -789,27 +789,33 @@ def _setup_implied_gaussian(self, cond_mean = regress_opt.dot(self.observed_score_state + observed_subgrad) ## probably missing a dispersion in the denominator + # this might be too big -- use a linear_transform instead prod_score_prec_unnorm = np.identity(self.nfeature) / (dispersion * ratio) ## probably missing a multiplicative factor of ratio cov_rand = self._unscaled_cov_score * (dispersion * ratio) M1 = prod_score_prec_unnorm * dispersion + M4 = M1.dot(opt_linear) M2 = M1.dot(cov_rand).dot(M1.T) - M3 = M1.dot(opt_linear.dot(cond_cov).dot(opt_linear.T)).dot(M1.T) + M3 = M4.dot(cond_cov).dot(M4.T) # would be nice to not store these? self.M1 = M1 self.M2 = M2 self.M3 = M3 - + self.M4 = M4 + self.M5 = M1.dot(self.observed_score_state + observed_subgrad) + return (cond_mean, cond_cov, cond_precision, M1, M2, - M3) + M3, + self.M4, + self.M5) def _solve_randomized_problem(self, # optional binary vector diff --git a/selectinf/randomized/posterior_inference.py b/selectinf/randomized/posterior_inference.py index 256a5ae78..1dd16572f 100644 --- a/selectinf/randomized/posterior_inference.py +++ b/selectinf/randomized/posterior_inference.py @@ -8,7 +8,7 @@ from ..algorithms.barrier_affine import solve_barrier_affine_py from .selective_MLE import mle_inference -from ..base import target_query_Interactspec +from .base import target_query_Interactspec class PosteriorAtt(typing.NamedTuple): @@ -132,19 +132,19 @@ def _get_marginal_parameters(self): QS = self.query_spec TS = self.target_spec - U1, U2, U3, U4, U5 = target_query_Interactspec(QS, - TS.regress_target_score, - TS.cov_target) + U1, U2, U3, U4, U5 = self._form_interaction_pieces(QS, + TS.regress_target_score, + TS.cov_target) prec_target = np.linalg.inv(TS.cov_target) cond_precision = np.linalg.inv(QS.cond_cov) prec_target_nosel = prec_target + U2 - U3 - _P = -(U1.T.dot(QS.M1.dot(QS.observed_score)) + U2.dot(TS.observed_target)) + _P = -(U1.T.dot(QS.M5) + U2.dot(TS.observed_target)) bias_target = TS.cov_target.dot(U1.T.dot(-U4.dot(TS.observed_target) + - QS.M1.dot(QS.opt_linear.dot(QS.cond_mean))) - _P) + QS.M4.dot(QS.cond_mean)) - _P) ###set parameters for the marginal distribution of optimization variables @@ -165,6 +165,14 @@ def _get_marginal_parameters(self): S, prec_target_nosel) + def _form_interaction_pieces(self, + QS, + regress_target_score, + cov_target): + + return target_query_Interactspec(QS, + regress_target_score, + cov_target) ### sampling methods def langevin_sampler(selective_posterior, @@ -283,17 +291,3 @@ def __next__(self): return self.state -def 
target_query_Interactspec(query_spec, - regress_target_score, - cov_target): - - QS = query_spec - prec_target = np.linalg.inv(cov_target) - - U1 = regress_target_score.T.dot(prec_target) - U2 = U1.T.dot(QS.M2.dot(U1)) - U3 = U1.T.dot(QS.M3.dot(U1)) - U4 = QS.M1.dot(QS.opt_linear).dot(QS.cond_cov).dot(QS.opt_linear.T.dot(QS.M1.T.dot(U1))) - U5 = U1.T.dot(QS.M1.dot(QS.opt_linear)) - - return U1, U2, U3, U4, U5 diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index f67ba3ec1..32c86d0a0 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -25,10 +25,11 @@ class QuerySpec(NamedTuple): # score / randomization relationship - M1 : np.ndarray M2 : np.ndarray M3 : np.ndarray - + M4 : np.ndarray + M5 : np.ndarray + # observed values observed_opt_state : np.ndarray @@ -74,9 +75,10 @@ def specification(self): opt_linear=self.opt_linear, linear_part=self.affine_con.linear_part, offset=self.affine_con.offset, - M1=self.M1, M2=self.M2, M3=self.M3, + M4=self.M4, + M5=self.M5, observed_opt_state=self.observed_opt_state, observed_score_state=self.observed_score_state, observed_subgrad=self.observed_subgrad, @@ -136,12 +138,9 @@ def _setup_sampler(self, (cond_mean, cond_cov, - cond_precision, - M1, - M2, - M3) = self._setup_implied_gaussian(opt_linear, + cond_precision) = self._setup_implied_gaussian(opt_linear, observed_subgrad, - dispersion=dispersion) + dispersion=dispersion)[:3] self.cond_mean, self.cond_cov = cond_mean, cond_cov @@ -181,18 +180,23 @@ def _setup_implied_gaussian(self, M1 = prod_score_prec_unnorm * dispersion M2 = M1.dot(cov_rand).dot(M1.T) - M3 = M1.dot(opt_linear.dot(cond_cov).dot(opt_linear.T)).dot(M1.T) + M4 = M1.dot(opt_linear) + M3 = M4.dot(cond_cov).dot(M4.T) self.M1 = M1 self.M2 = M2 self.M3 = M3 - + self.M4 = M4 + self.M5 = M1.dot(self.observed_score_state + observed_subgrad) + return (cond_mean, cond_cov, cond_precision, M1, M2, - M3) + M3, + self.M4, + self.M5) def inference(self, target_spec, diff --git a/selectinf/randomized/selective_MLE.py b/selectinf/randomized/selective_MLE.py index 0fff47de6..cc7aed4a2 100644 --- a/selectinf/randomized/selective_MLE.py +++ b/selectinf/randomized/selective_MLE.py @@ -2,9 +2,10 @@ import numpy as np, pandas as pd from scipy.stats import norm as ndist -from .selective_MLE_utils import solve_barrier_affine as solve_barrier_affine_C from ..algorithms.barrier_affine import solve_barrier_affine_py -from ..base import target_query_Interactspec + +from .selective_MLE_utils import solve_barrier_affine as solve_barrier_affine_C +from .base import target_query_Interactspec class mle_inference(object): @@ -25,18 +26,18 @@ def solve_estimating_eqn(self, QS = self.query_spec TS = self.target_spec - U1, U2, U3, U4, U5= target_query_Interactspec(QS, - TS.regress_target_score, - TS.cov_target) + U1, U2, U3, U4, U5 = self._form_interaction_pieces(QS, + TS.regress_target_score, + TS.cov_target) prec_target = np.linalg.inv(TS.cov_target) prec_target_nosel = prec_target + U2 - U3 - _P = -(U1.T.dot(QS.M1.dot(QS.observed_score)) + U2.dot(TS.observed_target)) + _P = -(U1.T.dot(QS.M5) + U2.dot(TS.observed_target)) bias_target = TS.cov_target.dot(U1.T.dot(-U4.dot(TS.observed_target) - + QS.M1.dot(QS.opt_linear.dot(QS.cond_mean))) - _P) + + QS.M4.dot(QS.cond_mean)) - _P) cond_precision = np.linalg.inv(QS.cond_cov) conjugate_arg = cond_precision.dot(QS.cond_mean) @@ -54,7 +55,7 @@ def solve_estimating_eqn(self, **self.solve_args) final_estimator = TS.cov_target.dot(prec_target_nosel).dot(TS.observed_target) \ - 
+ TS.regress_target_score.dot(QS.M1.dot(QS.opt_linear)).dot(QS.cond_mean - soln) \ + + TS.regress_target_score.dot(QS.M4).dot(QS.cond_mean - soln) \ - bias_target observed_info_natural = prec_target_nosel + U3 - U5.dot(hess.dot(U5.T)) @@ -101,17 +102,14 @@ def solve_estimating_eqn(self, return result, observed_info_mean, log_ref -def target_query_Interactspec(query_spec, - regress_target_score, - cov_target): + # Private - QS = query_spec - prec_target = np.linalg.inv(cov_target) + def _form_interaction_pieces(self, + QS, + regress_target_score, + cov_target): - U1 = regress_target_score.T.dot(prec_target) - U2 = U1.T.dot(QS.M2.dot(U1)) - U3 = U1.T.dot(QS.M3.dot(U1)) - U4 = QS.M1.dot(QS.opt_linear).dot(QS.cond_cov).dot(QS.opt_linear.T.dot(QS.M1.T.dot(U1))) - U5 = U1.T.dot(QS.M1.dot(QS.opt_linear)) + return target_query_Interactspec(QS, + regress_target_score, + cov_target) - return U1, U2, U3, U4, U5 diff --git a/selectinf/randomized/slope.py b/selectinf/randomized/slope.py index 3015c8259..c8c53b9bf 100644 --- a/selectinf/randomized/slope.py +++ b/selectinf/randomized/slope.py @@ -331,20 +331,25 @@ def _setup_implied_gaussian(self, M1 = prod_score_prec_unnorm * dispersion M2 = M1.dot(cov_rand).dot(M1.T) - M3 = M1.dot(opt_linear.dot(cond_cov).dot(opt_linear.T)).dot(M1.T) + M4 = M1.dot(opt_linear) + M3 = M4.dot(cond_cov).dot(M4.T) # would be nice to not store these? self.M1 = M1 self.M2 = M2 self.M3 = M3 + self.M4 = M4 + self.M5 = M1.dot(self.observed_score_state + observed_subgrad) return (cond_mean, cond_cov, cond_precision, M1, M2, - M3) + M3, + self.M4, + self.M5) def _solve_randomized_problem(self, # optional binary vector From a425869b1cf301ece6731c991b6777db4a64e1a8 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Tue, 13 Dec 2022 16:22:32 -0800 Subject: [PATCH 178/187] bootstrap lasso version --- selectinf/randomized/approx_reference.py | 3 +- .../randomized/approx_reference_grouplasso.py | 4 +- selectinf/randomized/exact_reference.py | 4 +- selectinf/randomized/lasso.py | 83 +++++++++++++++++++ selectinf/randomized/query.py | 6 +- 5 files changed, 92 insertions(+), 8 deletions(-) diff --git a/selectinf/randomized/approx_reference.py b/selectinf/randomized/approx_reference.py index 6feaca6db..81f907e13 100644 --- a/selectinf/randomized/approx_reference.py +++ b/selectinf/randomized/approx_reference.py @@ -343,7 +343,8 @@ def _approx_log_reference(self, # cond_mean is "something" times D # Gamma is cov_target_score.T.dot(prec_target) - cond_mean_grid = (linear_coef.dot(np.atleast_1d(grid[k] - observed_target)) + QS.cond_mean) + cond_mean_grid = (linear_coef.dot(np.atleast_1d(grid[k] - observed_target)) + + QS.cond_mean) conjugate_arg = cond_precision.dot(cond_mean_grid) val, _, _ = solver(conjugate_arg, diff --git a/selectinf/randomized/approx_reference_grouplasso.py b/selectinf/randomized/approx_reference_grouplasso.py index 5d90e981b..acd9bf811 100644 --- a/selectinf/randomized/approx_reference_grouplasso.py +++ b/selectinf/randomized/approx_reference_grouplasso.py @@ -546,8 +546,8 @@ def log_reference(self, eta = self.prec_opt.dot(self.regress_opt.dot(cov_target_score.T)) - implied_mean = np.asscalar(eta.T.dot(cond_mean_grid)) - implied_cov = np.asscalar(eta.T.dot(self.cond_cov).dot(eta)) + implied_mean = (eta.T.dot(cond_mean_grid)).item() + implied_cov = (eta.T.dot(self.cond_cov).dot(eta)).item() implied_prec = 1./implied_cov _A = self.cond_cov.dot(eta) * implied_prec diff --git a/selectinf/randomized/exact_reference.py b/selectinf/randomized/exact_reference.py index 
ebc8cbd26..209c40c97 100644 --- a/selectinf/randomized/exact_reference.py +++ b/selectinf/randomized/exact_reference.py @@ -40,8 +40,8 @@ def log_reference(self, eta = cond_precision.dot(linear_coef).dot(cov_target) - implied_mean = np.asscalar(eta.T.dot(cond_mean_grid)) - implied_cov = np.asscalar(eta.T.dot(QS.cond_cov).dot(eta)) + implied_mean = (eta.T.dot(cond_mean_grid)).item() + implied_cov = (eta.T.dot(QS.cond_cov).dot(eta)).item() implied_prec = 1./implied_cov _A = QS.cond_cov.dot(eta) * implied_prec diff --git a/selectinf/randomized/lasso.py b/selectinf/randomized/lasso.py index e37023a9b..0c87a0524 100644 --- a/selectinf/randomized/lasso.py +++ b/selectinf/randomized/lasso.py @@ -264,6 +264,89 @@ def _solve_randomized_problem(self, return observed_soln, observed_subgrad + @staticmethod + def fromsample(samples, + feature_weights, + proportion_select=0.5, + estimator=None, + covariance=None): + r""" + Squared-error LASSO with feature weights. + Objective function is (before randomization) + + .. math:: + + \beta \mapsto \frac{1}{2} (\beta-\hat{\beta})'\hat{\Sigma}^{-1}(\beta-\hat{\beta}) + \sum_{i=1}^p \lambda_i |\beta_i| + + where $\lambda$ is `feature_weights`, $\hat{\beta}$` is the row mean + of `samples` and $\hat{\Sigma}$ is its sample covariance. + + Parameters + ---------- + + samples : ndarray + Shape (B,p) -- the sample data matrix (e.g. bootstrap samples) + + feature_weights: [float, sequence] + Penalty weights. An intercept, or other unpenalized + features are handled by setting those entries of + `feature_weights` to 0. If `feature_weights` is + a float, then all parameters are penalized equally. + + quadratic : `regreg.identity_quadratic.identity_quadratic` (optional) + An optional quadratic term to be added to the objective. + Can also be a linear term by setting quadratic + coefficient to 0. + + ridge_term : float + How big a ridge term to add? + + randomizer_scale : float + Scale for IID components of randomizer. + + Returns + ------- + + L : `selection.randomized.lasso.lasso` + + """ + + samples = np.asarray(samples) + B, p = samples.shape + + if estimator is None: + estimator = samples.mean(0) + if covariance is None: + covariance = np.cov(samples.T) + + U, D, V = np.linalg.svd(covariance) + + sqrt_prec = U / np.sqrt(D)[None,:] + sqrt_prec = sqrt_prec.dot(U.T) + prec = sqrt_prec.dot(sqrt_prec.T) + np.testing.assert_allclose(prec, np.linalg.inv(covariance)) + Y = prec.dot(estimator) + + loglike = rr.glm.gaussian(sqrt_prec, + Y, + coef=1., + quadratic=None) + + # proportion should be used somewhere here... + + multiplier = 1 / proportion_select - 1 + randomizer = randomization.gaussian(prec * multiplier) + + idx = np.random.choice(B, 1)[0] + perturb = (samples[idx] - estimator) * np.sqrt(multiplier) + return (lasso(loglike, + np.asarray(feature_weights), + 0, + randomizer, + perturb=perturb), + perturb) + + @staticmethod def gaussian(X, Y, diff --git a/selectinf/randomized/query.py b/selectinf/randomized/query.py index 32c86d0a0..d9a2a83cb 100644 --- a/selectinf/randomized/query.py +++ b/selectinf/randomized/query.py @@ -16,7 +16,7 @@ class QuerySpec(NamedTuple): # how S enters into E[o|S,u] - opt_linear : np.ndarray + opt_linear : np.ndarray # not sure if needed -- absorbed into M4,M5? 
# constraints @@ -139,8 +139,8 @@ def _setup_sampler(self, (cond_mean, cond_cov, cond_precision) = self._setup_implied_gaussian(opt_linear, - observed_subgrad, - dispersion=dispersion)[:3] + observed_subgrad, + dispersion=dispersion)[:3] self.cond_mean, self.cond_cov = cond_mean, cond_cov From 74369baf55e63ae62826308b2530bc9f4a8d7238 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Tue, 25 Apr 2023 16:30:22 -0700 Subject: [PATCH 179/187] updating docs --- .readthedocs.yml | 7 +++++-- doc-requirements.txt => doc/requirements.txt | 7 ++++--- doc/source/conf.py | 21 +++++++++++++++----- 3 files changed, 25 insertions(+), 10 deletions(-) rename doc-requirements.txt => doc/requirements.txt (72%) diff --git a/.readthedocs.yml b/.readthedocs.yml index bbfd45f45..8418b6d1e 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -18,12 +18,15 @@ sphinx: #formats: all # Optionally set the version of Python and requirements required to build your docs + python: - version: 3.6 + version: 3.9 install: - requirements: requirements.txt - - requirements: doc-requirements.txt + - requirements: doc/requirements.txt - method: setuptools path: . +submodules: + include: all \ No newline at end of file diff --git a/doc-requirements.txt b/doc/requirements.txt similarity index 72% rename from doc-requirements.txt rename to doc/requirements.txt index ab7ed399c..9833fbac5 100644 --- a/doc-requirements.txt +++ b/doc/requirements.txt @@ -1,6 +1,6 @@ # Requirements for building docs # Check these dependencies against doc/conf.py --r dev-requirements.txt +-r ../dev-requirements.txt sphinx>=1.4 numpydoc matplotlib @@ -8,8 +8,9 @@ texext nb2plots seaborn statsmodels -tensorflow +#tensorflow keras nbsphinx jupytext -sphinx_rtd_theme +sphinx-book-theme +myst_nb diff --git a/doc/source/conf.py b/doc/source/conf.py index addf6895c..5ab98cea8 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -44,7 +44,7 @@ 'sphinx_rtd_theme', 'texext.math_dollar', 'numpydoc', - 'nbsphinx' + 'myst_nb' ] # Current version (as of 11/2010) of numpydoc is only compatible with sphinx > @@ -118,16 +118,27 @@ # must exist either in Sphinx' static/ path, or in one of the custom paths # given in html_static_path. +# -- Options for HTML output + +html_theme = "sphinx_book_theme" html_theme_options = { - 'logo_only': True + "repository_url": "https://github.com/jonathan-taylor/selectinf.git", + "use_repository_button": True, +} +html_title = "Introduction to Statistical Learning (Python)" +html_logo = "logo.png" + +source_suffix = { + '.rst': 'restructuredtext', + '.ipynb': 'myst-nb', + '.myst': 'myst-nb', } -html_theme_path = ["../.."] -html_logo = "_static/logo.png" + html_show_sourcelink = True # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". -html_title = 'Selection Documentation' +html_title = 'Selection Inference Documentation' # The name of an image file (within the static path) to place at the top of # the sidebar. 
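
The `M4`/`M5` bookkeeping introduced in the patches above is pure caching: `M4 = M1.dot(opt_linear)` and `M5 = M1.dot(observed_score_state + observed_subgrad)`, so the rewritten `target_query_Interactspec` should reproduce the `U4`/`U5` of the deleted version. A minimal sketch of that check, using random stand-ins for the query matrices (only the formulas are quoted from the diffs; the dimensions and values below are made up):

```python
# Sketch: check that the U4/U5 expressions written in terms of M4 agree with
# the older expressions written in terms of M1 and opt_linear.  All matrices
# are random stand-ins; only the formulas come from the patch.
import numpy as np

rng = np.random.default_rng(1)
p, q, k = 6, 4, 3                          # score dim, # opt variables, # targets

M1 = rng.standard_normal((p, p))
opt_linear = rng.standard_normal((p, q))
A = rng.standard_normal((q, q))
cond_cov = A.dot(A.T)                      # positive definite stand-in
U1 = rng.standard_normal((p, k))           # plays the role of regress_target_score.T @ prec_target

M4 = M1.dot(opt_linear)

# old definitions (deleted from base.py / posterior_inference.py / selective_MLE.py)
U5_old = U1.T.dot(M1.dot(opt_linear))
U4_old = M1.dot(opt_linear).dot(cond_cov).dot(opt_linear.T.dot(M1.T.dot(U1)))

# new definitions (added in randomized/base.py)
U5_new = U1.T.dot(M4)
U4_new = M4.dot(cond_cov).dot(U5_new.T)

assert np.allclose(U5_old, U5_new)
assert np.allclose(U4_old, U4_new)
```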
From 4aee7ad1a06ecbdf9275e0ee776a21537de1b222 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Tue, 25 Apr 2023 16:31:23 -0700 Subject: [PATCH 180/187] update requirements, remove some np.float --- requirements.txt | 1 + selectinf/distributions/discrete_family.py | 6 +++--- selectinf/sampling/truncnorm.pyx | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index 3ab08e8a6..efe900f71 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,4 @@ regreg # keras # tensorflow traitlets +scikit-learn diff --git a/selectinf/distributions/discrete_family.py b/selectinf/distributions/discrete_family.py index 6bdf10f55..7b96476db 100644 --- a/selectinf/distributions/discrete_family.py +++ b/selectinf/distributions/discrete_family.py @@ -25,7 +25,7 @@ def crit_func(test_statistic, left_cut, right_cut): Parameters ---------- - test_statistic : np.float + test_statistic : float Observed value of test statistic. left_cut : (float, float) @@ -37,7 +37,7 @@ def crit_func(test_statistic, left_cut, right_cut): Returns ------- - decision : np.float + decision : float """ CL, gammaL = left_cut @@ -80,7 +80,7 @@ def __init__(self, sufficient_stat, weights, theta=0.): The weights are normalized to sum to 1. """ - xw = np.array(sorted(zip(sufficient_stat, weights)), np.float) + xw = np.array(sorted(zip(sufficient_stat, weights)), float) self._x = xw[:,0] self._w = xw[:,1] self._lw = np.log(xw[:,1]) diff --git a/selectinf/sampling/truncnorm.pyx b/selectinf/sampling/truncnorm.pyx index a9d415a1e..04cb2bbe8 100644 --- a/selectinf/sampling/truncnorm.pyx +++ b/selectinf/sampling/truncnorm.pyx @@ -15,9 +15,9 @@ This module has a code to sample from a truncated normal distribution specified by a set of affine constraints. 
""" -DTYPE_float = np.float +DTYPE_float = float ctypedef cnp.float_t DTYPE_float_t -DTYPE_int = np.int +DTYPE_int = int ctypedef cnp.int_t DTYPE_int_t ctypedef cnp.intp_t DTYPE_intp_t From 14f022d5967c7cd57db4222320f07174c345aafc Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Tue, 25 Apr 2023 16:33:00 -0700 Subject: [PATCH 181/187] update python version for readthedocs --- .readthedocs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.readthedocs.yml b/.readthedocs.yml index 8418b6d1e..96ad0c8eb 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -20,7 +20,7 @@ sphinx: # Optionally set the version of Python and requirements required to build your docs python: - version: 3.9 + version: 3.8 install: - requirements: requirements.txt - requirements: doc/requirements.txt From 5552303f83aa939f1932d9a8c09ded107eea0a0e Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Tue, 25 Apr 2023 16:36:14 -0700 Subject: [PATCH 182/187] adding regreg as a submodule --- .gitmodules | 3 +++ regreg | 1 + 2 files changed, 4 insertions(+) create mode 160000 regreg diff --git a/.gitmodules b/.gitmodules index 134b4cb57..9f883b6df 100644 --- a/.gitmodules +++ b/.gitmodules @@ -5,3 +5,6 @@ path = C-software url = https://github.com/selective-inference/C-software.git +[submodule "regreg"] + path = regreg + url = https://github.com/jonathan-taylor/regreg.git diff --git a/regreg b/regreg new file mode 160000 index 000000000..1e411d1c8 --- /dev/null +++ b/regreg @@ -0,0 +1 @@ +Subproject commit 1e411d1c8edfae9d96c7247b19af2b7a7094f345 From 57da65973a31bf02658e9908c0441c5cd0690466 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Tue, 25 Apr 2023 16:36:41 -0700 Subject: [PATCH 183/187] trying to build regreg first --- .readthedocs.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.readthedocs.yml b/.readthedocs.yml index 96ad0c8eb..bec99bda9 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -22,6 +22,8 @@ sphinx: python: version: 3.8 install: + - method: setuptools + path: regreg - requirements: requirements.txt - requirements: doc/requirements.txt - method: setuptools From f503c5cc0cc867de04d54fcc2e47972480b77168 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Tue, 25 Apr 2023 16:38:06 -0700 Subject: [PATCH 184/187] trying path of regreg --- .readthedocs.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.readthedocs.yml b/.readthedocs.yml index bec99bda9..28ab7b116 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -22,8 +22,8 @@ sphinx: python: version: 3.8 install: - - method: setuptools - path: regreg + - method: pip + path: ./regreg - requirements: requirements.txt - requirements: doc/requirements.txt - method: setuptools From 6d2260dcd452d35c480cda03c08bc7ca5ddd8176 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Tue, 25 Apr 2023 16:39:40 -0700 Subject: [PATCH 185/187] trying again --- .readthedocs.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.readthedocs.yml b/.readthedocs.yml index 28ab7b116..96ad0c8eb 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -22,8 +22,6 @@ sphinx: python: version: 3.8 install: - - method: pip - path: ./regreg - requirements: requirements.txt - requirements: doc/requirements.txt - method: setuptools From 456dcb803197b1dbf8175d6884257401785d16cb Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Tue, 25 Apr 2023 16:42:53 -0700 Subject: [PATCH 186/187] using URL in requirements file --- .gitmodules | 3 --- regreg | 1 - requirements.txt | 2 +- 3 files changed, 1 insertion(+), 5 
deletions(-) delete mode 160000 regreg diff --git a/.gitmodules b/.gitmodules index 9f883b6df..134b4cb57 100644 --- a/.gitmodules +++ b/.gitmodules @@ -5,6 +5,3 @@ path = C-software url = https://github.com/selective-inference/C-software.git -[submodule "regreg"] - path = regreg - url = https://github.com/jonathan-taylor/regreg.git diff --git a/regreg b/regreg deleted file mode 160000 index 1e411d1c8..000000000 --- a/regreg +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 1e411d1c8edfae9d96c7247b19af2b7a7094f345 diff --git a/requirements.txt b/requirements.txt index efe900f71..c08d325af 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ pandas mpmath pyinter sklearn -regreg +git+https://github.com/jonathan-taylor/regreg # keras # tensorflow traitlets From e448111f8241f92e8a67abfe38b6f9b802e83062 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Tue, 25 Apr 2023 16:46:17 -0700 Subject: [PATCH 187/187] trying again --- .readthedocs.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.readthedocs.yml b/.readthedocs.yml index 96ad0c8eb..cb5b32965 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -22,6 +22,9 @@ sphinx: python: version: 3.8 install: + - requirements: https://raw.githubusercontent.com/jonathan-taylor/regreg/master/requirements.txt + - method: pip + path: https://github.com/jonathan-taylor/regreg.git - requirements: requirements.txt - requirements: doc/requirements.txt - method: setuptools
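
A usage note for the `fromsample` constructor added in the "bootstrap lasso version" commit above: it builds a randomized LASSO directly from replicates of an estimator, defaulting to their sample mean and covariance. A minimal sketch under assumed workflow (the bootstrap loop, the penalty value, and the eventual call to `fit()` are illustrative assumptions, not part of the patch):

```python
# Sketch only: construct a randomized LASSO from bootstrap replicates of an
# OLS estimator.  The resampling loop and the post-construction fit() step are
# assumed workflow; fromsample's signature and its (lasso, perturb) return
# value are taken from the diff above.
import numpy as np
from selectinf.randomized.lasso import lasso

rng = np.random.default_rng(0)
n, p, B = 200, 10, 500
X = rng.standard_normal((n, p))
y = 0.5 * X[:, 0] + rng.standard_normal(n)

# bootstrap replicates of the OLS estimator (any estimator could be used here)
samples = np.zeros((B, p))
for b in range(B):
    idx = rng.integers(0, n, size=n)
    samples[b] = np.linalg.pinv(X[idx]).dot(y[idx])

# feature_weights passed as a vector, one weight per coordinate
L, perturb = lasso.fromsample(samples,
                              feature_weights=0.1 * np.ones(p),
                              proportion_select=0.5)

# From here the returned object is an ordinary randomized LASSO instance,
# e.g. signs = L.fit() followed by selective inference on the active set
# (assumed downstream workflow, not shown in the diff).
```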