
Commit 8a59748

A. Unique TensorFlower authored and Vijay Vasudevan committed
Use cc_binary rather than cc_library to reduce size of native library in APK from 5.5mb to 3.2mb (compressed).
Change: 113369407
1 parent: faf747a


60 files changed: +2576 -438 lines

configure

+51 -10
@@ -1,5 +1,9 @@
 #!/bin/bash
 
+if [ "$TF_UNOFFICIAL_SETTING" == "1" ]; then
+  echo -e "\nWARNING: You are configuring unofficial settings in TensorFlow. Because some external libraries are not backward compatible, these settings are largely untested and unsupported. \n" 1>&2
+fi
+
 ## Set up python-related environment settings
 while true; do
   fromuser=""
@@ -44,32 +48,55 @@ fi
 
 # Find out where the CUDA toolkit is installed
 while true; do
+  # Configure the Cuda SDK version to use.
+  default_cuda_version="7.0"
+  if [ "$TF_UNOFFICIAL_SETTING" == "1" ]; then
+    if [ -z "$TF_CUDA_VERSION" ]; then
+      read -p "Please specify the Cuda SDK version you want to use. [Default is $default_cuda_version]: " TF_CUDA_VERSION
+    fi
+  fi
+  if [ -z "$TF_CUDA_VERSION" ]; then
+    TF_CUDA_VERSION=$default_cuda_version
+  fi
+
   fromuser=""
   if [ -z "$CUDA_TOOLKIT_PATH" ]; then
     default_cuda_path=/usr/local/cuda
-    read -p "Please specify the location where CUDA 7.0 toolkit is installed. Refer to README.md for more details. [Default is $default_cuda_path]: " CUDA_TOOLKIT_PATH
+    read -p "Please specify the location where CUDA $TF_CUDA_VERSION toolkit is installed. Refer to README.md for more details. [Default is $default_cuda_path]: " CUDA_TOOLKIT_PATH
     fromuser="1"
     if [ -z "$CUDA_TOOLKIT_PATH" ]; then
      CUDA_TOOLKIT_PATH=$default_cuda_path
     fi
   fi
-  if [ -e "$CUDA_TOOLKIT_PATH/lib64/libcudart.so.7.0" ]; then
+  if [ -e "$CUDA_TOOLKIT_PATH/lib64/libcudart.so.$TF_CUDA_VERSION" ]; then
    break
   fi
-  echo "Invalid path to CUDA 7.0 toolkit. ${CUDA_TOOLKIT_PATH}/lib64/libcudart.so.7.0 cannot be found"
+  echo "Invalid path to CUDA $TF_CUDA_VERSION toolkit. ${CUDA_TOOLKIT_PATH}/lib64/libcudart.so.$TF_CUDA_VERSION cannot be found"
   if [ -z "$fromuser" ]; then
     exit 1
   fi
+  TF_CUDA_VERSION=""
   CUDA_TOOLKIT_PATH=""
   # Retry
 done
 
 # Find out where the cuDNN library is installed
 while true; do
+  # Configure the Cudnn version to use.
+  default_cudnn_version="6.5"
+  if [ "$TF_UNOFFICIAL_SETTING" == "1" ]; then
+    if [ -z "$TF_CUDNN_VERSION" ]; then
+      read -p "Please specify the Cudnn version you want to use. [Default is $default_cudnn_version]: " TF_CUDNN_VERSION
+    fi
+  fi
+  if [ -z "$TF_CUDNN_VERSION" ]; then
+    TF_CUDNN_VERSION=$default_cudnn_version
+  fi
+
   fromuser=""
   if [ -z "$CUDNN_INSTALL_PATH" ]; then
     default_cudnn_path=${CUDA_TOOLKIT_PATH}
-    read -p "Please specify the location where cuDNN v2 library is installed. Refer to README.md for more details. [Default is $default_cudnn_path]: " CUDNN_INSTALL_PATH
+    read -p "Please specify the location where cuDNN $TF_CUDNN_VERSION library is installed. Refer to README.md for more details. [Default is $default_cudnn_path]: " CUDNN_INSTALL_PATH
     fromuser="1"
     if [ -z "$CUDNN_INSTALL_PATH" ]; then
       CUDNN_INSTALL_PATH=$default_cudnn_path
@@ -78,32 +105,46 @@ while true; do
     # Going through one more level of expansion to handle that.
     CUDNN_INSTALL_PATH=$(bash -c "readlink -f $CUDNN_INSTALL_PATH")
   fi
-  if [ -e "$CUDNN_INSTALL_PATH/libcudnn.so.6.5" -o -e "$CUDNN_INSTALL_PATH/lib64/libcudnn.so.6.5" ]; then
+  if [ -e "$CUDNN_INSTALL_PATH/libcudnn.so.${TF_CUDNN_VERSION}" -o -e "$CUDNN_INSTALL_PATH/lib64/libcudnn.so.${TF_CUDNN_VERSION}" ]; then
     break
   fi
-  echo "Invalid path to cuDNN v2 toolkit. Neither of the following two files can be found:"
-  echo "$CUDNN_INSTALL_PATH/lib64/libcudnn.so.6.5"
-  echo "$CUDNN_INSTALL_PATH/libcudnn.so.6.5"
+  echo "Invalid path to cuDNN ${TF_CUDNN_VERSION} toolkit. Neither of the following two files can be found:"
+  echo "$CUDNN_INSTALL_PATH/lib64/libcudnn.so.${TF_CUDNN_VERSION}"
+  echo "$CUDNN_INSTALL_PATH/libcudnn.so.${TF_CUDNN_VERSION}"
   if [ -z "$fromuser" ]; then
     exit 1
   fi
+  TF_CUDNN_VERSION=""
   CUDNN_INSTALL_PATH=""
   # Retry
 done
 
 cat > third_party/gpus/cuda/cuda.config <<EOF
-# CUDA_TOOLKIT_PATH refers to the CUDA toolkit. Tensorflow requires Cuda 7.0
+# CUDA_TOOLKIT_PATH refers to the CUDA toolkit. Tensorflow requires Cuda $TF_CUDA_VERSION
 # at the moment.
 CUDA_TOOLKIT_PATH="$CUDA_TOOLKIT_PATH"
 
 # CUDNN_INSTALL_PATH refers to the cuDNN toolkit. The cuDNN header and library
 # files can be either in this directory, or under include/ and lib64/
 # directories separately.
 CUDNN_INSTALL_PATH="$CUDNN_INSTALL_PATH"
+
+# The Cuda SDK version that should be used in this build
+TF_CUDA_VERSION=$TF_CUDA_VERSION
+
+# The Cudnn version that should be used in this build
+TF_CUDNN_VERSION=$TF_CUDNN_VERSION
+
 EOF
 
 function UnofficialSetting() {
-  echo -e "\nWARNING: You are configuring unofficial settings in TensorFlow. Because some external libraries are not backward compatible, these settings are largely untested and unsupported. \n" 1>&2
+  # Configure the Cuda toolkit version to work with.
+  perl -pi -e "s,CUDA_VERSION = '[0-9\.]*',CUDA_VERSION = '$TF_CUDA_VERSION',s" tensorflow/core/platform/default/build_config.bzl
+  perl -pi -e "s,(GetCudaVersion.*return )\"[0-9\.]*\",\1\"$TF_CUDA_VERSION\",s" tensorflow/stream_executor/dso_loader.cc
+
+  # Configure the Cudnn version to work with.
+  perl -pi -e "s,CUDNN_VERSION = '[0-9\.]*',CUDNN_VERSION = '$TF_CUDNN_VERSION',s" tensorflow/core/platform/default/build_config.bzl
+  perl -pi -e "s,(GetCudnnVersion.*return )\"[0-9\.]*\",\1\"$TF_CUDNN_VERSION\",s" tensorflow/stream_executor/dso_loader.cc
 
   # Configure the compute capabilities that TensorFlow builds for.
   # Since Cuda toolkit is not backward-compatible, this is not guaranteed to work.
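The perl one-liners in UnofficialSetting() keep the build and the runtime loader in agreement with the configured versions: they rewrite the CUDA_VERSION/CUDNN_VERSION literals in build_config.bzl and the strings returned by GetCudaVersion()/GetCudnnVersion() in dso_loader.cc. A minimal sketch of the loader-side code being patched (only the function names come from the perl patterns above; the bodies and the CudartSoName() helper are illustrative, not the actual dso_loader.cc source):

    #include <string>

    // Version literals rewritten in place by the perl substitutions in
    // configure; "7.0" and "6.5" are the defaults above.
    static std::string GetCudaVersion() { return "7.0"; }
    static std::string GetCudnnVersion() { return "6.5"; }

    // Why the rewrite matters: the loader opens version-suffixed shared
    // objects, so the name it builds must match the file configure verified,
    // e.g. $CUDA_TOOLKIT_PATH/lib64/libcudart.so.7.0.
    static std::string CudartSoName() {
      return "libcudart.so." + GetCudaVersion();
    }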

tensorflow/core/BUILD

+1 -0
@@ -298,6 +298,7 @@ tf_cuda_library(
         "graph/graph_constructor.h",
         "graph/graph_def_builder.h",
         "graph/node_builder.h",
+        "graph/validate.h",
         "public/session.h",
         "public/session_options.h",
         "public/tensor_c_api.h",

tensorflow/core/client/tensor_c_api.cc

+80 -12
@@ -316,16 +316,16 @@ Status LoadLibrary(const char* library_filename, void** result,
 
 }  // namespace tensorflow
 
-extern "C" {
-
-void TF_Run(TF_Session* s,
-            // Input tensors
-            const char** c_input_names, TF_Tensor** c_inputs, int ninputs,
-            // Output tensors
-            const char** c_output_tensor_names, TF_Tensor** c_outputs,
-            int noutputs,
-            // Target nodes
-            const char** c_target_node_names, int ntargets, TF_Status* status) {
+void TF_Run_Helper(TF_Session* s, const char* handle,
+                   // Input tensors
+                   const char** c_input_names, TF_Tensor** c_inputs,
+                   int ninputs,
+                   // Output tensors
+                   const char** c_output_tensor_names, TF_Tensor** c_outputs,
+                   int noutputs,
+                   // Target nodes
+                   const char** c_target_node_names, int ntargets,
+                   TF_Status* status) {
   status->status = Status::OK();
   for (int i = 0; i < noutputs; i++) {
     c_outputs[i] = NULL;
@@ -365,8 +365,13 @@ void TF_Run(TF_Session* s,
   for (int i = 0; i < ntargets; i++) {
     target_node_names[i] = c_target_node_names[i];
   }
-  Status result =
-      s->session->Run(inputs, output_tensor_names, target_node_names, &outputs);
+  Status result;
+  if (handle == nullptr) {
+    result = s->session->Run(inputs, output_tensor_names, target_node_names,
+                             &outputs);
+  } else {
+    result = s->session->PRun(handle, inputs, output_tensor_names, &outputs);
+  }
   if (!result.ok()) {
     status->status = result;
     return;
@@ -392,6 +397,69 @@
   }
 }
 
+extern "C" {
+
+void TF_Run(TF_Session* s,
+            // Input tensors
+            const char** c_input_names, TF_Tensor** c_inputs, int ninputs,
+            // Output tensors
+            const char** c_output_tensor_names, TF_Tensor** c_outputs,
+            int noutputs,
+            // Target nodes
+            const char** c_target_node_names, int ntargets, TF_Status* status) {
+  TF_Run_Helper(s, nullptr, c_input_names, c_inputs, ninputs,
+                c_output_tensor_names, c_outputs, noutputs, c_target_node_names,
+                ntargets, status);
+}
+
+void TF_PRunSetup(TF_Session* s,
+                  // Input names
+                  const char** c_input_names, int ninputs,
+                  // Output names
+                  const char** c_output_tensor_names, int noutputs,
+                  // Target nodes
+                  const char** c_target_node_names, int ntargets, char** handle,
+                  TF_Status* status) {
+  status->status = Status::OK();
+
+  std::vector<tensorflow::string> input_names(ninputs);
+  std::vector<tensorflow::string> output_tensor_names(noutputs);
+  std::vector<tensorflow::string> target_node_names(ntargets);
+  for (int i = 0; i < ninputs; i++) {
+    input_names[i] = c_input_names[i];
+  }
+  for (int i = 0; i < noutputs; i++) {
+    output_tensor_names[i] = c_output_tensor_names[i];
+  }
+  for (int i = 0; i < ntargets; i++) {
+    target_node_names[i] = c_target_node_names[i];
+  }
+  tensorflow::string new_handle;
+  Status result;
+  result = s->session->PRunSetup(input_names, output_tensor_names,
+                                 target_node_names, &new_handle);
+  if (result.ok()) {
+    *handle = new char[new_handle.size() + 1];
+    memcpy(*handle, new_handle.c_str(), new_handle.size() + 1);
+  } else {
+    status->status = result;
+  }
+}
+
+void TF_PRun(TF_Session* s, const char* handle,
+             // Input tensors
+             const char** c_input_names, TF_Tensor** c_inputs, int ninputs,
+             // Output tensors
+             const char** c_output_tensor_names, TF_Tensor** c_outputs,
+             int noutputs,
+             // Target nodes
+             const char** c_target_node_names, int ntargets,
+             TF_Status* status) {
+  TF_Run_Helper(s, handle, c_input_names, c_inputs, ninputs,
+                c_output_tensor_names, c_outputs, noutputs, c_target_node_names,
+                ntargets, status);
+}
+
 const void* TF_BufferData(TF_Buffer* buffer) { return buffer->data; }
 
 size_t TF_BufferLength(TF_Buffer* buffer) { return buffer->length; }
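Taken together, TF_PRunSetup and TF_PRun expose partial runs through the C API: setup declares the feeds, fetches, and targets once and returns an opaque handle, and each subsequent TF_PRun call feeds and fetches against that handle via TF_Run_Helper. Note from the diff that *handle is allocated with new char[] inside the library, and the hunks shown here add no matching free function, so at this point the caller is left to manage that buffer. A minimal caller-side sketch, assuming a live TF_Session* named session, a prepared TF_Tensor* named t_in, and placeholder node names; error handling and tensor cleanup are elided:

    const char* feeds[] = {"input:0"};
    const char* fetches[] = {"output:0"};
    char* handle = NULL;
    TF_Status* status = TF_NewStatus();

    // Declare up front which tensors will be fed and fetched.
    TF_PRunSetup(session, feeds, 1, fetches, 1,
                 /*c_target_node_names=*/NULL, /*ntargets=*/0, &handle, status);

    if (TF_GetCode(status) == TF_OK) {
      // Continue the same step: feed t_in, fetch the matching output.
      TF_Tensor* out = NULL;
      TF_PRun(session, handle, feeds, &t_in, 1, fetches, &out, 1,
              /*c_target_node_names=*/NULL, /*ntargets=*/0, status);
    }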

tensorflow/core/common_runtime/copy_tensor.cc

+48 -49
@@ -53,58 +53,57 @@ void CopyTensor::ViaDMA(const string& edge_name,
                         StatusCallback done) {
   initialization_done = true;
   port::Tracing::ScopedAnnotation annotation(edge_name);
-  VLOG(1) << "CopyViaDMA " << edge_name;
-  const size_t total_bytes = input->TotalBytes();
-
-  // Note that 0-size tensors have no backing buffer.
-  if (total_bytes > 0) {
-    const DeviceType src_device_type(src_alloc_attr.on_host()
-                                         ? DEVICE_CPU
-                                         : src->attributes().device_type());
-    const DeviceType dst_device_type(dst_alloc_attr.on_host()
-                                         ? DEVICE_CPU
-                                         : dst->attributes().device_type());
-    const bool non_cpu_src = src_device_type != DeviceType(DEVICE_CPU);
-    const bool non_cpu_dst = dst_device_type != DeviceType(DEVICE_CPU);
-
-    if (non_cpu_src) {
-      if (non_cpu_dst) {
-        // Device to device copy. Look through registry for an appropriate
-        // CopyFunction.
-        std::vector<RegistrationInfo>* registry = MutableRegistry();
-        for (const RegistrationInfo& ri : *registry) {
-          if (ri.sender_device_type == src_device_type &&
-              ri.receiver_device_type == dst_device_type) {
-            ri.copy_function(send_dev_context, recv_dev_context, src, dst,
-                             src_alloc_attr, dst_alloc_attr, input, output,
-                             done);
-            return;
-          }
-        }
-
-        // TODO(josh11b): If no CopyFunction is found, we currently fail
-        // but we could copy between devices via CPU.
-        done(errors::Unimplemented(
-            "No function registered to copy from devices of type ",
-            src_device_type.type(), " to devices of type ",
-            dst_device_type.type()));
-      } else {
-        // Device to host copy.
-        return send_dev_context->CopyDeviceTensorToCPU(input, edge_name, src,
-                                                       output, done);
+  VLOG(1) << "Copy " << edge_name;
+
+  const DeviceType src_device_type(
+      src_alloc_attr.on_host() ? DEVICE_CPU : src->attributes().device_type());
+  const DeviceType dst_device_type(
+      dst_alloc_attr.on_host() ? DEVICE_CPU : dst->attributes().device_type());
+  const bool non_cpu_src = src_device_type != DeviceType(DEVICE_CPU);
+  const bool non_cpu_dst = dst_device_type != DeviceType(DEVICE_CPU);
+
+  // E.g., gpu -> gpu
+  if (non_cpu_src && non_cpu_dst) {
+    // Device to device copy. Look through registry for an appropriate
+    // CopyFunction.
+    std::vector<RegistrationInfo>* registry = MutableRegistry();
+    for (const RegistrationInfo& ri : *registry) {
+      if (ri.sender_device_type == src_device_type &&
+          ri.receiver_device_type == dst_device_type) {
+        ri.copy_function(send_dev_context, recv_dev_context, src, dst,
+                         src_alloc_attr, dst_alloc_attr, input, output, done);
+        return;
       }
-    } else if (non_cpu_dst) {
-      // Host to Device copy.
-      // Note that this is already an async copy.
-      recv_dev_context->CopyCPUTensorToDevice(input, dst, output, done);
-    } else {
-      *output = *input;
-      done(Status::OK());
     }
-  } else {
-    // buffer is empty
-    done(Status::OK());
+
+    // TODO(josh11b): If no CopyFunction is found, we currently fail
+    // but we could copy between devices via CPU.
+    done(errors::Unimplemented(
+        "No function registered to copy from devices of type ",
+        src_device_type.type(), " to devices of type ",
+        dst_device_type.type()));
+    return;
+  }
+
+  // E.g., gpu -> cpu
+  if (non_cpu_src && !non_cpu_dst) {
+    // Device to host copy.
+    send_dev_context->CopyDeviceTensorToCPU(input, edge_name, src, output,
+                                            done);
+    return;
   }
+
+  // E.g., cpu -> gpu
+  if (!non_cpu_src && non_cpu_dst) {
+    // Host to Device copy.
+    recv_dev_context->CopyCPUTensorToDevice(input, dst, output, done);
+    return;
+  }
+
+  // cpu -> cpu
+  CHECK(!non_cpu_src && !non_cpu_dst);
+  *output = *input;
+  done(Status::OK());
 }
 
 // static
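The rewrite flattens the old nested if/else ladder into four early-returning cases keyed on source and destination placement; the separate zero-byte fast path also disappears, so empty tensors take the same branches as everything else. A self-contained sketch of the resulting control-flow shape, with stand-in types rather than the TensorFlow internals:

    #include <cstdio>

    // Stand-in for the DEVICE_CPU checks in CopyTensor::ViaDMA.
    enum class Placement { kCpu, kDevice };

    // Same shape as the new ViaDMA body: one flat, early-returning branch per
    // (src, dst) pair, with the final case doubling as the CHECK'd default.
    void DispatchCopy(Placement src, Placement dst) {
      if (src == Placement::kDevice && dst == Placement::kDevice) {
        std::puts("device -> device: look up a registered CopyFunction");
        return;
      }
      if (src == Placement::kDevice) {
        std::puts("device -> host: CopyDeviceTensorToCPU");
        return;
      }
      if (dst == Placement::kDevice) {
        std::puts("host -> device: CopyCPUTensorToDevice");
        return;
      }
      std::puts("cpu -> cpu: *output = *input");  // both on host: alias the tensor
    }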
