diff --git a/Compiling.md b/Compiling.md index e14d2f5d6..928a88795 100644 --- a/Compiling.md +++ b/Compiling.md @@ -39,10 +39,10 @@ As also mentioned in the instructions below but repeated here for visibility, if ## Windows * Requirements - * CMake with a minimum version of 3.10.2, GUI version strongly recommended (https://cmake.org/download/) + * CMake with a minimum version of 3.18.2, GUI version strongly recommended (https://cmake.org/download/) * Microsoft Visual Studio for C++. Version 15 (2017) has been tested and should work, other versions might work as well. * If using the OpenCL backend, a modern GPU that supports OpenCL 1.2 or greater, or else something like [this](https://software.intel.com/en-us/opencl-sdk) for CPU. But if using CPU, Eigen should be better. - * If using the CUDA backend, CUDA 10.2 with CUDNN 7.6.5, or CUDA 11.1 with CUDNN 8.0.4 (https://developer.nvidia.com/cuda-toolkit) (https://developer.nvidia.com/cudnn) and a GPU capable of supporting them. I'm unsure how version compatibility works with CUDA, there's a good chance that later versions than these work just as well, but they have not been tested. + * If using the CUDA backend, CUDA 10.2 with CUDNN 7.6.5, or CUDA 11.0.2 with CUDNN 8.0.4 (https://developer.nvidia.com/cuda-toolkit) (https://developer.nvidia.com/cudnn) and a GPU capable of supporting them. I'm unsure how version compatibility works with CUDA, there's a good chance that later versions than these work just as well, but they have not been tested. (If you run synchronized selfplay and training on a single machine, choose CUDA 11.0.2 + CUDNN 8.0.4 for compatibility with tensorflow_gpu 2.4.0.) * If using the TensorRT backend, in addition to the dependencies for the CUDA backend, you also need TensorRT (https://developer.nvidia.com/tensorrt) on a version compatible with your CUDA and CUDNN versions. * If using the Eigen backend, Eigen3, version 3.3.x. (http://eigen.tuxfamily.org/index.php?title=Main_Page#Download). * zlib. The following package might work, https://www.nuget.org/packages/zlib-vc140-static-64/, or alternatively you can build it yourself via something like: https://github.com/kiyolee/zlib-win-build diff --git a/python/README.md b/python/README.md index 752e4452e..208bd9b90 100644 --- a/python/README.md +++ b/python/README.md @@ -39,4 +39,9 @@ These are a separate set of scripts that have nothing to do with any of the abov * `genboard_run.py` * `genboard_train.py` +### Dependencies +- scipy +- tf-slim +- python-dateutil +- requests-toolbelt diff --git a/python/export_model.py b/python/export_model.py index bf01b8460..96b83f18f 100644 --- a/python/export_model.py +++ b/python/export_model.py @@ -89,7 +89,7 @@ def log(s): sys.stderr.flush() if not for_cuda: - tf.train.write_graph(session.graph_def,export_dir,filename_prefix + ".graph.pb") + tf.io.write_graph(session.graph_def,export_dir,filename_prefix + ".graph.pb") savepath = export_dir + "/" + filename_prefix saver.save(session, savepath + ".weights") with open(savepath + ".config.json","w") as f: diff --git a/python/model.py b/python/model.py index e9b2a092d..350628725 100644 --- a/python/model.py +++ b/python/model.py @@ -534,10 +534,10 @@ def batchnorm_and_mask(self,name,tensor,mask,mask_sum,use_gamma_in_fixup=False): #This is the mean, computed only over exactly the areas of the mask, weighting each spot equally, #even across different elements in the batch that might have different board sizes.
- mean = tf.reduce_sum(tensor * mask,axis=[0,1,2]) / mask_sum + mean = tf.reduce_sum(input_tensor=tensor * mask,axis=[0,1,2]) / mask_sum zmtensor = tensor-mean #Similarly, the variance computed exactly only over those spots - var = tf.reduce_sum(tf.square(zmtensor * mask),axis=[0,1,2]) / mask_sum + var = tf.reduce_sum(input_tensor=tf.square(zmtensor * mask),axis=[0,1,2]) / mask_sum with tf.compat.v1.variable_scope(name): mean_op = tf.keras.backend.moving_average_update(moving_mean,mean,0.998) @@ -551,7 +551,7 @@ def training_f(): def inference_f(): return (moving_mean,moving_var) - use_mean,use_var = tf.cond(self.is_training_tensor,training_f,inference_f) + use_mean,use_var = tf.cond(pred=self.is_training_tensor,true_fn=training_f,false_fn=inference_f) return tf.nn.batch_normalization(tensor,use_mean,use_var,beta,None,epsilon) * mask # def batchnorm(self,name,tensor): @@ -605,7 +605,7 @@ def weight_variable(self, name, shape, num_inputs, num_outputs, scale_initial_we return variable def conv2d(self, x, w): - return tf.nn.conv2d(x, w, strides=[1,1,1,1], padding='SAME') + return tf.nn.conv2d(input=x, filters=w, strides=[1,1,1,1], padding='SAME') def dilated_conv2d(self, x, w, dilation): return tf.nn.atrous_conv2d(x, w, rate = dilation, padding='SAME') @@ -617,31 +617,31 @@ def apply_symmetry(self,tensor,symmetries,inverse): if not inverse: tensor = tf.cond( - ud, - lambda: tf.reverse(tensor,[1]), - lambda: tensor + pred=ud, + true_fn=lambda: tf.reverse(tensor,[1]), + false_fn=lambda: tensor ) tensor = tf.cond( - lr, - lambda: tf.reverse(tensor,[2]), - lambda: tensor + pred=lr, + true_fn=lambda: tf.reverse(tensor,[2]), + false_fn=lambda: tensor ) tensor = tf.cond( - transp, - lambda: tf.transpose(tensor, [0,2,1,3]), - lambda: tensor) + pred=transp, + true_fn=lambda: tf.transpose(a=tensor, perm=[0,2,1,3]), + false_fn=lambda: tensor) if inverse: tensor = tf.cond( - ud, - lambda: tf.reverse(tensor,[1]), - lambda: tensor + pred=ud, + true_fn=lambda: tf.reverse(tensor,[1]), + false_fn=lambda: tensor ) tensor = tf.cond( - lr, - lambda: tf.reverse(tensor,[2]), - lambda: tensor + pred=lr, + true_fn=lambda: tf.reverse(tensor,[2]), + false_fn=lambda: tensor ) return tensor @@ -786,8 +786,8 @@ def global_pool(self, in_layer, mask_sum_hw, mask_sum_hw_sqrt): div = tf.reshape(mask_sum_hw,[-1,1,1,1]) div_sqrt = tf.reshape(mask_sum_hw_sqrt,[-1,1,1,1]) - layer_raw_mean = tf.reduce_sum(in_layer,axis=[1,2],keepdims=True) / div - layer_raw_max = tf.reduce_max(in_layer,axis=[1,2],keepdims=True) + layer_raw_mean = tf.reduce_sum(input_tensor=in_layer,axis=[1,2],keepdims=True) / div + layer_raw_max = tf.reduce_max(input_tensor=in_layer,axis=[1,2],keepdims=True) # 1, (x-14)/10, and (x-14)^2/100 - 0.1 are three orthogonal functions over [9,19], the range of reasonable board sizes. # We have the 14 in there since it's the midpoint of that range. The /10 is just sort of arbitrary normalization to keep things on the same scale. @@ -803,7 +803,7 @@ def value_head_pool(self, in_layer, mask_sum_hw, mask_sum_hw_sqrt): div = tf.reshape(mask_sum_hw,[-1,1]) div_sqrt = tf.reshape(mask_sum_hw_sqrt,[-1,1]) - layer_raw_mean = tf.reduce_sum(in_layer,axis=[1,2],keepdims=False) / div + layer_raw_mean = tf.reduce_sum(input_tensor=in_layer,axis=[1,2],keepdims=False) / div # 1, (x-14)/10, and (x-14)^2/100 - 0.1 are three orthogonal functions over [9,19], the range of reasonable board sizes. # We have the 14 in there since it's the midpoint of that range. 
The /10 and /100 are just sort of arbitrary normalization to keep things on the same scale @@ -844,6 +844,8 @@ def build_model(self,config,placeholders): assert(self.version == 8 or self.version == 10) #Input layer--------------------------------------------------------------------------------- + #tf.compat.v1.disable_eager_execution() # Important, fix for tensorflow 2.4 + tf.compat.v1.disable_v2_behavior() bin_inputs = (placeholders["bin_inputs"] if "bin_inputs" in placeholders else tf.compat.v1.placeholder(tf.float32, [None] + self.bin_input_shape, name="bin_inputs")) global_inputs = (placeholders["global_inputs"] if "global_inputs" in placeholders else @@ -942,7 +944,7 @@ def build_model(self,config,placeholders): cur_layer = tf.reshape(cur_layer,[-1,self.pos_len,self.pos_len,self.num_bin_input_features]) assert(include_history.shape[1].value == 5) - transformed_global_inputs = global_inputs * tf.pad(include_history, [(0,0),(0,self.num_global_input_features - include_history.shape[1].value)], constant_values=1.0) + transformed_global_inputs = global_inputs * tf.pad(tensor=include_history, paddings=[(0,0),(0,self.num_global_input_features - include_history.shape[1].value)], constant_values=1.0) self.transformed_bin_inputs = cur_layer self.transformed_global_inputs = transformed_global_inputs @@ -963,8 +965,8 @@ def build_model(self,config,placeholders): self.gpool_num_channels = gpool_num_channels mask = cur_layer[:,:,:,0:1] - mask_sum = tf.reduce_sum(mask) # Global sum - mask_sum_hw = tf.reduce_sum(mask,axis=[1,2,3]) # Sum per batch element + mask_sum = tf.reduce_sum(input_tensor=mask) # Global sum + mask_sum_hw = tf.reduce_sum(input_tensor=mask,axis=[1,2,3]) # Sum per batch element mask_sum_hw_sqrt = tf.sqrt(mask_sum_hw) #Initial convolutional layer------------------------------------------------------------------------------------- @@ -1138,7 +1140,7 @@ def scaletransform(tensor): #tf.where has a bug where nan values on the non-chosen side will still propagate nans back in gradients. 
#So we also abs the tensor, so that we never get a log of a negative value abstensor = tf.abs(tensor) - return tf.where(tensor > 0, 1.0 + tf.math.log(abstensor + 1.0), 1.0 / (1.0 + tf.math.log(abstensor + 1.0))) + return tf.compat.v1.where(tensor > 0, 1.0 + tf.math.log(abstensor + 1.0), 1.0 / (1.0 + tf.math.log(abstensor + 1.0))) scorebelief_len = self.scorebelief_target_shape[0] scorebelief_mid = self.pos_len*self.pos_len+Model.EXTRA_SCORE_DISTR_RADIUS @@ -1240,7 +1242,7 @@ def scaletransform(tensor): def huber_loss(x,y,delta): absdiff = tf.abs(x - y) - return tf.where(absdiff > delta, (0.5 * delta*delta) + delta * (absdiff - delta), 0.5 * absdiff * absdiff) + return tf.compat.v1.where(absdiff > delta, (0.5 * delta*delta) + delta * (absdiff - delta), 0.5 * absdiff * absdiff) class Target_vars: @@ -1268,6 +1270,8 @@ def __init__(self,model,for_optimization,placeholders): shortterm_value_error_prediction = tf.math.softplus(moremiscvalues_output[:,0]) * 0.25 shortterm_score_error_prediction = tf.math.softplus(moremiscvalues_output[:,1]) * 30.0 + tf.compat.v1.disable_v2_behavior() + #Loss function self.policy_target = (placeholders["policy_target"] if "policy_target" in placeholders else tf.compat.v1.placeholder(tf.float32, [None] + model.policy_target_shape)) @@ -1347,42 +1351,42 @@ def __init__(self,model,for_optimization,placeholders): self.policy_loss_unreduced = self.policy_target_weight * ( - tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.policy_target, logits=policy_output[:,:,0]) + tf.nn.softmax_cross_entropy_with_logits(labels=self.policy_target, logits=policy_output[:,:,0]) ) self.policy1_loss_unreduced = self.policy_target_weight1 * 0.15 * ( - tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.policy_target1, logits=policy_output[:,:,1]) + tf.nn.softmax_cross_entropy_with_logits(labels=self.policy_target1, logits=policy_output[:,:,1]) ) - self.value_loss_unreduced = 1.20 * tf.nn.softmax_cross_entropy_with_logits_v2( + self.value_loss_unreduced = 1.20 * tf.nn.softmax_cross_entropy_with_logits( labels=self.value_target, logits=value_output ) self.td_value_loss_unreduced = tf.constant([0.55,0.55,0.15],dtype=tf.float32) * ( - tf.nn.softmax_cross_entropy_with_logits_v2( + tf.nn.softmax_cross_entropy_with_logits( labels=self.td_value_target, logits=td_value_prediction ) - # Subtract out the entropy, so as to get loss 0 at perfect prediction - tf.nn.softmax_cross_entropy_with_logits_v2( + tf.nn.softmax_cross_entropy_with_logits( labels=self.td_value_target, logits=tf.math.log(self.td_value_target + 1.0e-30) ) ) - self.td_value_loss_unreduced = tf.reduce_sum(self.td_value_loss_unreduced, axis=1) + self.td_value_loss_unreduced = tf.reduce_sum(input_tensor=self.td_value_loss_unreduced, axis=1) self.td_score_loss_unreduced = 0.0004 * self.ownership_target_weight * ( - tf.reduce_sum(huber_loss(self.td_score_target, td_score_prediction, delta = 12.0), axis=1) + tf.reduce_sum(input_tensor=huber_loss(self.td_score_target, td_score_prediction, delta = 12.0), axis=1) ) self.scorebelief_cdf_loss_unreduced = 0.020 * self.ownership_target_weight * ( tf.reduce_sum( - tf.square(tf.cumsum(self.scorebelief_target,axis=1) - tf.cumsum(tf.nn.softmax(scorebelief_output,axis=1),axis=1)), + input_tensor=tf.square(tf.cumsum(self.scorebelief_target,axis=1) - tf.cumsum(tf.nn.softmax(scorebelief_output,axis=1),axis=1)), axis=1 ) ) self.scorebelief_pdf_loss_unreduced = 0.020 * self.ownership_target_weight * ( - tf.nn.softmax_cross_entropy_with_logits_v2( + tf.nn.softmax_cross_entropy_with_logits( 
labels=self.scorebelief_target, logits=scorebelief_output ) @@ -1393,7 +1397,7 @@ def __init__(self,model,for_optimization,placeholders): #Not unlike the way that policy and value loss are also equal-weighted by batch element. self.ownership_loss_unreduced = 1.5 * self.ownership_target_weight * ( tf.reduce_sum( - tf.nn.softmax_cross_entropy_with_logits_v2( + input_tensor=tf.nn.softmax_cross_entropy_with_logits( labels=tf.stack([(1+self.ownership_target)/2,(1-self.ownership_target)/2],axis=3), logits=tf.stack([ownership_output,-ownership_output],axis=3) ) * tf.reshape(model.mask_before_symmetry,[-1,model.pos_len,model.pos_len]), @@ -1403,7 +1407,7 @@ def __init__(self,model,for_optimization,placeholders): self.scoring_loss_unreduced = 1.0 * self.scoring_target_weight * ( tf.reduce_sum( - tf.square(self.scoring_target - scoring_output) * tf.reshape(model.mask_before_symmetry,[-1,model.pos_len,model.pos_len]), + input_tensor=tf.square(self.scoring_target - scoring_output) * tf.reshape(model.mask_before_symmetry,[-1,model.pos_len,model.pos_len]), axis=[1,2] ) / model.mask_sum_hw ) @@ -1421,7 +1425,7 @@ def __init__(self,model,for_optimization,placeholders): #due to simply being farther in the future, so multiply by [1,0.25]. self.futurepos_loss_unreduced = 0.25 * self.futurepos_target_weight * ( tf.reduce_sum( - tf.square(tf.tanh(futurepos_output) - self.futurepos_target) + input_tensor=tf.square(tf.tanh(futurepos_output) - self.futurepos_target) * tf.reshape(model.mask_before_symmetry,[-1,model.pos_len,model.pos_len,1]) * tf.reshape(tf.constant([1,0.25],dtype=tf.float32),[1,1,1,2]), axis=[1,2,3] @@ -1432,10 +1436,10 @@ def __init__(self,model,for_optimization,placeholders): owned_target = tf.square(self.ownership_target) unowned_target = 1.0 - owned_target unowned_proportion = ( - tf.reduce_sum(unowned_target * tf.reshape(model.mask_before_symmetry,[-1,model.pos_len,model.pos_len]),axis=[1,2]) - / (1.0 + tf.reduce_sum(tf.reshape(model.mask_before_symmetry,[-1,model.pos_len,model.pos_len]),axis=[1,2])) + tf.reduce_sum(input_tensor=unowned_target * tf.reshape(model.mask_before_symmetry,[-1,model.pos_len,model.pos_len]),axis=[1,2]) + / (1.0 + tf.reduce_sum(input_tensor=tf.reshape(model.mask_before_symmetry,[-1,model.pos_len,model.pos_len]),axis=[1,2])) ) - unowned_proportion = tf.reduce_mean(unowned_proportion * self.ownership_target_weight) + unowned_proportion = tf.reduce_mean(input_tensor=unowned_proportion * self.ownership_target_weight) if model.is_training: moving_unowned_proportion = tf.compat.v1.get_variable(initializer=1.0,name=("moving_unowned_proportion"),trainable=False) moving_unowned_op = tf.keras.backend.moving_average_update(moving_unowned_proportion,unowned_proportion,0.998) @@ -1446,7 +1450,7 @@ def __init__(self,model,for_optimization,placeholders): self.seki_loss_unreduced = ( tf.reduce_sum( - tf.nn.softmax_cross_entropy_with_logits_v2( + input_tensor=tf.nn.softmax_cross_entropy_with_logits( labels=tf.stack([1.0-tf.square(self.seki_target), tf.nn.relu(self.seki_target), tf.nn.relu(-self.seki_target)],axis=3), logits=seki_output[:,:,:,0:3] ) * tf.reshape(model.mask_before_symmetry,[-1,model.pos_len,model.pos_len]), @@ -1455,7 +1459,7 @@ def __init__(self,model,for_optimization,placeholders): ) self.seki_loss_unreduced = self.seki_loss_unreduced + 0.5 * ( tf.reduce_sum( - tf.nn.softmax_cross_entropy_with_logits_v2( + input_tensor=tf.nn.softmax_cross_entropy_with_logits( labels=tf.stack([unowned_target, owned_target],axis=3), 
logits=tf.stack([seki_output[:,:,:,3],tf.zeros_like(self.ownership_target)],axis=3) ) * tf.reshape(model.mask_before_symmetry,[-1,model.pos_len,model.pos_len]), @@ -1466,7 +1470,7 @@ def __init__(self,model,for_optimization,placeholders): self.seki_weight_scale = seki_weight_scale #This is conditional upon there being a result - expected_score_from_belief = tf.reduce_sum(scorebelief_probs * model.score_belief_offset_vector,axis=1) + expected_score_from_belief = tf.reduce_sum(input_tensor=scorebelief_probs * model.score_belief_offset_vector,axis=1) #Huber will incentivize this to not actually converge to the mean, but rather something meanlike locally and something medianlike #for very large possible losses. This seems... okay - it might actually be what users want. @@ -1475,7 +1479,7 @@ def __init__(self,model,for_optimization,placeholders): self.variance_time_loss_unreduced = 0.0003 * self.ownership_target_weight * huber_loss(self.variance_time_target, variance_time_prediction, delta = 50.0) stdev_of_belief = tf.sqrt(0.001 + tf.reduce_sum( - scorebelief_probs * tf.square( + input_tensor=scorebelief_probs * tf.square( tf.reshape(model.score_belief_offset_vector,[1,-1]) - tf.reshape(expected_score_from_belief,[-1,1]) ),axis=1)) beliefstdevdiff = stdev_of_belief - scorestdev_prediction @@ -1521,30 +1525,31 @@ def __init__(self,model,for_optimization,placeholders): self.scale_reg_loss_unreduced = tf.reshape(0.0004 * tf.add_n([tf.square(variable) for variable in model.prescale_variables]), [-1]) #self.scale_reg_loss_unreduced = tf.zeros_like(self.winloss_reg_loss_unreduced) - self.policy_loss = tf.reduce_sum(self.target_weight_used * self.policy_loss_unreduced, name="losses/policy_loss") - self.policy1_loss = tf.reduce_sum(self.target_weight_used * self.policy1_loss_unreduced, name="losses/policy1_loss") - self.value_loss = tf.reduce_sum(self.target_weight_used * self.value_loss_unreduced, name="losses/value_loss") - self.td_value_loss = tf.reduce_sum(self.target_weight_used * self.td_value_loss_unreduced, name="losses/td_value_loss") - self.td_score_loss = tf.reduce_sum(self.target_weight_used * self.td_score_loss_unreduced, name="losses/td_score_loss") - self.scoremean_loss = tf.reduce_sum(self.target_weight_used * self.scoremean_loss_unreduced, name="losses/scoremean_loss") - self.lead_loss = tf.reduce_sum(self.target_weight_used * self.lead_loss_unreduced, name="losses/lead_loss") - self.variance_time_loss = tf.reduce_sum(self.target_weight_used * self.variance_time_loss_unreduced, name="losses/variance_time_loss") - self.scorebelief_pdf_loss = tf.reduce_sum(self.target_weight_used * self.scorebelief_pdf_loss_unreduced, name="losses/scorebelief_pdf_loss") - self.scorebelief_cdf_loss = tf.reduce_sum(self.target_weight_used * self.scorebelief_cdf_loss_unreduced, name="losses/scorebelief_cdf_loss") - self.ownership_loss = tf.reduce_sum(self.target_weight_used * self.ownership_loss_unreduced, name="losses/ownership_loss") - self.scoring_loss = tf.reduce_sum(self.target_weight_used * self.scoring_loss_unreduced, name="losses/scoring_loss") - self.futurepos_loss = tf.reduce_sum(self.target_weight_used * self.futurepos_loss_unreduced, name="losses/futurepos_loss") - self.seki_loss = tf.reduce_sum(self.target_weight_used * self.seki_loss_unreduced, name="losses/seki_loss") - self.scorestdev_reg_loss = tf.reduce_sum(self.target_weight_used * self.scorestdev_reg_loss_unreduced, name="losses/scorestdev_reg_loss") - self.shortterm_value_error_loss = tf.reduce_sum(self.target_weight_used * 
self.shortterm_value_error_loss_unreduced, name="losses/sloss") - self.shortterm_score_error_loss = tf.reduce_sum(self.target_weight_used * self.shortterm_score_error_loss_unreduced, name="losses/shortterm_score_error_loss") + self.policy_loss = tf.reduce_sum(input_tensor=self.target_weight_used * self.policy_loss_unreduced, name="losses/policy_loss") + self.policy1_loss = tf.reduce_sum(input_tensor=self.target_weight_used * self.policy1_loss_unreduced, name="losses/policy1_loss") + self.value_loss = tf.reduce_sum(input_tensor=self.target_weight_used * self.value_loss_unreduced, name="losses/value_loss") + self.td_value_loss = tf.reduce_sum(input_tensor=self.target_weight_used * self.td_value_loss_unreduced, name="losses/td_value_loss") + self.td_score_loss = tf.reduce_sum(input_tensor=self.target_weight_used * self.td_score_loss_unreduced, name="losses/td_score_loss") + self.scoremean_loss = tf.reduce_sum(input_tensor=self.target_weight_used * self.scoremean_loss_unreduced, name="losses/scoremean_loss") + self.lead_loss = tf.reduce_sum(input_tensor=self.target_weight_used * self.lead_loss_unreduced, name="losses/lead_loss") + self.variance_time_loss = tf.reduce_sum(input_tensor=self.target_weight_used * self.variance_time_loss_unreduced, name="losses/variance_time_loss") + self.scorebelief_pdf_loss = tf.reduce_sum(input_tensor=self.target_weight_used * self.scorebelief_pdf_loss_unreduced, name="losses/scorebelief_pdf_loss") + self.scorebelief_cdf_loss = tf.reduce_sum(input_tensor=self.target_weight_used * self.scorebelief_cdf_loss_unreduced, name="losses/scorebelief_cdf_loss") + self.ownership_loss = tf.reduce_sum(input_tensor=self.target_weight_used * self.ownership_loss_unreduced, name="losses/ownership_loss") + self.scoring_loss = tf.reduce_sum(input_tensor=self.target_weight_used * self.scoring_loss_unreduced, name="losses/scoring_loss") + self.futurepos_loss = tf.reduce_sum(input_tensor=self.target_weight_used * self.futurepos_loss_unreduced, name="losses/futurepos_loss") + self.seki_loss = tf.reduce_sum(input_tensor=self.target_weight_used * self.seki_loss_unreduced, name="losses/seki_loss") + self.scorestdev_reg_loss = tf.reduce_sum(input_tensor=self.target_weight_used * self.scorestdev_reg_loss_unreduced, name="losses/scorestdev_reg_loss") + self.shortterm_value_error_loss = tf.reduce_sum(input_tensor=self.target_weight_used * self.shortterm_value_error_loss_unreduced, name="losses/sloss") + self.shortterm_score_error_loss = tf.reduce_sum(input_tensor=self.target_weight_used * self.shortterm_score_error_loss_unreduced, name="losses/shortterm_score_error_loss") # self.winloss_reg_loss = tf.reduce_sum(self.target_weight_used * self.winloss_reg_loss_unreduced, name="losses/winloss_reg_loss") - self.scale_reg_loss = tf.reduce_sum(self.target_weight_used * self.scale_reg_loss_unreduced, name="losses/scale_reg_loss") + self.scale_reg_loss = tf.reduce_sum(input_tensor=self.target_weight_used * self.scale_reg_loss_unreduced, name="losses/scale_reg_loss") - self.weight_sum = tf.reduce_sum(self.target_weight_used, name="losses/weight_sum") + self.weight_sum = tf.reduce_sum(input_tensor=self.target_weight_used, name="losses/weight_sum") if for_optimization: #Prior/Regularization + tf.compat.v1.disable_v2_behavior() self.l2_reg_coeff = (placeholders["l2_reg_coeff"] if "l2_reg_coeff" in placeholders else tf.compat.v1.placeholder(tf.float32)) self.reg_loss_per_weight = self.l2_reg_coeff * ( @@ -1590,21 +1595,21 @@ def __init__(self,model,for_optimization,placeholders): class Metrics: def 
__init__(self,model,target_vars,include_debug_stats): #Training results - policy_target_idxs = tf.argmax(target_vars.policy_target, 1) - self.top1_prediction = tf.equal(tf.argmax(model.policy_output[:,:,0], 1), policy_target_idxs) - self.top4_prediction = tf.nn.in_top_k(model.policy_output[:,:,0],policy_target_idxs,4) + policy_target_idxs = tf.argmax(input=target_vars.policy_target, axis=1) + self.top1_prediction = tf.equal(tf.argmax(input=model.policy_output[:,:,0], axis=1), policy_target_idxs) + self.top4_prediction = tf.nn.in_top_k(predictions=model.policy_output[:,:,0],targets=policy_target_idxs,k=4) self.accuracy1_unreduced = tf.cast(self.top1_prediction, tf.float32) self.accuracy4_unreduced = tf.cast(self.top4_prediction, tf.float32) - self.value_entropy_unreduced = tf.nn.softmax_cross_entropy_with_logits_v2(labels=tf.nn.softmax(model.value_output,axis=1), logits=model.value_output) + self.value_entropy_unreduced = tf.nn.softmax_cross_entropy_with_logits(labels=tf.nn.softmax(model.value_output,axis=1), logits=model.value_output) self.value_conf_unreduced = 4 * tf.square(tf.nn.sigmoid(model.value_output[:,0] - model.value_output[:,1]) - 0.5) self.policy_target_entropy_unreduced = target_vars.policy_target_weight * ( - -tf.reduce_sum(target_vars.policy_target * tf.math.log(target_vars.policy_target+(1e-20)), axis=1) + -tf.reduce_sum(input_tensor=target_vars.policy_target * tf.math.log(target_vars.policy_target+(1e-20)), axis=1) ) - self.accuracy1 = tf.reduce_sum(target_vars.target_weight_used * self.accuracy1_unreduced, name="metrics/accuracy1") - self.accuracy4 = tf.reduce_sum(target_vars.target_weight_used * self.accuracy4_unreduced, name="metrics/accuracy4") - self.value_entropy = tf.reduce_sum(target_vars.target_weight_used * self.value_entropy_unreduced, name="metrics/value_entropy") - self.value_conf = tf.reduce_sum(target_vars.target_weight_used * self.value_conf_unreduced, name="metrics/value_conf") - self.policy_target_entropy = tf.reduce_sum(target_vars.target_weight_used * self.policy_target_entropy_unreduced, name="metrics/policy_target_entropy") + self.accuracy1 = tf.reduce_sum(input_tensor=target_vars.target_weight_used * self.accuracy1_unreduced, name="metrics/accuracy1") + self.accuracy4 = tf.reduce_sum(input_tensor=target_vars.target_weight_used * self.accuracy4_unreduced, name="metrics/accuracy4") + self.value_entropy = tf.reduce_sum(input_tensor=target_vars.target_weight_used * self.value_entropy_unreduced, name="metrics/value_entropy") + self.value_conf = tf.reduce_sum(input_tensor=target_vars.target_weight_used * self.value_conf_unreduced, name="metrics/value_conf") + self.policy_target_entropy = tf.reduce_sum(input_tensor=target_vars.target_weight_used * self.policy_target_entropy_unreduced, name="metrics/policy_target_entropy") # self.shortterm_value_error_mean_unreduced = target_vars.shortterm_diff_value # self.shortterm_score_error_mean_unreduced = target_vars.shortterm_diff_score @@ -1616,24 +1621,24 @@ def __init__(self,model,target_vars,include_debug_stats): if include_debug_stats: def reduce_norm(x, axis=None, keepdims=False): - return tf.sqrt(tf.reduce_mean(tf.square(x), axis=axis, keepdims=keepdims)) + return tf.sqrt(tf.reduce_mean(input_tensor=tf.square(x), axis=axis, keepdims=keepdims)) def reduce_stdev(x, axis=None, keepdims=False): - m = tf.reduce_mean(x, axis=axis, keepdims=True) + m = tf.reduce_mean(input_tensor=x, axis=axis, keepdims=True) devs_squared = tf.square(x - m) - return tf.sqrt(tf.reduce_mean(devs_squared, axis=axis, keepdims=keepdims)) + 
return tf.sqrt(tf.reduce_mean(input_tensor=devs_squared, axis=axis, keepdims=keepdims)) self.activated_prop_by_layer = dict([ - (name,tf.reduce_mean(tf.count_nonzero(layer,axis=[1,2])/layer.shape[1].value/layer.shape[2].value, axis=0)) for (name,layer) in model.outputs_by_layer + (name,tf.reduce_mean(input_tensor=tf.math.count_nonzero(layer,axis=[1,2])/layer.shape[1].value/layer.shape[2].value, axis=0)) for (name,layer) in model.outputs_by_layer ]) self.mean_output_by_layer = dict([ - (name,tf.reduce_mean(layer,axis=[0,1,2])) for (name,layer) in model.outputs_by_layer + (name,tf.reduce_mean(input_tensor=layer,axis=[0,1,2])) for (name,layer) in model.outputs_by_layer ]) self.stdev_output_by_layer = dict([ (name,reduce_stdev(layer,axis=[0,1,2])) for (name,layer) in model.outputs_by_layer ]) self.mean_weights_by_var = dict([ - (v.name,tf.reduce_mean(v)) for v in tf.compat.v1.trainable_variables() + (v.name,tf.reduce_mean(input_tensor=v)) for v in tf.compat.v1.trainable_variables() ]) self.norm_weights_by_var = dict([ (v.name,reduce_norm(v)) for v in tf.compat.v1.trainable_variables() @@ -1673,7 +1678,7 @@ def build_model_from_tfrecords_features(features,mode,print_model,trainlog,model bitmasks = tf.reshape(tf.constant([128,64,32,16,8,4,2,1],dtype=tf.uint8),[1,1,1,8]) binchw = tf.reshape(tf.bitwise.bitwise_and(tf.expand_dims(binchwp,axis=3),bitmasks),[-1,num_bin_input_features,((pos_len*pos_len+7)//8)*8]) binchw = binchw[:,:,:pos_len*pos_len] - binhwc = tf.cast(tf.transpose(binchw, [0,2,1]),tf.float32) + binhwc = tf.cast(tf.transpose(a=binchw, perm=[0,2,1]),tf.float32) binhwc = tf.math.minimum(binhwc,tf.constant(1.0)) placeholders["bin_inputs"] = binhwc @@ -1688,11 +1693,11 @@ def build_model_from_tfrecords_features(features,mode,print_model,trainlog,model placeholders["include_history"] = features["gtnc"][:,36:41] policy_target0 = features["ptncm"][:,0,:] - policy_target0 = policy_target0 / tf.reduce_sum(policy_target0,axis=1,keepdims=True) + policy_target0 = policy_target0 / tf.reduce_sum(input_tensor=policy_target0,axis=1,keepdims=True) placeholders["policy_target"] = policy_target0 placeholders["policy_target_weight"] = features["gtnc"][:,26] policy_target1 = features["ptncm"][:,1,:] - policy_target1 = policy_target1 / tf.reduce_sum(policy_target1,axis=1,keepdims=True) + policy_target1 = policy_target1 / tf.reduce_sum(input_tensor=policy_target1,axis=1,keepdims=True) placeholders["policy_target1"] = policy_target1 placeholders["policy_target_weight1"] = features["gtnc"][:,28] @@ -1705,7 +1710,7 @@ def build_model_from_tfrecords_features(features,mode,print_model,trainlog,model placeholders["scorebelief_target"] = features["sdn"] / 100.0 placeholders["ownership_target"] = features["vtnchw"][:,0] placeholders["scoring_target"] = features["vtnchw"][:,4] / 120.0 - placeholders["futurepos_target"] = tf.transpose(features["vtnchw"][:,2:4], [0,2,3,1]) + placeholders["futurepos_target"] = tf.transpose(a=features["vtnchw"][:,2:4], perm=[0,2,3,1]) placeholders["seki_target"] = features["vtnchw"][:,1] placeholders["target_weight_from_data"] = features["gtnc"][:,25] diff --git a/python/selfplay/distributed/download_and_upload_and_shuffle_and_export_loop.sh b/python/selfplay/distributed/download_and_upload_and_shuffle_and_export_loop.sh index a4f598596..d4f8a6a00 100755 --- a/python/selfplay/distributed/download_and_upload_and_shuffle_and_export_loop.sh +++ b/python/selfplay/distributed/download_and_upload_and_shuffle_and_export_loop.sh @@ -32,6 +32,11 @@ shift RATING_ONLY="$1" shift 
+PYTHON_BIN=python3 +if [ ${OS} == "Windows_NT" ] && [ ! -z "${CONDA_PYTHON_EXE}" ]; then + PYTHON_BIN=python +fi + #We're not really using gating, but the upload script expects them to be where gating would put them #and using gating disables the export script from making extraneous selfplay data dirs. USEGATING=1 @@ -69,7 +74,7 @@ cp -r "$GITROOTDIR"/python/selfplay "$DATED_ARCHIVE" cd "$basedir"/scripts while true do - time python3 ./summarize_old_selfplay_files.py "$basedir"/selfplay/ \ + time ${PYTHON_BIN} ./summarize_old_selfplay_files.py "$basedir"/selfplay/ \ -old-summary-file-to-assume-correct "$basedir"/selfplay.summary.json \ -new-summary-file "$basedir"/selfplay.summary.json.tmp mv "$basedir"/selfplay.summary.json.tmp "$basedir"/selfplay.summary.json diff --git a/python/selfplay/distributed/upload_model_for_selfplay.sh b/python/selfplay/distributed/upload_model_for_selfplay.sh index b3bbf475a..ac0a823a3 100755 --- a/python/selfplay/distributed/upload_model_for_selfplay.sh +++ b/python/selfplay/distributed/upload_model_for_selfplay.sh @@ -23,6 +23,12 @@ shift RATING_ONLY="$1" shift + +PYTHON_BIN=python3 +if [ ${OS} == "Windows_NT" ] && [ ! -z "${CONDA_PYTHON_EXE}" ]; then + PYTHON_BIN=python +fi + #------------------------------------------------------------------------------ mkdir -p "$BASEDIR"/modelstobetested @@ -84,7 +90,7 @@ function uploadStuff() { do set +e set -x - python3 ./upload_model.py \ + ${PYTHON_BIN} ./upload_model.py \ -run-name "$RUNNAME" \ -model-name "$RUNNAME"-"$NAME" \ -model-file "$TMPDST"/"$RUNNAME"-"$NAME".bin.gz \ diff --git a/python/selfplay/export_model_for_selfplay.sh b/python/selfplay/export_model_for_selfplay.sh index f9e308677..be28ff7c9 100755 --- a/python/selfplay/export_model_for_selfplay.sh +++ b/python/selfplay/export_model_for_selfplay.sh @@ -21,6 +21,11 @@ shift USEGATING="$1" shift +PYTHON_BIN=python3 +if [ ${OS} == "Windows_NT" ] && [ ! -z "${CONDA_PYTHON_EXE}" ]; then + PYTHON_BIN=python +fi + #------------------------------------------------------------------------------ mkdir -p "$BASEDIR"/tfsavedmodels_toexport @@ -64,7 +69,7 @@ function exportStuff() { mkdir "$TMPDST" set -x - python3 ./export_model.py \ + ${PYTHON_BIN} ./export_model.py \ -saved-model-dir "$SRC"/saved_model \ -export-dir "$TMPDST" \ -model-name "$NAMEPREFIX""-""$NAME" \ diff --git a/python/selfplay/shuffle.sh b/python/selfplay/shuffle.sh index 07eb9e9ec..732ecba99 100755 --- a/python/selfplay/shuffle.sh +++ b/python/selfplay/shuffle.sh @@ -23,6 +23,11 @@ shift BATCHSIZE="$1" shift +PYTHON_BIN=python3 +if [ ${OS} == "Windows_NT" ] && [ ! -z "${CONDA_PYTHON_EXE}" ]; then + PYTHON_BIN=python +fi + #------------------------------------------------------------------------------ OUTDIR=$(date "+%Y%m%d-%H%M%S") @@ -37,7 +42,7 @@ echo "Beginning shuffle at" $(date "+%Y-%m-%d %H:%M:%S") #set -x ( - time python3 ./shuffle.py \ + time ${PYTHON_BIN} ./shuffle.py \ "$BASEDIR"/selfplay/ \ -expand-window-per-row 0.4 \ -taper-window-exponent 0.65 \ diff --git a/python/selfplay/shuffle_loop.sh b/python/selfplay/shuffle_loop.sh index 6f75b0c71..66671fdb9 100755 --- a/python/selfplay/shuffle_loop.sh +++ b/python/selfplay/shuffle_loop.sh @@ -20,6 +20,11 @@ shift BATCHSIZE="$1" shift +PYTHON_BIN=python3 +if [ ${OS} == "Windows_NT" ] && [ ! 
-z "${CONDA_PYTHON_EXE}" ]; then + PYTHON_BIN=python +fi + GITROOTDIR="$(git rev-parse --show-toplevel)" basedir="$(realpath "$BASEDIRRAW")" @@ -42,7 +47,7 @@ cp -r "$GITROOTDIR"/python/selfplay "$DATED_ARCHIVE" while true do rm -f "$basedir"/selfplay.summary.json.tmp - time python3 ./summarize_old_selfplay_files.py "$basedir"/selfplay/ \ + time ${PYTHON_BIN} ./summarize_old_selfplay_files.py "$basedir"/selfplay/ \ -old-summary-file-to-assume-correct "$basedir"/selfplay.summary.json \ -new-summary-file "$basedir"/selfplay.summary.json.tmp mv "$basedir"/selfplay.summary.json.tmp "$basedir"/selfplay.summary.json diff --git a/python/selfplay/train.sh b/python/selfplay/train.sh index 904f690f5..77076cca5 100755 --- a/python/selfplay/train.sh +++ b/python/selfplay/train.sh @@ -28,6 +28,11 @@ shift EXPORTMODE="$1" shift +PYTHON_BIN=python3 +if [ ${OS} == "Windows_NT" ] && [ ! -z "${CONDA_PYTHON_EXE}" ]; then + PYTHON_BIN=python +fi + GITROOTDIR="$(git rev-parse --show-toplevel)" #------------------------------------------------------------------------------ @@ -65,7 +70,7 @@ else exit 1 fi -time python3 "$GITROOTDIR"/python/train.py \ +time ${PYTHON_BIN} "$GITROOTDIR"/python/train.py \ -traindir "$BASEDIR"/train/"$TRAININGNAME" \ -datadir "$BASEDIR"/shuffleddata/current/ \ -exportdir "$BASEDIR"/"$EXPORT_SUBDIR" \ diff --git a/python/shuffle.py b/python/shuffle.py index 71119fc86..f8cbf1709 100755 --- a/python/shuffle.py +++ b/python/shuffle.py @@ -16,7 +16,7 @@ import numpy as np import tensorflow as tf -from tensorflow.python_io import TFRecordOptions,TFRecordCompressionType,TFRecordWriter +from tensorflow.compat.v1.python_io import TFRecordOptions,TFRecordCompressionType,TFRecordWriter import tfrecordio diff --git a/python/test.py b/python/test.py index b01cea900..72a355c00 100644 --- a/python/test.py +++ b/python/test.py @@ -65,7 +65,7 @@ def log(s): dataset = dataset.flat_map(lambda fname: tf.data.TFRecordDataset(fname,compression_type="ZLIB")) parse_input = tfrecordio.make_tf_record_parser(model_config,pos_len,batch_size) dataset = dataset.map(parse_input) - iterator = dataset.make_one_shot_iterator() + iterator = tf.compat.v1.data.make_one_shot_iterator(dataset) features = iterator.get_next() elif using_npz: features = tfrecordio.make_raw_input_feature_placeholders(model_config,pos_len,batch_size) diff --git a/python/tfrecordio.py b/python/tfrecordio.py index a091c96a9..7728178b2 100644 --- a/python/tfrecordio.py +++ b/python/tfrecordio.py @@ -22,6 +22,7 @@ def make_raw_input_feature_placeholders(model_config,pos_len,batch_size): num_bin_input_features = Model.get_num_bin_input_features(model_config) num_global_input_features = Model.get_num_global_input_features(model_config) + tf.compat.v1.disable_v2_behavior() return { "binchwp": tf.compat.v1.placeholder(tf.uint8,[batch_size,num_bin_input_features,(pos_len*pos_len+7)//8]), "ginc": tf.compat.v1.placeholder(tf.float32,[batch_size,num_global_input_features]), @@ -40,8 +41,8 @@ def make_tf_record_parser(model_config,pos_len,batch_size,multi_num_gpus=None): raw_input_features = make_raw_input_features(model_config,pos_len,batch_size) def parse_input(serialized_example): - example = tf.io.parse_single_example(serialized_example,raw_input_features) - binchwp = tf.decode_raw(example["binchwp"],tf.uint8) + example = tf.io.parse_single_example(serialized=serialized_example,features=raw_input_features) + binchwp = tf.io.decode_raw(example["binchwp"],tf.uint8) ginc = example["ginc"] ptncm = example["ptncm"] gtnc = example["gtnc"] diff --git 
a/python/train.py b/python/train.py index 3b976f300..ca0e7fd86 100755 --- a/python/train.py +++ b/python/train.py @@ -17,6 +17,7 @@ import numpy as np import itertools import copy +import tf_slim import data from board import Board @@ -151,6 +152,9 @@ def trainlog(s): multi_gpu_device_ids.append("/GPU:" + str(int(piece))) num_gpus_used = len(multi_gpu_device_ids) +# Fix for tensorflow 2.4: Not creating XLA devices, tf_xla_enable_xla_devices not set +os.environ['TF_XLA_FLAGS'] = '--tf_xla_enable_xla_devices' + # MODEL ---------------------------------------------------------------- printed_model_yet = False @@ -172,6 +176,7 @@ def trainlog(s): assign_ops = [] for variable in itertools.chain(tf.compat.v1.model_variables(), tf.compat.v1.trainable_variables()): if variable.name.startswith("swa_model/"): + tf.compat.v1.disable_v2_behavior() placeholder = tf.compat.v1.placeholder(variable.dtype,variable.shape) assign_ops.append(tf.compat.v1.assign(variable,placeholder)) swa_assign_placeholders[variable.name] = placeholder @@ -261,7 +266,8 @@ def model_fn(features,labels,mode,params): synchronization=tf.VariableSynchronization.ON_READ, aggregation=tf.VariableAggregation.SUM ) - wsum_op = tf.assign_add(wsum,target_vars.weight_sum) + #wsum_op = tf.assign_add(wsum,target_vars.weight_sum) + wsum_op = wsum.assign_add(target_vars.weight_sum) eval_metric_ops={ #"wsum": (wsum.read_value(),wsum_op), "p0loss": tf.compat.v1.metrics.mean(target_vars.policy_loss_unreduced, weights=target_vars.target_weight_used), @@ -300,8 +306,8 @@ def model_fn(features,labels,mode,params): printed_model_yet = True def moving_mean(name,x,weights): - sumwx = tf.reduce_sum(x*weights,name="printstats/wx/"+name) - sumw = tf.reduce_sum(weights,name="printstats/w/"+name) + sumwx = tf.reduce_sum(input_tensor=x*weights,name="printstats/wx/"+name) + sumw = tf.reduce_sum(input_tensor=weights,name="printstats/w/"+name) moving_wx = tf.compat.v1.get_variable(initializer=tf.zeros([]),name=(name+"/moving_wx"),trainable=False) moving_w = tf.compat.v1.get_variable(initializer=tf.zeros([]),name=(name+"/moving_w"),trainable=False) @@ -413,7 +419,7 @@ def moving_mean(name,x,weights): break if checkpoint_path is not None: print("Initial weights checkpoint to use found at: " + checkpoint_path) - vars_in_checkpoint = tf.contrib.framework.list_variables(checkpoint_path) + vars_in_checkpoint = tf_slim.list_variables(checkpoint_path) varname_in_checkpoint = {} print("Checkpoint contains:") for varandshape in vars_in_checkpoint: diff --git a/python/visualize.py b/python/visualize.py index 14ac47d4c..1605880ca 100644 --- a/python/visualize.py +++ b/python/visualize.py @@ -46,7 +46,7 @@ def log(s): pos_len = 19 # shouldn't matter, all we're doing is exporting weights that don't depend on this if name_scope is not None: - with tf.name_scope(name_scope): + with tf.compat.v1.name_scope(name_scope): model = Model(model_config,pos_len,{}) else: model = Model(model_config,pos_len,{}) @@ -59,7 +59,7 @@ def volume(variable): return variable_parameters total_parameters = 0 -for variable in tf.global_variables(): +for variable in tf.compat.v1.global_variables(): variable_parameters = volume(variable) total_parameters += variable_parameters log("Model variable %s, %d parameters" % (variable.name,variable_parameters)) @@ -120,7 +120,7 @@ def run(fetches): if show_all_weight_magnitudes: print("name,sumsq,l2regstrength,meansq,rms") - for variable in tf.trainable_variables(): + for variable in tf.compat.v1.trainable_variables(): values = np.array(variable.eval()) sq = 
np.square(values) reg = np.sum(sq) if any(v.name == variable.name for v in model.reg_variables) else 0
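
For reference, the recurring pattern in the model.py, train.py, and visualize.py hunks above is the TF1-to-TF2 compat migration: keep the existing graph-mode code, disable TF2 behavior, route removed APIs through `tf.compat.v1`, and spell out the renamed keyword arguments (`input_tensor=` for reductions, `pred=`/`true_fn=`/`false_fn=` for `tf.cond`, `softmax_cross_entropy_with_logits` instead of the `_v2` variant). The following is a minimal, self-contained sketch of that pattern only, not part of the patch; it assumes TensorFlow 2.4 is installed, and the tensor names used are made up for illustration.

```python
# Illustrative sketch of the TF1 -> TF2 compat pattern this patch applies (not part of the patch).
import numpy as np
import tensorflow as tf

# Keep the existing graph-mode code working on TF2: disable v2 behavior
# before any placeholders are created, as the diff does in model.py.
tf.compat.v1.disable_v2_behavior()

# Placeholders still work through tf.compat.v1 once v2 behavior is disabled.
x = tf.compat.v1.placeholder(tf.float32, [None, 4], name="x")
labels = tf.compat.v1.placeholder(tf.float32, [None, 4], name="labels")
is_training = tf.compat.v1.placeholder(tf.bool, [], name="is_training")

# TF2 renamed/removed positional arguments, so calls are spelled out with keywords.
mean = tf.reduce_mean(input_tensor=x, axis=0)
centered = tf.cond(pred=is_training,
                   true_fn=lambda: x - mean,
                   false_fn=lambda: x)

# softmax_cross_entropy_with_logits_v2 is simply softmax_cross_entropy_with_logits in TF2.
loss = tf.reduce_sum(
    input_tensor=tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=centered))

with tf.compat.v1.Session() as session:
    out = session.run(loss, feed_dict={
        x: np.random.rand(8, 4).astype(np.float32),
        labels: np.full((8, 4), 0.25, dtype=np.float32),
        is_training: True,
    })
    print("loss:", out)
```

Disabling v2 behavior keeps placeholders, sessions, and the moving-average update ops working, which is why the diff inserts `tf.compat.v1.disable_v2_behavior()` before the first placeholder is built rather than rewriting the model in eager/Keras style.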