diff --git a/benchmark.py b/benchmark.py
index 212da08..4a99c59 100644
--- a/benchmark.py
+++ b/benchmark.py
@@ -3,9 +3,14 @@
 
 import argparse
 import math
+import os
+import glob
 import time
+
 import torch
+import torch.utils.cpp_extension
+import pkg_resources
 
 TIME_SCALES = {'s': 1, 'ms': 1000, 'us': 1000000}
 
@@ -20,6 +25,7 @@
 parser.add_argument('-d', '--double', action='store_true')
 options = parser.parse_args()
 
+LIB_EXT = torch.utils.cpp_extension.LIB_EXT
 if options.example == 'py':
     from python.lltm import LLTM
 elif options.example == 'cpp':
diff --git a/check.py b/check.py
index 8fad6d1..743ee12 100644
--- a/check.py
+++ b/check.py
@@ -3,11 +3,13 @@
 
 import argparse
 import numpy as np
+import os
+import glob
 import torch
+import torch.utils.cpp_extension
+import pkg_resources
 
 import python.lltm_baseline
-import cpp.lltm
-
 
 def check_equal(first, second, verbose):
     if verbose:
@@ -19,8 +21,7 @@ def check_equal(first, second, verbose):
             print("x = {}".format(x.flatten()))
             print("y = {}".format(y.flatten()))
             print('-' * 80)
-        np.testing.assert_allclose(x, y, err_msg="Index: {}".format(i))
-
+        np.testing.assert_allclose(x, y, rtol=2e-6, atol=2e-7, err_msg="Index: {}".format(i))
 
 def zero_grad(variables):
     for variable in variables:
@@ -33,14 +34,16 @@ def get_grads(variables):
 
 def check_forward(variables, with_cuda, verbose):
     baseline_values = python.lltm_baseline.LLTMFunction.apply(*variables)
-    cpp_values = cpp.lltm.LLTMFunction.apply(*variables)
+    cpp_variables = [v.cpu() for v in variables]
+    cpp_values = torch.ops.myops.lltm(*cpp_variables)
 
     print('Forward: Baseline (Python) vs. C++ ... ', end='')
     check_equal(baseline_values, cpp_values, verbose)
     print('Ok')
 
     if with_cuda:
-        cuda_values = cuda.lltm.LLTMFunction.apply(*variables)
+        cuda_variables = [v.cuda() for v in variables]
+        cuda_values = torch.ops.myops.lltm(*cuda_variables)
         print('Forward: Baseline (Python) vs. CUDA ... ', end='')
         check_equal(baseline_values, cuda_values, verbose)
         print('Ok')
@@ -53,7 +56,7 @@ def check_backward(variables, with_cuda, verbose):
 
     zero_grad(variables)
 
-    cpp_values = cpp.lltm.LLTMFunction.apply(*variables)
+    cpp_values = torch.ops.myops.lltm(*variables)
     (cpp_values[0] + cpp_values[1]).sum().backward()
     grad_cpp = get_grads(variables)
 
@@ -63,7 +66,7 @@ def check_backward(variables, with_cuda, verbose):
 
     if with_cuda:
         zero_grad(variables)
 
-        cuda_values = cuda.lltm.LLTMFunction.apply(*variables)
+        cuda_values = torch.ops.myops.lltm(*variables)
         (cuda_values[0] + cuda_values[1]).sum().backward()
         grad_cuda = get_grads(variables)
@@ -81,9 +84,22 @@ def check_backward(variables, with_cuda, verbose):
 parser.add_argument('-v', '--verbose', action='store_true')
 options = parser.parse_args()
 
+LIB_EXT = torch.utils.cpp_extension.LIB_EXT
+cpp_module_path = os.path.dirname(
+    pkg_resources.resource_filename(
+        pkg_resources.Requirement.parse('lltm_cpp'), "lltm_cpp.py"))
+cpp_lib_path = glob.glob(os.path.join(cpp_module_path, f"lltm_cpp*{LIB_EXT}"))[0]
+torch.ops.load_library(cpp_lib_path)
+
 if options.cuda:
     import cuda.lltm
     device = torch.device("cuda")
+
+    cuda_module_path = os.path.dirname(
+        pkg_resources.resource_filename(
+            pkg_resources.Requirement.parse('lltm_cuda'), "lltm_cuda.py"))
+    cuda_lib_path = glob.glob(os.path.join(cuda_module_path, f"lltm_cuda*{LIB_EXT}"))[0]
+    torch.ops.load_library(cuda_lib_path)
 else:
     device = torch.device("cpu")
 
@@ -100,6 +116,7 @@ def check_backward(variables, with_cuda, verbose):
 
 variables = [X, W, b, h, C]
 
+
 if 'forward' in options.direction:
     check_forward(variables, options.cuda, options.verbose)
 
diff --git a/cpp/lltm.cpp b/cpp/lltm.cpp
index 9bdfe0c..37f93f9 100644
--- a/cpp/lltm.cpp
+++ b/cpp/lltm.cpp
@@ -26,6 +26,7 @@ std::vector<torch::Tensor> lltm_forward(
     torch::Tensor bias,
     torch::Tensor old_h,
     torch::Tensor old_cell) {
+
   auto X = torch::cat({old_h, input}, /*dim=*/1);
 
   auto gate_weights = torch::addmm(bias, X, weights.transpose(0, 1));
@@ -84,7 +85,81 @@ std::vector<torch::Tensor> lltm_backward(
   return {d_old_h, d_input, d_weights, d_bias, d_old_cell};
 }
 
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def("forward", &lltm_forward, "LLTM forward");
-  m.def("backward", &lltm_backward, "LLTM backward");
+std::vector<torch::Tensor> lltm_op(torch::Tensor input,
+                                   torch::Tensor weights,
+                                   torch::Tensor bias,
+                                   torch::Tensor old_h,
+                                   torch::Tensor old_cell){
+  static auto op = torch::Dispatcher::singleton()
+      .findSchemaOrThrow("myops::lltm", "")
+      .typed<decltype(lltm_op)>();
+  return op.call(input, weights, bias, old_h, old_cell);
+}
+
+std::vector<torch::Tensor> lltm_op_backward(torch::Tensor grad_h,
+                                            torch::Tensor grad_cell,
+                                            torch::Tensor new_cell,
+                                            torch::Tensor input_gate,
+                                            torch::Tensor output_gate,
+                                            torch::Tensor candidate_cell,
+                                            torch::Tensor X,
+                                            torch::Tensor gate_weights,
+                                            torch::Tensor weights){
+  static auto op = torch::Dispatcher::singleton()
+      .findSchemaOrThrow("myops::lltm", "backward")
+      .typed<decltype(lltm_op_backward)>();
+  return op.call(grad_h, grad_cell, new_cell, input_gate,
+                 output_gate, candidate_cell, X, gate_weights, weights);
+}
+
+class LLTMFunction : public torch::autograd::Function<LLTMFunction> {
+public:
+  static std::vector<torch::Tensor> forward(torch::autograd::AutogradContext *ctx,
+                                            torch::Tensor input,
+                                            torch::Tensor weights,
+                                            torch::Tensor bias,
+                                            torch::Tensor old_h,
+                                            torch::Tensor old_cell){
+    at::AutoDispatchBelowADInplaceOrView g;
+    std::vector<torch::Tensor> outputs = lltm_op(input, weights, bias, old_h, old_cell);
+    ctx->save_for_backward({outputs[1], outputs[2], outputs[3],
+                            outputs[4], outputs[5], outputs[6], weights});
+
+    return {outputs[0], outputs[1]};
+  }
+
+  static torch::autograd::tensor_list backward(torch::autograd::AutogradContext *ctx,
+                                               torch::autograd::tensor_list grad_outputs){
+    auto saved = ctx->get_saved_variables();
+    auto outputs = lltm_op_backward(grad_outputs[0].contiguous(),
+                                    grad_outputs[1].contiguous(),
+                                    saved[0], saved[1], saved[2], saved[3],
+                                    saved[4], saved[5], saved[6]);
+    return {outputs[1], outputs[2], outputs[3], outputs[0], outputs[4]};
+  }
+};
+
+std::vector<torch::Tensor> lltm_autograd(torch::Tensor input,
+                                         torch::Tensor weights,
+                                         torch::Tensor bias,
+                                         torch::Tensor old_h,
+                                         torch::Tensor old_cell) {
+  return LLTMFunction::apply(input, weights, bias, old_h, old_cell);
+}
+
+TORCH_LIBRARY(myops, m){
+  m.def("lltm(Tensor input, Tensor weights, Tensor bias, Tensor old_h, Tensor old_cell)" \
+        "-> Tensor[]");
+  m.def("lltm.backward(Tensor grad_h, Tensor grad_cell, Tensor new_cell, " \
+        "Tensor input_gate, Tensor output_gate, Tensor candidate_cell, Tensor X, " \
+        "Tensor gate_weights, Tensor weights) -> Tensor[]");
+}
+
+TORCH_LIBRARY_IMPL(myops, CPU, m){
+  m.impl(TORCH_SELECTIVE_NAME("lltm"), TORCH_FN(lltm_forward));
+  m.impl(TORCH_SELECTIVE_NAME("lltm.backward"), TORCH_FN(lltm_backward));
+}
+
+TORCH_LIBRARY_IMPL(myops, Autograd, m) {
+  m.impl("lltm", lltm_autograd);
 }
diff --git a/cpp/lltm.py b/cpp/lltm.py
index 24cf82d..79d6e87 100644
--- a/cpp/lltm.py
+++ b/cpp/lltm.py
@@ -1,30 +1,23 @@
 import math
+import os
 from torch import nn
 from torch.autograd import Function
+import glob
 import torch
-
-import lltm_cpp
+import torch.utils.cpp_extension
+import pkg_resources
+
+# Get the location of shared library for the lltm op, and load it.
+LIB_EXT = torch.utils.cpp_extension.LIB_EXT
+cpp_module_path = os.path.dirname(
+    pkg_resources.resource_filename(
+        pkg_resources.Requirement.parse('lltm_cpp'), "lltm_cpp.py"))
+cpp_lib_path = glob.glob(
+    os.path.join(cpp_module_path, f"lltm_cpp*{LIB_EXT}"))[0]
+torch.ops.load_library(cpp_lib_path)
 
 torch.manual_seed(42)
 
-
-class LLTMFunction(Function):
-    @staticmethod
-    def forward(ctx, input, weights, bias, old_h, old_cell):
-        outputs = lltm_cpp.forward(input, weights, bias, old_h, old_cell)
-        new_h, new_cell = outputs[:2]
-        variables = outputs[1:] + [weights]
-        ctx.save_for_backward(*variables)
-
-        return new_h, new_cell
-
-    @staticmethod
-    def backward(ctx, grad_h, grad_cell):
-        d_old_h, d_input, d_weights, d_bias, d_old_cell = lltm_cpp.backward(
-            grad_h, grad_cell, *ctx.saved_variables)
-        return d_input, d_weights, d_bias, d_old_h, d_old_cell
-
-
 class LLTM(nn.Module):
     def __init__(self, input_features, state_size):
         super(LLTM, self).__init__()
@@ -41,4 +34,4 @@ def reset_parameters(self):
         weight.data.uniform_(-stdv, +stdv)
 
     def forward(self, input, state):
-        return LLTMFunction.apply(input, self.weights, self.bias, *state)
+        return torch.ops.myops.lltm(input, self.weights, self.bias, *state)
diff --git a/cpp/setup.py b/cpp/setup.py
index 7a4c164..d663805 100644
--- a/cpp/setup.py
+++ b/cpp/setup.py
@@ -4,7 +4,7 @@
 setup(
     name='lltm_cpp',
     ext_modules=[
-        CppExtension('lltm_cpp', ['lltm.cpp']),
+        CppExtension('lltm_cpp', ['lltm.cpp'], library_dirs=['/lib/x86_64-linux-gnu/'], runtime_library_dirs=['/lib/x86_64-linux-gnu/']),
     ],
     cmdclass={
         'build_ext': BuildExtension
diff --git a/cuda/lltm.py b/cuda/lltm.py
index c740b88..5fa07f2 100644
--- a/cuda/lltm.py
+++ b/cuda/lltm.py
@@ -1,31 +1,29 @@
 import math
+import os
 from torch import nn
 from torch.autograd import Function
 import torch
-
-import lltm_cuda
+import glob
+import torch.utils.cpp_extension
+import pkg_resources
+
+# Get the location of shared library for the lltm op, and load it.
+LIB_EXT = torch.utils.cpp_extension.LIB_EXT
+# Note: currently there is a dependency on the CPP lib, due to the schema definition
+# Eventually, this should move to use a single library registering both CPP and CUDA ops
+cpp_module_path = os.path.dirname(
+    pkg_resources.resource_filename(
+        pkg_resources.Requirement.parse('lltm_cpp'), "lltm_cpp.py"))
+cpp_lib_path = glob.glob(os.path.join(cpp_module_path, f"lltm_cpp*{LIB_EXT}"))[0]
+torch.ops.load_library(cpp_lib_path)
+cuda_module_path = os.path.dirname(
+    pkg_resources.resource_filename(
+        pkg_resources.Requirement.parse('lltm_cuda'), "lltm_cuda.py"))
+cuda_lib_path = glob.glob(os.path.join(cuda_module_path, f"lltm_cuda*{LIB_EXT}"))[0]
+torch.ops.load_library(cuda_lib_path)
 
 torch.manual_seed(42)
 
-
-class LLTMFunction(Function):
-    @staticmethod
-    def forward(ctx, input, weights, bias, old_h, old_cell):
-        outputs = lltm_cuda.forward(input, weights, bias, old_h, old_cell)
-        new_h, new_cell = outputs[:2]
-        variables = outputs[1:] + [weights]
-        ctx.save_for_backward(*variables)
-
-        return new_h, new_cell
-
-    @staticmethod
-    def backward(ctx, grad_h, grad_cell):
-        outputs = lltm_cuda.backward(
-            grad_h.contiguous(), grad_cell.contiguous(), *ctx.saved_variables)
-        d_old_h, d_input, d_weights, d_bias, d_old_cell, d_gates = outputs
-        return d_input, d_weights, d_bias, d_old_h, d_old_cell
-
-
 class LLTM(nn.Module):
     def __init__(self, input_features, state_size):
         super(LLTM, self).__init__()
@@ -42,4 +40,4 @@ def reset_parameters(self):
         weight.data.uniform_(-stdv, +stdv)
 
     def forward(self, input, state):
-        return LLTMFunction.apply(input, self.weights, self.bias, *state)
+        return torch.ops.myops.lltm(input, self.weights, self.bias, *state)
diff --git a/cuda/lltm_cuda.cpp b/cuda/lltm_cuda.cpp
index 2434776..e907161 100644
--- a/cuda/lltm_cuda.cpp
+++ b/cuda/lltm_cuda.cpp
@@ -35,6 +35,7 @@ std::vector<torch::Tensor> lltm_forward(
     torch::Tensor bias,
     torch::Tensor old_h,
     torch::Tensor old_cell) {
+
   CHECK_INPUT(input);
   CHECK_INPUT(weights);
   CHECK_INPUT(bias);
@@ -75,7 +76,8 @@ std::vector<torch::Tensor> lltm_backward(
       weights);
 }
 
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def("forward", &lltm_forward, "LLTM forward (CUDA)");
-  m.def("backward", &lltm_backward, "LLTM backward (CUDA)");
+TORCH_LIBRARY_IMPL(myops, CUDA, m){
+  m.impl(TORCH_SELECTIVE_NAME("lltm"), TORCH_FN(lltm_forward));
+  m.impl(TORCH_SELECTIVE_NAME("lltm.backward"), TORCH_FN(lltm_backward));
 }
+
diff --git a/grad_check.py b/grad_check.py
index caf3b36..5f097fc 100644
--- a/grad_check.py
+++ b/grad_check.py
@@ -2,8 +2,11 @@
 from __future__ import print_function
 
 import argparse
+import os
+import pkg_resources
 import torch
 from torch.autograd import gradcheck
+import glob
 
 parser = argparse.ArgumentParser()
 parser.add_argument('example', choices=['py', 'cpp', 'cuda'])
@@ -13,13 +16,26 @@
 parser.add_argument('-c', '--cuda', action='store_true')
 options = parser.parse_args()
 
+cpp_module_path = os.path.dirname(
+    pkg_resources.resource_filename(
+        pkg_resources.Requirement.parse('lltm_cpp'), "lltm_cpp.py"))
+cpp_lib_path = glob.glob(os.path.join(cpp_module_path, "lltm_cpp*.so"))[0]
+torch.ops.load_library(cpp_lib_path)
+
+cuda_module_path = os.path.dirname(
+    pkg_resources.resource_filename(
+        pkg_resources.Requirement.parse('lltm_cuda'), "lltm_cuda.py"))
+cuda_lib_path = glob.glob(os.path.join(cuda_module_path, "lltm_cuda*.so"))[0]
+torch.ops.load_library(cuda_lib_path)
+
+
 if options.example == 'py':
     from python.lltm_baseline import LLTMFunction
-elif options.example == 'cpp':
-    from cpp.lltm import LLTMFunction
+    lltm_func = LLTMFunction.apply
 else:
-    from cuda.lltm import LLTMFunction
-    options.cuda = True
+    lltm_func = torch.ops.myops.lltm
+
+options.cuda |= (options.example == "cuda")
 
 device = torch.device("cuda") if options.cuda else torch.device("cpu")
@@ -30,11 +46,13 @@
 X = torch.randn(options.batch_size, options.features, **kwargs)
 h = torch.randn(options.batch_size, options.state_size, **kwargs)
 C = torch.randn(options.batch_size, options.state_size, **kwargs)
-W = torch.randn(3 * options.state_size, options.features + options.state_size, **kwargs)
+W = torch.randn(3 * options.state_size,
+                options.features + options.state_size,
+                **kwargs)
 b = torch.randn(1, 3 * options.state_size, **kwargs)
 
 variables = [X, W, b, h, C]
 
-if gradcheck(LLTMFunction.apply, variables):
+if gradcheck(lltm_func, variables):
     print('Ok')
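
For reference, a minimal usage sketch of the registered operator, assuming the lltm_cpp package built by cpp/setup.py is installed in the current environment. It reuses the library-loading pattern from the scripts above; the tensor sizes are arbitrary and only chosen to match the shapes used in check.py and grad_check.py.

import glob
import os

import pkg_resources
import torch
import torch.utils.cpp_extension

# Locate and load the shared library that registers myops::lltm (same pattern as above).
LIB_EXT = torch.utils.cpp_extension.LIB_EXT
cpp_module_path = os.path.dirname(
    pkg_resources.resource_filename(
        pkg_resources.Requirement.parse('lltm_cpp'), "lltm_cpp.py"))
torch.ops.load_library(
    glob.glob(os.path.join(cpp_module_path, f"lltm_cpp*{LIB_EXT}"))[0])

# Arbitrary sizes; double precision keeps the numerics close to the checking scripts.
batch_size, features, state_size = 3, 17, 5
kwargs = {'dtype': torch.float64, 'requires_grad': True}
X = torch.randn(batch_size, features, **kwargs)
W = torch.randn(3 * state_size, features + state_size, **kwargs)
b = torch.randn(1, 3 * state_size, **kwargs)
h = torch.randn(batch_size, state_size, **kwargs)
C = torch.randn(batch_size, state_size, **kwargs)

# The op returns a list of tensors; the first two are new_h and new_cell.
new_h, new_cell = torch.ops.myops.lltm(X, W, b, h, C)[:2]
(new_h.sum() + new_cell.sum()).backward()  # gradients flow through the Autograd kernel
print(W.grad.shape)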