Add method to start prep_train_data job

lewfish · lewfish · commit 024b5f5df2fa · 2017-10-18T11:16:08.000-04:00
* Adds a method to start a prep_train_data raster-vision job, which makes
it easy to generate a training data zip file given a list of projects and annotation
URIs.
* Moves some raster-vision utility functions and constants into utils.py and settings.py files.
diff --git a/rasterfoundry/api.py b/rasterfoundry/api.py
@@ -1,4 +1,5 @@
 import os
+import uuid
 
 from bravado.requests_client import RequestsClient
 from bravado.client import SwaggerClient
@@ -7,7 +8,9 @@
 
 from .models import Project, MapToken
 from .exceptions import RefreshTokenException
-
+from .utils import start_raster_vision_job, upload_raster_vision_config
+from .settings import (
+    RV_CPU_JOB_DEF, RV_CPU_QUEUE, DEVELOP_BRANCH, RV_CONFIG_URI_ROOT)
 
 SPEC_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                          'spec.yml')
@@ -125,3 +128,67 @@ def get_scenes(self, **kwargs):
         elif bbox and type(bbox) != type(','.join(str(x) for x in bbox)): # NOQA
             kwargs['bbox'] = ','.join(str(x) for x in bbox)
         return self.client.Imagery.get_scenes(**kwargs).result()
+
+    def get_project_configs(self, project_ids, annotation_uris):
+        """Get data needed to create project config file for prep_train_data
+
+        The prep_train_data script requires a project config file listing
+        the image URIs and annotation URI of each project used for training.
+
+        Args:
+            project_ids: list of project ids to make training data from
+            annotation_uris: list of annotation URIs, paired by position
+                with project_ids (zip stops at the shorter of the two)
+
+        Returns:
+            List of form [{'images': [...], 'annotations': ...}, ...]
+        """
+        project_configs = []
+        for project_id, annotation_uri in zip(project_ids, annotation_uris):
+            proj = Project(
+                self.client.Imagery.get_projects_uuid(uuid=project_id).result(),
+                self)
+            image_uris = proj.get_image_source_uris()
+            project_configs.append({
+                'images': image_uris,
+                'annotations': annotation_uri
+            })
+
+        return project_configs
+
+    def start_prep_train_data_job(self, project_ids, annotation_uris,
+                                  output_zip_uri,
+                                  config_uri_root=RV_CONFIG_URI_ROOT,
+                                  job_queue=RV_CPU_QUEUE,
+                                  job_definition=RV_CPU_JOB_DEF,
+                                  branch_name=DEVELOP_BRANCH, attempts=1):
+        """Start a Batch job to prepare object detection training data.
+
+        Args:
+            project_ids (list of str): ids of projects to make train data for
+            annotation_uris (list of str): one annotation URI per project id
+            output_zip_uri (str): URI of output zip file
+            config_uri_root (str): root URI under which the config is uploaded
+            job_queue (str): name of the Batch job queue to run the job in
+            job_definition (str): name of the Batch job definition
+            branch_name (str): branch of the raster-vision repo to use
+            attempts (int): number of attempts for the Batch job
+
+        Returns:
+            job_id (str): job_id of job started on Batch
+        """
+        project_configs = self.get_project_configs(
+            project_ids, annotation_uris)
+        config_uri = upload_raster_vision_config(
+            project_configs, config_uri_root)
+
+        command = ('python -m rv.run prep_train_data --debug ' +
+                   '--chip-size 300 --num-neg-chips 100 ' +
+                   '--max-attempts 500 {} {}').format(
+                       config_uri, output_zip_uri)
+
+        job_name = 'prep_train_data_{}'.format(uuid.uuid1())
+        job_id = start_raster_vision_job(
+            job_name, command, job_queue, job_definition, branch_name,
+            attempts=attempts)
+        return job_id
diff --git a/rasterfoundry/models/project.py b/rasterfoundry/models/project.py
@@ -2,56 +2,21 @@
 import requests
 import uuid
 
-import boto3
-
 from .. import NOTEBOOK_SUPPORT
 from ..decorators import check_notebook
 from ..exceptions import GatewayTimeoutException
+from ..utils import start_raster_vision_job
+from ..settings import RV_CPU_JOB_DEF, RV_CPU_QUEUE, DEVELOP_BRANCH
 from .map_token import MapToken
 
+
 if NOTEBOOK_SUPPORT:
     from ipyleaflet import (
         Map,
         SideBySideControl,
         TileLayer,
     )
 
-RV_CPU_QUEUE = 'raster-vision-cpu'
-RV_CPU_JOB_DEF = 'raster-vision-cpu'
-DEVELOP_BRANCH = 'develop'
-
-
-def start_raster_vision_job(job_name, command, job_queue=RV_CPU_QUEUE,
-                            job_definition=RV_CPU_JOB_DEF,
-                            branch_name=DEVELOP_BRANCH, attempts=1):
-    """Start a raster-vision Batch job.
-
-    Args:
-        job_name (str): name of the Batch job
-        command (str): command to run inside the Docker container
-        job_queue (str): name of the Batch job queue to run the job in
-        job_definition (str): name of the Batch job definition
-        branch_name (str): branch of the raster-vision repo to use
-        attempts (int): number of attempts for the Batch job
-
-    Returns:
-        job_id (str): job_id of job started on Batch
-    """
-    batch_client = boto3.client('batch')
-    # `run_script.sh $branch_name $command` downloads a branch of the
-    # raster-vision repo and then runs the command.
-    job_command = ['run_script.sh', branch_name, command]
-    job_id = batch_client.submit_job(
-        jobName=job_name, jobQueue=job_queue, jobDefinition=job_definition,
-        containerOverrides={
-            'command': job_command
-        },
-        retryStrategy={
-            'attempts': attempts
-        })['jobId']
-
-    return job_id
-
 
 class Project(object):
     """A Raster Foundry project"""
diff --git a/rasterfoundry/settings.py b/rasterfoundry/settings.py
@@ -0,0 +1,4 @@
+RV_CPU_QUEUE = 'raster-vision-cpu'  # default Batch job queue name
+RV_CPU_JOB_DEF = 'raster-vision-cpu'  # default Batch job definition name
+RV_CONFIG_URI_ROOT = 's3://raster-vision/datasets/detection/configs'  # default root for uploaded configs  # noqa
+DEVELOP_BRANCH = 'develop'  # default raster-vision branch for jobs
diff --git a/rasterfoundry/utils.py b/rasterfoundry/utils.py
@@ -0,0 +1,65 @@
+from future.standard_library import install_aliases  # noqa
+install_aliases()  # noqa
+from urllib.parse import urlparse
+from os.path import join
+import tempfile
+import uuid
+import json
+
+import boto3
+
+
+def start_raster_vision_job(job_name, command, job_queue, job_definition,
+                            branch_name, attempts=1):
+    """Submit a raster-vision job to AWS Batch using boto3.
+
+    Args:
+        job_name (str): name of the Batch job
+        command (str): command to run inside the Docker container
+        job_queue (str): name of the Batch job queue to run the job in
+        job_definition (str): name of the Batch job definition
+        branch_name (str): branch of the raster-vision repo to use
+        attempts (int): 'attempts' value passed in the Batch retryStrategy
+
+    Returns:
+        job_id (str): the 'jobId' field of the submit_job response
+    """
+    batch_client = boto3.client('batch')
+    # `run_script.sh $branch_name $command` downloads a branch of the
+    # raster-vision repo and then runs the command.
+    job_command = ['run_script.sh', branch_name, command]
+    job_id = batch_client.submit_job(
+        jobName=job_name, jobQueue=job_queue, jobDefinition=job_definition,
+        containerOverrides={
+            'command': job_command
+        },
+        retryStrategy={
+            'attempts': attempts
+        })['jobId']
+
+    return job_id
+
+
+def upload_raster_vision_config(config_dict, config_uri_root):
+    """Serialize a config object to JSON and upload it to S3.
+
+    Args:
+        config_dict: JSON-serializable object to write to the uploaded file
+        config_uri_root: the root of the URI to upload the config to
+
+    Returns:
+        remote URI of the config file, generated using a UUID
+    """
+    with tempfile.NamedTemporaryFile('w') as config_file:
+        json.dump(config_dict, config_file)
+        config_uri = join(
+            config_uri_root, '{}.json'.format(uuid.uuid1()))
+        s3 = boto3.resource('s3')
+        parsed_uri = urlparse(config_uri)
+        # Rewind before uploading; without this line 0 bytes are uploaded
+        # (NOTE(review): presumably because seek() flushes the write buffer).
+        config_file.seek(0)
+        s3.meta.client.upload_file(
+            config_file.name, parsed_uri.netloc, parsed_uri.path[1:])
+
+        return config_uri
diff --git a/setup.py b/setup.py
@@ -25,7 +25,8 @@
         'pyasn1 >= 0.2.3',
         'requests >= 2.9.1',
         'bravado >= 8.4.0',
-        'boto3 >= 1.4.4'
+        'boto3 >= 1.4.4',
+        'future >= 0.16.0'
     ],
     extras_require={
         'notebook': [