Skip to content

Commit

Permalink
🔀 Merge branch 'round_grids_to_250' (#155)
Browse files Browse the repository at this point in the history
Closes #155. Re-grids and re-tiles the groundtruth data, with coordinates rounded to the nearest 250.
  • Loading branch information
weiji14 committed Jun 21, 2019
2 parents 1edb16e + 77b4fe1 commit 259398c
Show file tree
Hide file tree
Showing 10 changed files with 4,911 additions and 5,066 deletions.
141 changes: 77 additions & 64 deletions data_prep.ipynb

Large diffs are not rendered by default.

56 changes: 36 additions & 20 deletions data_prep.py
Expand Up @@ -30,6 +30,7 @@
import shutil
import sys
import tarfile
import tempfile
import urllib
import yaml
import zipfile
Expand Down Expand Up @@ -306,6 +307,7 @@ def ascii_to_xyz(pipeline_file: str) -> pd.DataFrame:
)
for f in files
)
df.dropna(axis="index", inplace=True) # drop rows with NaN values
df.reset_index(drop=True, inplace=True) # reset index after concatenation

## Advanced table read with conversions
Expand Down Expand Up @@ -354,19 +356,32 @@ def ascii_to_xyz(pipeline_file: str) -> pd.DataFrame:
# ![Clean XYZ Table to Raster Grid via interpolation function](https://yuml.me/diagram/scruffy;dir:LR/class/[Clean-XYZ-Table|*.xyz]->[Interpolation-Function],[Interpolation-Function]->[Raster-Grid|*.tif/*.nc])

# %%
def get_region(xyz_data: pd.DataFrame, round_increment: int = 250) -> str:
    """
    Get an extended bounding box region for points in an xyz pandas.DataFrame
    with columns x, y, and z.

    The minimum/maximum coordinates are rounded outward to multiples of
    round_increment. Implementation uses gmt.info with the -I (increment)
    setting, see https://gmt.soest.hawaii.edu/doc/latest/gmtinfo.html#i

    The region is returned as a string 'xmin/xmax/ymin/ymax', directly usable
    as the -R 'region of interest' parameter in GMT. The rounding is chosen to
    give grid dimensions that run fastest in programs like GMT surface.

    Parameters
    ----------
    xyz_data : pd.DataFrame
        Table of points; must have exactly the columns ["x", "y", "z"].
    round_increment : int
        Increment to round the bounding box coordinates to (default 250).

    Returns
    -------
    str
        Region string 'xmin/xmax/ymin/ymax'.

    >>> xyz_data = pd.DataFrame(
    ...     10000 * np.random.RandomState(seed=42).rand(30).reshape(10, 3),
    ...     columns=["x", "y", "z"],
    ... )
    >>> get_region(xyz_data=xyz_data)
    '-250/9500/0/9750'
    """
    # Enforce the expected column schema so gmt.info reads x,y,z in order
    assert (xyz_data.columns == pd.Index(data=["x", "y", "z"], dtype="object")).all()

    # gmt.info reads from a file, so round-trip the table through a temporary CSV
    with tempfile.NamedTemporaryFile(suffix=".csv") as tmpfile:
        xyz_data.to_csv(tmpfile.name, header=False, index=False)
        # -Is<inc> extends the region outward to multiples of round_increment;
        # strip() removes trailing newline, [2:] drops the leading '-R' prefix
        region = gmt.info(fname=tmpfile.name, I=f"s{round_increment}").strip()[2:]

    return region


# %%
Expand All @@ -381,18 +396,19 @@ def xyz_to_grid(
"""
Performs interpolation of x, y, z point data to a raster grid.
>>> xyz_data = 1000*pd.DataFrame(np.random.RandomState(seed=42).rand(60).reshape(20, 3))
>>> xyz_data = pd.DataFrame(
... 600 * np.random.RandomState(seed=42).rand(60).reshape(20, 3),
... columns=["x", "y", "z"],
... )
>>> region = get_region(xyz_data=xyz_data)
>>> grid = xyz_to_grid(xyz_data=xyz_data, region=region, spacing=250)
>>> grid.to_array().shape
(1, 5, 5)
(1, 4, 4)
>>> grid.to_array().values
array([[[403.17618 , 544.92535 , 670.7824 , 980.75055 , 961.47723 ],
[379.0757 , 459.26407 , 314.38297 , 377.78555 , 546.0469 ],
[450.67664 , 343.26 , 88.391594, 260.10492 , 452.3337 ],
[586.09906 , 469.74008 , 216.8168 , 486.9802 , 642.2116 ],
[451.4794 , 652.7244 , 325.77896 , 879.8973 , 916.7921 ]]],
dtype=float32)
array([[[135.95927, 294.8152 , 615.32513, 706.4135 ],
[224.46773, 191.75693, 298.87057, 521.85254],
[242.41916, 149.34332, 430.84137, 603.4236 ],
[154.60516, 194.24237, 447.6185 , 645.13574]]], dtype=float32)
"""
## Preprocessing with blockmedian
with gmt.helpers.GMTTempFile(suffix=".txt") as tmpfile:
Expand All @@ -414,7 +430,7 @@ def xyz_to_grid(
region=region,
spacing=f"{spacing}+e",
T=tension,
V="",
V="n", # normal verbosity: produce only fatal error messages
M=f"{mask_cell_radius}c",
)

Expand All @@ -435,7 +451,7 @@ def xyz_to_grid(
grid_dict[name] = xyz_to_grid(
xyz_data=xyz_data, region=region, outfile=f"highres/{name}.nc"
)
print(f"done! {grid_dict[name].to_array().shape}")
print(f"done! {grid_dict[name].z.shape}")

# %% [markdown]
# ### 2.3 Plot raster grids
Expand Down
477 changes: 214 additions & 263 deletions deepbedmap.ipynb

Large diffs are not rendered by default.

5 changes: 2 additions & 3 deletions deepbedmap.py
Expand Up @@ -108,11 +108,10 @@ def get_image_with_bounds(filepaths: list, indexers: dict = None) -> xr.DataArra


# %%
test_filepaths = ["highres/2007tx", "highres/2010tr", "highres/istarxx"]
test_filepaths = ["highres/2007tx"] # , "highres/2010tr", "highres/istarxx"]
groundtruth = get_image_with_bounds(
filepaths=[f"{t}.nc" for t in test_filepaths],
indexers={"y": slice(0, -1), "x": slice(0, -1)}, # for 2007tx, 2010tr and istarxx
# indexers={"y": slice(1, -1), "x": slice(1, -1)}, # for 2007tx only
# indexers={"y": slice(0, -1), "x": slice(0, -1)}, # for 2007tx, 2010tr and istarxx
)

# %% [markdown]
Expand Down
12 changes: 8 additions & 4 deletions features/environment.py
Expand Up @@ -84,10 +84,13 @@ def _quick_download_lowres_misc_datasets():
print("done!")


def _download_deepbedmap_model_weights_from_comet():
def _download_deepbedmap_model_weights_from_comet(experiment_key: str = "latest"):
"""
Download latest neural network model weights from Comet.ML
Uses their Python REST API class at https://www.comet.ml/docs/python-sdk/API/
Download DeepBedMap's Generator neural network model weights from Comet.ML
By default, the model weights from the latest experimental run are downloaded
Passing in an alternative experiment_key hash will download that one instead
Uses Comet.ML's Python REST API class at https://www.comet.ml/docs/python-sdk/API/
Requires the COMET_REST_API_KEY environment variable to be set in the .env file
"""
comet_api = comet_ml.API(
Expand All @@ -99,7 +102,8 @@ def _download_deepbedmap_model_weights_from_comet():
df = pd.io.json.json_normalize(data=project.data["experiments"].values())

# Get the key to the latest DeepBedMap experiment on Comet ML
experiment_key = df.loc[df["start_server_timestamp"].idxmax()].experiment_key
if experiment_key == "latest":
experiment_key = df.loc[df["start_server_timestamp"].idxmax()].experiment_key
experiment = comet_api.get(
workspace="weiji14", project="deepbedmap", experiment=experiment_key
)
Expand Down
6 changes: 4 additions & 2 deletions features/steps/test_data_prep.py
Expand Up @@ -45,7 +45,8 @@ def process_data_through_pipeline_and_get_output(context, pipeline_file):
@when("interpolate the xyz data table to {output_file}")
def interpolate_xyz_data_to_grid(context, output_file):
    """
    Behave step: interpolate the xyz point data table onto a raster grid.

    Derives the region from the xyz data via data_prep.get_region, then runs
    data_prep.xyz_to_grid to write the interpolated grid to
    /tmp/highres/{output_file}, recorded on context.outfile for later steps.
    """
    region = context.data_prep.get_region(context.xyz_data)
    # Write under /tmp so test artifacts do not pollute the repository tree
    context.outfile = os.path.join("/tmp/highres", output_file)
    os.makedirs(os.path.dirname(context.outfile), exist_ok=True)
    context.data_prep.xyz_to_grid(
        xyz_data=context.xyz_data, region=region, outfile=context.outfile
    )
Expand All @@ -60,7 +61,8 @@ def open_raster_grid_to_check(context):
@given("a big {dataset_type} raster grid {raster_grid}")
def get_a_raster_grid(context, dataset_type, raster_grid):
context.raster_grid = raster_grid
context.filepath = os.path.join(dataset_type, raster_grid)
context.filepath = os.path.join("/tmp", dataset_type, raster_grid)
os.makedirs(os.path.dirname(context.filepath), exist_ok=True)
url = (
f"https://github.com/weiji14/deepbedmap/releases/download/v0.7.0/{raster_grid}"
)
Expand Down
2 changes: 1 addition & 1 deletion misc/README.md
Expand Up @@ -6,6 +6,6 @@ Filename|Location|Resolution|Literature Citation|Data Citation
---|---|---|---|---
Arthern_accumulation_bedmap2_grid1.tif|Antarctica|1000m|[Arthern2006Accumulation](https://doi.org/10.1029/2004JD005667)|
lisa750_2013182_2017120_0000_0400_vv_v1.tif|Antarctica|750m|[Fahnestock2019LISA](https://doi.org/10.1016/j.rse.2015.11.023)|[DOI](https://doi.org/10.7265/nxpc-e997)
3 *.dbf files|Antarctica|nan|[Mouginot2017MEASURES](https://doi.org/10.1126/science.1235798)|[DOI](https://doi.org/10.5067/AXE4121732AD)
3 *.shx files|Antarctica|nan|[Mouginot2017MEASURES](https://doi.org/10.1126/science.1235798)|[DOI](https://doi.org/10.5067/AXE4121732AD)
2 *.tif files|Antarctica|100m|[Noh2018REMA](https://doi.org/10.1016/j.isprsjprs.2017.12.008)|[DOI](https://doi.org/10.7910/DVN/SAIK8B)
MEaSUREs_IceFlowSpeed_450m.tif|Antarctica|450m|[Rignot2011MEASURES](https://doi.org/10.1126/science.1208336)|[DOI](https://doi.org/10.5067/D7GK8F5J8M8R)

0 comments on commit 259398c

Please sign in to comment.