Source code for clouddrift.adapters.laser

"""Adapt the LASER 15-minute interpolated drifter trajectories to RaggedArray.

The upstream dataset is hosted by GRIIDC at https://doi.org/10.7266/N7W0940J
and distributed as a zip archive containing the ASCII drifter trajectories file
and a README.

Example
-------
>>> from clouddrift.adapters import laser
>>> ra = laser.to_raggedarray()
>>> ds = ra.to_xarray()

References
----------
Eric D'Asaro, Cedric Guigand, Angelique Haza, Helga Huntley, Guillaume Novelli,
Tamay Ozgokmen, Ed Ryan. 2017. Lagrangian Submesoscale Experiment (LASER)
surface drifters, interpolated to 15-minute intervals. Distributed by: GRIIDC,
Harte Research Institute, Texas A&M University-Corpus Christi.
https://doi.org/10.7266/N7W0940J
"""

import os
import tempfile
from zipfile import ZipFile

import numpy as np
import pandas as pd

from clouddrift.adapters.utils import download_with_progress
from clouddrift.raggedarray import RaggedArray

# --- Dataset identity (written into the global attributes of the output) ---
_DATASET_TITLE = (
    "Lagrangian Submesoscale Experiment (LASER) "
    "surface drifters, interpolated to 15-minute intervals"
)
# Landing page quoted in the "history" global attribute.
_DATASET_PAGE = "https://data.griidc.org/data/R4.x265.237:0001"

# --- Remote archive and the member of interest inside it ---
# URL handed to the downloader; it yields a zip archive.
_DOWNLOAD_URL = "https://data.griidc.org/api/datasets/zip/2101"
# Name of the ASCII trajectories file looked up inside the archive.
_DATA_FILENAME = "laser_spot_drifters_clean_v15.dat"

# --- Local storage ---
# File name under which the downloaded archive is saved in the temp directory.
_LOCAL_ARCHIVE_NAME = "laser_surface_drifters.zip"
# Default directory for intermediary files: <system tmp>/clouddrift/laser.
LASER_TMP_PATH = os.path.join(tempfile.gettempdir(), "clouddrift", "laser")


def _open_datafile(archive: ZipFile):
    """Open the LASER trajectories file contained in ``archive``.

    A member matches when its name is exactly ``_DATA_FILENAME`` or when it
    ends with ``/`` followed by ``_DATA_FILENAME``, so the file is found
    whether it sits at the archive root or inside a sub-directory.

    Parameters
    ----------
    archive : ZipFile
        Zip archive of the LASER dataset.

    Returns
    -------
    file-like object
        The handle returned by ``archive.open`` for the first matching member,
        in the order reported by ``archive.namelist()``.

    Raises
    ------
    FileNotFoundError
        If no member of the archive matches ``_DATA_FILENAME``.
    """
    nested_suffix = f"/{_DATA_FILENAME}"
    candidates = (
        name
        for name in archive.namelist()
        if name == _DATA_FILENAME or name.endswith(nested_suffix)
    )

    # Only the first match is used; later duplicates are ignored.
    found = next(candidates, None)
    if found is None:
        raise FileNotFoundError(f"Could not find '{_DATA_FILENAME}' in LASER dataset archive.")
    return archive.open(found)



[docs]
def get_dataframe(
    tmp_path: str | None = None,
    skip_download: bool = False,
) -> pd.DataFrame:
    """Download the LASER archive and load its trajectories as a DataFrame.

    Parameters
    ----------
    tmp_path : str, optional
        Directory where the downloaded archive is stored. It is created if it
        does not exist. If None, ``LASER_TMP_PATH`` is used.
    skip_download : bool, optional
        Forwarded to ``download_with_progress``. If True, skip re-downloading
        the archive if it already exists in ``tmp_path``. Default is False.

    Returns
    -------
    pd.DataFrame
        One row per observation with the columns ``id``, ``latitude``,
        ``longitude``, ``position_error``, ``u``, ``v``, ``velocity_error``
        and ``obs`` (timestamp built from the file's date and time columns).
        Rows are sorted by ``id`` and then ``obs``, with a fresh 0-based index.
    """
    workdir = LASER_TMP_PATH if tmp_path is None else tmp_path
    os.makedirs(workdir, exist_ok=True)

    archive_path = os.path.join(workdir, _LOCAL_ARCHIVE_NAME)
    download_with_progress([(_DOWNLOAD_URL, archive_path)], skip_download=skip_download)

    # The data file carries no header row usable by pandas, so the column
    # names are supplied here; lines starting with "%" are treated as comments.
    raw_columns = [
        "id",
        "date",
        "time",
        "latitude",
        "longitude",
        "position_error",
        "u",
        "v",
        "velocity_error",
    ]

    with ZipFile(archive_path) as archive, _open_datafile(archive) as stream:
        table = pd.read_csv(
            stream,
            sep=r"\s+",
            comment="%",
            names=raw_columns,
        )

    # Merge the separate date and time text columns into a single timestamp
    # column named "obs", then discard the two source columns.
    timestamps = pd.to_datetime(table["date"] + " " + table["time"])
    table = table.assign(obs=timestamps).drop(columns=["date", "time"])

    # Stable sort keeps the file order among rows with equal (id, obs).
    ordered = table.sort_values(["id", "obs"], kind="stable")
    return ordered.reset_index(drop=True)



def _dataframe_to_raggedarray(df: pd.DataFrame) -> RaggedArray:
    """Pack a LASER observations table into a RaggedArray.

    Parameters
    ----------
    df : pd.DataFrame
        Table with the columns ``id``, ``obs``, ``latitude``, ``longitude``,
        ``position_error``, ``u``, ``v`` and ``velocity_error``.
        NOTE(review): trajectory IDs and row counts are taken from
        ``np.unique``, so rows are presumably expected to be grouped by ``id``
        in sorted order, as ``get_dataframe`` returns them -- confirm before
        calling with any other table.

    Returns
    -------
    RaggedArray
        ``id`` and ``time`` as coordinates, ``rowsize`` as metadata, and the
        remaining columns as float32 data variables along ``obs``.
    """
    # Columns stored as per-observation float32 data variables, in output order.
    obs_variables = (
        "latitude",
        "longitude",
        "position_error",
        "u",
        "v",
        "velocity_error",
    )

    # Distinct trajectory IDs together with how many rows carry each one.
    unique_ids, counts = np.unique(df["id"].to_numpy(), return_counts=True)

    global_attributes = {
        "title": _DATASET_TITLE,
        "institution": (
            "Consortium for Advanced Research on Transport of Hydrocarbon "
            "in the Environment (CARTHE)"
        ),
        "source": "SPOT GPS drifters",
        "history": (
            f"Downloaded from {_DATASET_PAGE} and post-processed into a "
            "ragged-array Xarray Dataset by CloudDrift"
        ),
        "references": (
            "Eric D'Asaro, Cedric Guigand, Angelique Haza, Helga Huntley, "
            "Guillaume Novelli, Tamay Ozgokmen, Ed Ryan. 2017. "
            "Lagrangian Submesoscale Experiment (LASER) surface drifters, "
            "interpolated to 15-minute intervals. Distributed by: GRIIDC, "
            "Harte Research Institute, Texas A&M University-Corpus Christi. "
            "https://doi.org/10.7266/N7W0940J"
        ),
    }

    variable_attributes = {
        "id": {
            "long_name": "trajectory identifier",
            "comment": (
                "String ID encoding drogue status and continuity. "
                "Prefix 'L'/'M': drogued (original/cut continuation); "
                "prefix 'U'/'V': undrogued (original/cut continuation). "
                "Trajectories sharing the same integer number are from the same drifter."
            ),
        },
        "time": {"long_name": "time"},
        "rowsize": {"long_name": "number of observations for each trajectory"},
        "longitude": {
            "long_name": "longitude",
            "standard_name": "longitude",
            "units": "degrees_east",
        },
        "latitude": {
            "long_name": "latitude",
            "standard_name": "latitude",
            "units": "degrees_north",
        },
        "position_error": {"long_name": "position_error", "units": "m"},
        "u": {
            "long_name": "eastward_sea_water_velocity",
            "standard_name": "eastward_sea_water_velocity",
            "units": "m s-1",
        },
        "v": {
            "long_name": "northward_sea_water_velocity",
            "standard_name": "northward_sea_water_velocity",
            "units": "m s-1",
        },
        "velocity_error": {"long_name": "velocity_error", "units": "m s-1"},
    }

    coordinates = {
        "id": unique_ids,
        "time": df["obs"].to_numpy(dtype="datetime64[ns]"),
    }
    metadata = {"rowsize": counts.astype("int64")}
    observations = {
        name: df[name].to_numpy(dtype="float32") for name in obs_variables
    }

    # rowsize lives on the trajectory dimension; everything else is per-observation.
    variable_dims = {
        "rowsize": ["traj"],
        **{name: ["obs"] for name in obs_variables},
    }

    return RaggedArray(
        coords=coordinates,
        metadata=metadata,
        data=observations,
        attrs_global=global_attributes,
        attrs_variables=variable_attributes,
        name_dims={"traj": "rows", "obs": "obs"},
        coord_dims={"id": "traj", "time": "obs"},
        var_dims=variable_dims,
    )



[docs]
def to_raggedarray(
    tmp_path: str | None = None,
    skip_download: bool = False,
) -> RaggedArray:
    """Load the LASER dataset and return it as a RaggedArray.

    Both arguments are passed unchanged to ``get_dataframe``; the resulting
    table is then converted with ``_dataframe_to_raggedarray``.

    Parameters
    ----------
    tmp_path : str, optional
        Temporary path where intermediary files are stored. If None, uses the
        default LASER adapter temp path.
    skip_download : bool, optional
        If True, skip re-downloading the archive if it already exists in
        ``tmp_path``. Default is False.

    Returns
    -------
    RaggedArray
        The LASER drifter trajectories in ragged-array form.
    """
    return _dataframe_to_raggedarray(
        get_dataframe(tmp_path=tmp_path, skip_download=skip_download)
    )