Source code for clouddrift.adapters.andro

"""
This module defines functions used to adapt ANDRO, an Argo-based deep
displacement dataset, as a ragged-array dataset.

The dataset is hosted at https://www.seanoe.org/data/00360/47077/ and the user manual
is available at https://archimer.ifremer.fr/doc/00360/47126/.

Example
-------
>>> from clouddrift.adapters import andro
>>> ra = andro.to_raggedarray()
>>> ds = ra.to_xarray()

Reference
---------
Ollitrault Michel, Rannou Philippe, Brion Emilie, Cabanes Cecile, Piron Anne, Reverdin Gilles,
Kolodziejczyk Nicolas (2022). ANDRO: An Argo-based deep displacement dataset.
SEANOE. https://doi.org/10.17882/47077
"""

import os
import tempfile
from datetime import datetime

import numpy as np
import pandas as pd

from clouddrift.adapters.utils import download_with_progress
from clouddrift.raggedarray import RaggedArray

# URL of the zip archive that to_raggedarray() downloads and parses.
ANDRO_URL = "https://www.seanoe.org/data/00360/47077/data/127690.zip"
# Default cache directory, used when to_raggedarray() is called without tmp_path.
ANDRO_TMP_PATH = os.path.join(tempfile.gettempdir(), "clouddrift", "andro")
# Release tag written into the "history" global attribute of the output.
ANDRO_VERSION = "2026-04"


def to_raggedarray(tmp_path: str | None = None, skip_download: bool = False) -> RaggedArray:
    """Return the ANDRO dataset as a RaggedArray instance.

    Parameters
    ----------
    tmp_path : str, optional
        Path where the dataset file is cached. Defaults to a platform-specific
        temporary directory. The directory is created if it does not exist.
    skip_download : bool, optional
        If True, skip re-downloading the dataset file if it already exists in
        ``tmp_path``. Default is False.

    Returns
    -------
    RaggedArray
        Ragged array with one trajectory per unique float ``id``. ``rowsize``
        holds the number of observations of each trajectory; the columns
        ``time_d``, ``time_s``, ``time_lp``, ``time_lc`` and ``time_fc`` are
        exposed as coordinates along ``obs`` and every other column is a data
        variable along ``obs``.
    """
    if tmp_path is None:
        tmp_path = ANDRO_TMP_PATH
    os.makedirs(tmp_path, exist_ok=True)

    # get or update dataset
    local_file = os.path.join(tmp_path, ANDRO_URL.split("/")[-1])
    download_with_progress([(ANDRO_URL, local_file)], skip_download=skip_download)

    # parse with pandas
    col_names = [
        # depth
        "lon_d",
        "lat_d",
        "pres_d",
        "temp_d",
        "sal_d",
        "time_d",
        "ve_d",
        "vn_d",
        "err_ve_d",
        "err_vn_d",
        # first surface velocity
        "lon_s",
        "lat_s",
        "time_s",
        "ve_s",
        "vn_s",
        "err_ve_s",
        "err_vn_s",
        # last surface velocity
        "lon_ls",
        "lat_ls",
        "time_ls",
        "ve_ls",
        "vn_ls",
        "err_ve_ls",
        "err_vn_ls",
        # last fix previous cycle
        "lon_lp",
        "lat_lp",
        "time_lp",
        # first fix current cycle
        "lon_fc",
        "lat_fc",
        "time_fc",
        # last fix current cycle
        "lon_lc",
        "lat_lc",
        "time_lc",
        "surf_fix",
        "id",
        "cycle",
        "profile_id",
    ]

    # Missing-value sentinels, laid out in the same order as ``col_names``.
    # Because this is passed to pandas as a list (not a dict), every sentinel is
    # applied to every column; the positional layout is for readability only.
    # NOTE(review): lat_s uses -99.999 while the other latitude columns use
    # -99.9999 -- confirm against the ANDRO user manual. Both values are in the
    # list, so parsing is unaffected either way.
    na_col = [
        # depth
        -999.9999,
        -99.9999,
        -999.9,
        -99.999,
        -99.999,
        -9999.999,
        -999.9,
        -999.9,
        -999.9,
        -999.9,
        # first surface velocity
        -999.9999,
        -99.999,
        -9999.999,
        -999.99,
        -999.99,
        -999.99,
        -999.99,
        # last surface velocity
        -999.9999,
        -99.9999,
        -9999.999,
        -999.99,
        -999.99,
        -999.99,
        -999.99,
        # last fix previous cycle
        -999.9999,
        -99.9999,
        -9999.999,
        # first fix current cycle
        -999.9999,
        -99.9999,
        -9999.999,
        # last fix current cycle
        -999.9999,
        -99.9999,
        -9999.999,
        # surf_fix, id, cycle, profile_id
        np.nan,
        np.nan,
        np.nan,
        -99,
    ]

    # open with pandas
    df = pd.read_csv(
        local_file,
        names=col_names,
        sep=r"\s+",
        header=None,
        na_values=na_col,  # type: ignore
    )

    # np.unique below returns the ids in ascending order together with their
    # counts. Those counts only describe the ragged layout when the rows of each
    # float are contiguous and the floats appear in ascending id order, so
    # enforce that here. The sort is stable to preserve the within-float row
    # order, and it is skipped entirely when the file is already ordered.
    if not df["id"].is_monotonic_increasing:
        df = df.sort_values("id", kind="stable", ignore_index=True)

    ids = df["id"].to_numpy()
    traj, rowsize = np.unique(ids, return_counts=True)

    attrs_global = {
        "title": "ANDRO: An Argo-based deep displacement dataset (Quality controlled data)",
        "history": f"Dataset updated on {ANDRO_VERSION}",
        "date_created": datetime.now().isoformat(),
        "publisher_name": "SEANOE (SEA scieNtific Open data Edition)",
        "publisher_url": "https://www.seanoe.org/data/00360/47077/",
        "license": "Creative Commons Attribution 4.0 International License (http://creativecommons.org/licenses/by/4.0/)",
    }

    vars_attrs = {
        "rowsize": {
            "long_name": "Number of observations for each trajectory",
            "sample_dimension": "obs",
            "units": "-",
        },
        "lon_d": {
            "long_name": "Longitude of the location where the deep velocity is calculated",
            "units": "degrees_east",
        },
        "lat_d": {
            "long_name": "Latitude of the location where the deep velocity is calculated",
            "units": "degrees_north",
        },
        "pres_d": {
            "long_name": "Reference parking pressure for this cycle",
            "units": "dbar",
        },
        "temp_d": {
            "long_name": "Parking temperature (°C) for this cycle",
            "units": "degree_C",
        },
        "sal_d": {
            "long_name": "Parking salinity for this cycle",
            "units": "psu",
        },
        "time_d": {
            "long_name": "Julian time (days) when deep velocity is estimated",
            "units": "days since 2000-01-01 00:00",
        },
        "ve_d": {
            "long_name": "Eastward component of the deep velocity",
            "units": "cm s-1",
        },
        "vn_d": {
            "long_name": "Northward component of the deep velocity",
            "units": "cm s-1",
        },
        "err_ve_d": {
            "long_name": "Error on the eastward component of the deep velocity",
            "units": "cm s-1",
        },
        "err_vn_d": {
            "long_name": "Error on the northward component of the deep velocity",
            "units": "cm s-1",
        },
        "lon_s": {
            "long_name": "Longitude of the location where the first surface velocity is calculated (over the first 6 h at surface)",
            "units": "degrees_east",
        },
        "lat_s": {
            "long_name": "Latitude of the location where the first surface velocity is calculated",
            "units": "degrees_north",
        },
        "time_s": {
            "long_name": "Julian time (days) when the first surface velocity is calculated",
            "units": "days since 2000-01-01 00:00",
        },
        "ve_s": {
            "long_name": "Eastward component of first surface velocity",
            "units": "cm s-1",
        },
        "vn_s": {
            "long_name": "Northward component of first surface velocity",
            "units": "cm s-1",
        },
        "err_ve_s": {
            "long_name": "Error on the eastward component of the first surface velocity",
            "units": "cm s-1",
        },
        "err_vn_s": {
            "long_name": "Error on the northward component of the first surface velocity",
            "units": "cm s-1",
        },
        "lon_ls": {
            "long_name": "Longitude of the location where the last surface velocity is calculated (over the last 6 h at surface)",
            "units": "degrees_east",
        },
        "lat_ls": {
            "long_name": "Latitude of the location where the last surface velocity is calculated",
            "units": "degrees_north",
        },
        "time_ls": {
            "long_name": "Julian time (days) when the last surface velocity is calculated",
            "units": "days since 2000-01-01 00:00",
        },
        "ve_ls": {
            "long_name": "Eastward component of last surface velocity (cm s-1)",
            "units": "cm s-1",
        },
        "vn_ls": {
            "long_name": "Northward component of last surface velocity (cm s-1)",
            "units": "cm s-1",
        },
        "err_ve_ls": {
            "long_name": "Error on the eastward component of the last surface velocity",
            "units": "cm s-1",
        },
        "err_vn_ls": {
            "long_name": "Error on the northward component of the last surface velocity",
            "units": "cm s-1",
        },
        "lon_lp": {
            "long_name": "Longitude of the last fix at the sea surface during the previous cycle",
            "units": "degrees_east",
        },
        "lat_lp": {
            "long_name": "Latitude of the last fix at the sea surface during the previous cycle",
            "units": "degrees_north",
        },
        "time_lp": {
            "long_name": "Julian time of the last fix at the sea surface during the previous cycle",
            "units": "days since 2000-01-01 00:00",
        },
        "lon_fc": {
            "long_name": "Longitude of the first fix at the sea surface during the current cycle",
            "units": "degrees_east",
        },
        "lat_fc": {
            "long_name": "Latitude of the first fix at the sea surface during the current cycle",
            "units": "degrees_north",
        },
        "time_fc": {
            "long_name": "Julian time of the first fix at the sea surface during the current cycle",
            "units": "days since 2000-01-01 00:00",
        },
        "lon_lc": {
            "long_name": "Longitude of the last fix at the sea surface during the current cycle",
            "units": "degrees_east",
        },
        "lat_lc": {
            "long_name": "Latitude of the last fix at the sea surface during the current cycle",
            "units": "degrees_north",
        },
        "time_lc": {
            "long_name": "Julian time of the last fix at the sea surface during the current cycle",
            "units": "days since 2000-01-01 00:00",
        },
        "surf_fix": {
            "long_name": "Number of surface fixes during the current cycle",
            "units": "-",
        },
        "id": {
            "long_name": "Float WMO number",
            "units": "-",
        },
        "cycle": {
            "long_name": "Cycle number",
            "units": "-",
        },
        "profile_id": {
            "long_name": "Profile number as given in the NetCDF prof file",
            "units": "-",
        },
    }

    # Keep attributes only for variables that end up in the output; "rowsize"
    # is not a DataFrame column, so it is admitted explicitly.
    attrs_variables = {
        k: v
        for k, v in vars_attrs.items()
        if k in df.columns or k in ["id", "rowsize"]
    }

    # Preserve historical dtype behavior: keep lon/lat in float64 and downcast
    # other float64 variables to float32 to limit memory usage.
    double_vars = {
        "lat_d",
        "lon_d",
        "lat_s",
        "lon_s",
        "lat_ls",
        "lon_ls",
        "lat_lp",
        "lon_lp",
        "lat_fc",
        "lon_fc",
        "lat_lc",
        "lon_lc",
    }

    # Every column except "id" becomes a per-observation variable; "id" is
    # represented by the unique values in ``traj`` instead.
    data_vars = {}
    for name in df.columns:
        if name == "id":
            continue
        values = df[name].to_numpy()
        if values.dtype == np.float64 and name not in double_vars:
            values = values.astype(np.float32)
        data_vars[name] = values

    # Extract time coordinates from data_vars
    time_coords = {}
    time_coord_names = ["time_d", "time_s", "time_lp", "time_lc", "time_fc"]
    for name in time_coord_names:
        if name in data_vars:
            time_coords[name] = data_vars.pop(name)

    return RaggedArray(
        coords={
            "id": traj,
            **time_coords,
        },
        metadata={
            "rowsize": rowsize.astype("int64"),
        },
        data=data_vars,
        attrs_global=attrs_global,
        attrs_variables=attrs_variables,
        name_dims={"traj": "rows", "obs": "obs"},
        coord_dims={
            "id": "traj",
            **{name: "obs" for name in time_coords},
        },
        var_dims={
            "rowsize": ["traj"],
            **{k: ["obs"] for k in data_vars},
        },
    )