Source code for dandi_compute_code.dandiset._load_assets_jsonld_metadata
import dataclasses
import functools
import json
import logging
import urllib.request
from ._globals import _ASSETS_JSONLD_URL
_log = logging.getLogger(__name__)
[docs]
@dataclasses.dataclass(frozen=True)
class AssetMetadata:
    """
    Immutable summary of the indexed fields of a single asset.

    Instances are produced by ``_build_asset_metadata`` from one asset record.

    :ivar path: Value of the record's ``path`` field.
    :ivar date_modified: Value of the record's ``dateModified`` field, kept as the raw string.
    :ivar content_size: Value of the record's ``contentSize`` field as an integer.
    :ivar content_id: Identifier taken from the last path segment of the record's blob or
        zarr ``contentUrl``.
    """

    path: str
    date_modified: str
    content_size: int
    content_id: str
[docs]
@dataclasses.dataclass(frozen=True)
class AssetsJsonldMetadata:
    """
    Pair of lookup tables built from the DANDI ``assets.jsonld`` listing.

    The instance is frozen, so its attributes cannot be rebound; the dictionaries they
    refer to are ordinary ``dict`` objects.

    :ivar content_id_to_asset: Maps a content id to the asset record (a ``dict``) it was
        extracted from.
    :ivar path_to_asset_metadata: Maps an asset path to the :class:`AssetMetadata` built
        for that asset.
    """

    content_id_to_asset: dict[str, dict[str, object]]
    path_to_asset_metadata: dict[str, AssetMetadata]
def _build_asset_metadata(asset: dict[str, object]) -> tuple[str, AssetMetadata]:
    """
    Validate and extract metadata for a single asset; raises ValueError on missing fields.

    A ``contentSize`` given as a string of decimal digits is converted to ``int`` and the
    converted value is written back into ``asset`` (the input dict is mutated).

    :param asset:
        One asset record; the keys ``contentSize``, ``path``, ``dateModified`` and
        ``contentUrl`` are read.
    :returns:
        ``(content_id, metadata)`` where ``content_id`` is the last path segment of the
        first ``contentUrl`` entry containing ``/blobs/`` or ``/zarr/``.
    :rtype: tuple[str, AssetMetadata]
    :raises ValueError:
        If ``contentSize`` is not an integer (or decimal string), ``path`` or
        ``dateModified`` is not a string, or no usable blob/zarr ``contentUrl`` exists.
    """
    content_size = asset.get("contentSize")
    # isdecimal() rather than isdigit(): isdigit() also accepts characters such as
    # superscript digits that int() cannot parse, which would surface as an unrelated
    # "invalid literal" ValueError instead of the message below.
    if isinstance(content_size, str) and content_size.isdecimal():
        content_size = int(content_size)
        asset["contentSize"] = content_size
    # bool is a subclass of int, so JSON true/false must be rejected explicitly.
    if isinstance(content_size, bool) or not isinstance(content_size, int):
        raise ValueError(f"Asset missing valid 'contentSize': {asset!r}")

    path = asset.get("path")
    if not isinstance(path, str):
        raise ValueError(f"Asset missing valid 'path': {asset!r}")

    date_modified = asset.get("dateModified")
    if not isinstance(date_modified, str):
        raise ValueError(f"Asset {path!r} missing valid 'dateModified'")

    content_urls = asset.get("contentUrl")
    content_id = next(
        (
            # Drop the query string BEFORE isolating the last path segment, so that a
            # trailing slash ahead of the query or a "/" inside the query cannot change
            # which segment is picked.
            url.split("?", 1)[0].rstrip("/").rsplit("/", 1)[-1]
            for url in (content_urls if isinstance(content_urls, list) else [])
            if isinstance(url, str) and ("/blobs/" in url or "/zarr/" in url)
        ),
        None,
    )
    if not (isinstance(content_id, str) and content_id):
        raise ValueError(f"Asset at path {path!r} has no blob or zarr contentUrl: {content_urls!r}")

    return content_id, AssetMetadata(
        path=path,
        date_modified=date_modified,
        content_size=content_size,
        content_id=content_id,
    )
[docs]
@functools.lru_cache(maxsize=1)
def load_assets_jsonld_metadata() -> AssetsJsonldMetadata:
    """
    Load content-id and path metadata from the DANDI 001697 draft ``assets.jsonld`` stream.

    The JSON document at ``_ASSETS_JSONLD_URL`` is fetched with a 30 second timeout and
    every asset in it is indexed by content id and by path. If the document cannot be
    fetched or decoded, a warning is logged and empty indexes are returned instead of
    raising.

    The return value is memoised by ``functools.lru_cache``. This includes the empty
    fallback, so ``load_assets_jsonld_metadata.cache_clear()`` must be called to retry
    after a failed load.

    :returns:
        Indexed assets metadata (both indexes empty when the document could not be loaded).
    :rtype: AssetsJsonldMetadata
    :raises ValueError:
        If the decoded document is not a list, an entry is not a dict, or an entry is
        rejected by ``_build_asset_metadata``.
    """
    try:
        with urllib.request.urlopen(_ASSETS_JSONLD_URL, timeout=30) as response:
            assets = json.load(response)
    # OSError is the base class of urllib.error.URLError and TimeoutError, and also covers
    # connection errors raised while the body is being read as well as socket.timeout on
    # Python < 3.10. UnicodeDecodeError covers a body that is not valid UTF-8/16/32.
    except (OSError, json.JSONDecodeError, UnicodeDecodeError) as exception:
        _log.warning("Unable to load metadata from %s: %s", _ASSETS_JSONLD_URL, exception)
        return AssetsJsonldMetadata(
            content_id_to_asset={},
            path_to_asset_metadata={},
        )

    if not isinstance(assets, list):
        raise ValueError(f"Expected a JSON array from {_ASSETS_JSONLD_URL}, got {type(assets).__name__}")

    content_id_to_asset: dict[str, dict[str, object]] = {}
    path_to_asset_metadata: dict[str, AssetMetadata] = {}
    for asset in assets:
        if not isinstance(asset, dict):
            raise ValueError(f"Expected each asset to be a dict, got {type(asset).__name__}: {asset!r}")
        content_id, metadata = _build_asset_metadata(asset)
        # Plain dict assignment: a later asset with the same content id or path replaces
        # the earlier entry.
        content_id_to_asset[content_id] = asset
        path_to_asset_metadata[metadata.path] = metadata

    return AssetsJsonldMetadata(
        content_id_to_asset=content_id_to_asset,
        path_to_asset_metadata=path_to_asset_metadata,
    )