# Source code for zmap.reference.load_h5ad

import os
import pathlib
import tempfile
import urllib.parse
import urllib.request

import anndata as ad
import scanpy as sc
from tqdm import tqdm

# --------------------------------------------------------------------
# Registry of known H5ADs (CDN URLs & on-disk filenames)
# --------------------------------------------------------------------

# Maps each preset ``kind`` key to a dict with two entries:
#   "url"      - where the H5AD file is downloaded from
#   "filename" - the name used for the copy stored on disk
H5AD_SOURCES = {
    # Raw counts
    "raw": {
        "url": "https://pub-dbadc2c623224cb58d93cfa3b950fef5.r2.dev/h5ad/ZMAP_250402_raw.h5ad",
        "filename": "ZMAP_250402_raw.h5ad",
    },
    # Fully processed (full dataset + intermediate files)
    "processed": {
        "url": "https://pub-dbadc2c623224cb58d93cfa3b950fef5.r2.dev/h5ad/ZMAP_251209_processed.h5ad",
        "filename": "ZMAP_251209_processed.h5ad",
    },
    # Fully processed but raw counts only
    "processed_slim": {
        "url": "https://pub-dbadc2c623224cb58d93cfa3b950fef5.r2.dev/h5ad/ZMAP_251209_processed_slim.h5ad",
        "filename": "ZMAP_251209_processed_slim.h5ad",
    },
    # Fully processed but TPM counts only (best for plotting)
    "processed_slim_tpm": {
        "url": "https://pub-dbadc2c623224cb58d93cfa3b950fef5.r2.dev/h5ad/ZMAP_251209_processed_slim_tpm.h5ad",
        "filename": "ZMAP_251209_processed_slim_tpm.h5ad",
    },
    # Processed slim / symphony reference
    "symphony": {
        "url": "https://pub-dbadc2c623224cb58d93cfa3b950fef5.r2.dev/h5ad/ZMAP_260103_symphony.h5ad",
        "filename": "ZMAP_260103_symphony.h5ad",
    },
}

# In-memory cache of loaded datasets, keyed by (kind, url, backed).
# load_zmap_h5ad builds the key as (kind or "custom", url, bool(backed)) and
# only stores objects that were loaded with backed=False.
_H5AD_CACHE: dict[tuple[str, str | None, bool], ad.AnnData] = {}


# --------------------------------------------------------------------
# Helpers
# --------------------------------------------------------------------

def _default_h5ad_dir() -> pathlib.Path:
    """
    Default directory to store / cache H5ADs.

    Uses Google Drive when available (/content/drive/MyDrive/zmap/h5ad),
    so files persist across Colab sessions. Falls back to <cwd>/zmap/h5ad
    if Drive is not mounted.
    """
    drive_path = pathlib.Path("/content/drive/MyDrive/zmap/h5ad")
    if drive_path.parent.exists():
        drive_path.mkdir(parents=True, exist_ok=True)
        return drive_path

    print(
        "[ZMAP] Google Drive not detected at /content/drive/MyDrive — "
        "using local cache at <cwd>/zmap/h5ad. "
        "Mount Drive and re-run to enable persistent caching."
    )
    fallback = pathlib.Path.cwd() / "zmap" / "h5ad"
    fallback.mkdir(parents=True, exist_ok=True)
    return fallback


def _open_url(url: str):
    """
    Open ``url`` and return the response object from ``urlopen``.

    The request carries a browser-like ``User-Agent`` header, which is
    intended to keep Cloudflare from rejecting the request.
    """
    browser_like_headers = {
        "User-Agent": "Mozilla/5.0 (compatible; ZMAP/0.1; +https://example.org)"
    }
    request = urllib.request.Request(url, headers=browser_like_headers)
    return urllib.request.urlopen(request)


def _stream_download(
    url: str,
    dest_path: pathlib.Path,
    *,
    chunk_size: int = 1 << 20,  # 1 MB
    show_progress: bool = True,
) -> None:
    """
    Stream a file from ``url`` to ``dest_path`` with an optional progress bar.

    The data is first written to ``<dest_path>.part`` and only moved onto
    ``dest_path`` once the transfer has finished, so an interrupted download
    never leaves a truncated file where a later call would mistake it for a
    valid cached copy.

    Parameters
    ----------
    url : str
        Address to download from.
    dest_path : pathlib.Path
        Final location of the downloaded file. Replaced if it already exists.
    chunk_size : int, default ``1 << 20``
        Number of bytes requested per read.
    show_progress : bool, default ``True``
        Display a ``tqdm`` progress bar while downloading.

    Raises
    ------
    OSError
        If the server announced a ``Content-Length`` and a different number
        of bytes was received, or if writing / renaming the file fails.
    """
    print(f"[ZMAP] Downloading {url} -> {dest_path}")

    # Same directory as the destination, so the final rename stays on one
    # filesystem.
    part_path = dest_path.with_name(dest_path.name + ".part")
    received = 0
    pbar = None
    try:
        with _open_url(url) as resp:
            try:
                total = int(resp.headers.get("Content-Length", "0")) or None
            except (TypeError, ValueError):
                # Missing or malformed header: size is simply unknown.
                total = None

            if show_progress:
                pbar = tqdm(
                    total=total,
                    unit="B",
                    unit_scale=True,
                    unit_divisor=1024,
                    desc="Downloading",
                )

            with part_path.open("wb") as out:
                while block := resp.read(chunk_size):
                    out.write(block)
                    received += len(block)
                    if pbar is not None:
                        pbar.update(len(block))

        # A dropped connection can look like a normal end-of-stream, so
        # compare against the announced size when there is one.
        if total is not None and received != total:
            raise OSError(
                f"Incomplete download from {url}: "
                f"received {received} of {total} bytes"
            )

        os.replace(part_path, dest_path)
    except BaseException:
        # Best-effort removal of the partial file; the original error is
        # what the caller needs to see.
        try:
            part_path.unlink()
        except OSError:
            pass
        raise
    finally:
        if pbar is not None:
            pbar.close()


# --------------------------------------------------------------------
# Public API
# --------------------------------------------------------------------

[docs] def download_zmap_h5ad( *, kind: str | None = "processed_slim_tpm", url: str | None = None, dest_dir: str | os.PathLike | None = None, filename: str | None = None, write_to_disk: bool = True, force_download: bool = False, chunk_size: int = 1 << 20, show_progress: bool = True, ) -> pathlib.Path: """ Download a ZMAP H5AD file from the CDN, with local caching. Downloads the file to a persistent cache directory (Google Drive when available, otherwise a local directory). Subsequent calls with the same ``kind`` skip the download if the file already exists on disk. Most users should prefer :func:`load_zmap_h5ad`, which calls this function internally and also handles loading and preprocessing. Parameters ---------- kind : str or None, default ``"processed_slim_tpm"`` Preset dataset key. One of the keys in :data:`H5AD_SOURCES` (``"raw"``, ``"processed"``, ``"processed_slim"``, ``"processed_slim_tpm"``, ``"symphony"``). Ignored when ``url`` is provided. url : str or None, default ``None`` Explicit download URL. Overrides the registry URL looked up via ``kind``. dest_dir : path-like or None, default ``None`` Directory to store the downloaded file. Defaults to ``/content/drive/MyDrive/zmap/h5ad`` when Google Drive is mounted, or ``<cwd>/zmap/h5ad`` otherwise. filename : str or None, default ``None`` Override the filename used when saving to disk. Inferred from the registry or URL when not provided. write_to_disk : bool, default ``True`` If ``False``, downloads to a temporary file that is not kept after loading. force_download : bool, default ``False`` Re-download the file even if it already exists on disk. chunk_size : int, default ``1 << 20`` (1 MB) Download chunk size in bytes. show_progress : bool, default ``True`` Display a ``tqdm`` progress bar during download. Returns ------- pathlib.Path Path to the downloaded (or cached) H5AD file on disk. Raises ------ ValueError If no URL can be resolved from ``kind`` or ``url``. 
Examples -------- >>> path = zmap.ref.download_zmap_h5ad() >>> path = zmap.ref.download_zmap_h5ad(kind="symphony") >>> path = zmap.ref.download_zmap_h5ad(url="https://.../my.h5ad") """ meta = H5AD_SOURCES.get(kind or "", {}) if url is None else {} final_url = url or meta.get("url") if final_url is None: raise ValueError(f"No URL provided and no registry entry for kind={kind!r}") if dest_dir is None: dest_dir_path = _default_h5ad_dir() else: dest_dir_path = pathlib.Path(dest_dir) dest_dir_path.mkdir(parents=True, exist_ok=True) if filename is not None: fname = filename elif "filename" in meta: fname = meta["filename"] else: fname = pathlib.Path(urllib.request.urlparse(final_url).path).name or "zmap.h5ad" dest_path = dest_dir_path / fname if not write_to_disk: tmp = tempfile.NamedTemporaryFile(suffix=".h5ad", delete=False) dest_path = pathlib.Path(tmp.name) tmp.close() _stream_download(final_url, dest_path, chunk_size=chunk_size, show_progress=show_progress) return dest_path if dest_path.exists() and not force_download: print(f"[ZMAP] Using cached file: {dest_path}") return dest_path _stream_download(final_url, dest_path, chunk_size=chunk_size, show_progress=show_progress) return dest_path
def preprocess_tpmlog(adata: ad.AnnData) -> None:
    """
    Add a ``tpm_log`` layer by normalizing raw counts to TPM + log1p.

    Checks whether ``adata.layers["raw_nolog"]`` exists and
    ``adata.layers["tpm_log"]`` does not. When both conditions are met,
    performs library-size normalization to a total of 1e6 per observation
    followed by ``log1p``, and stores the result as
    ``adata.layers["tpm_log"]``. Otherwise the object is left untouched.

    This is a convenience function called automatically by
    :func:`load_zmap_h5ad` when ``attempt_preprocess_tpmlog=True``.

    Parameters
    ----------
    adata : anndata.AnnData
        The dataset to preprocess. Modified in-place.

    Notes
    -----
    When the layer is computed, ``adata.X`` is overwritten during the
    computation and then cleared, so that downstream code explicitly
    selects a layer rather than relying on a stale ``.X`` matrix.
    ``adata.layers["raw_nolog"]`` itself is not modified.
    """
    if "raw_nolog" not in adata.layers or "tpm_log" in adata.layers:
        return

    print("[ZMAP] Computing 'tpm_log' from 'raw_nolog' (normalize + log1p)")

    # Work on a copy: the in-place scanpy steps below may write into the
    # array bound to ``.X``, and without the copy that array is the very
    # same object as the ``raw_nolog`` layer.
    adata.X = adata.layers["raw_nolog"].copy()
    sc.pp.normalize_total(adata, target_sum=1e6, inplace=True)
    sc.pp.log1p(adata)
    adata.layers["tpm_log"] = adata.X
    del adata.X
# --------------------------------------------------------------------
# High Level Wrapper (download if needed, load, preprocess)
# --------------------------------------------------------------------
def load_zmap_h5ad(
    *,
    kind: str | None = "processed_slim_tpm",
    url: str | None = None,
    dest_dir: str | os.PathLike | None = None,
    filename: str | None = None,
    write_to_disk: bool = True,
    use_cache: bool = True,
    force_download: bool = False,
    backed: bool | str = False,
    chunk_size: int = 1 << 20,
    show_progress: bool = True,
    attempt_preprocess_tpmlog: bool = True,
) -> ad.AnnData:
    """
    Load a ZMAP reference dataset into memory, downloading it if necessary.

    This is the primary entry point for accessing ZMAP reference data. On
    first call the file is downloaded and cached to Google Drive (if
    mounted) or a local directory. Subsequent calls in the same session are
    served from an in-memory cache.

    Load priority (fastest to slowest):

    1. In-memory session cache (no I/O).
    2. File already on disk (Drive or local), no download.
    3. Fresh download from the ZMAP CDN.

    Parameters
    ----------
    kind : str or None, default ``"processed_slim_tpm"``
        Preset dataset to load. One of:

        - ``"processed_slim_tpm"``: fully processed, TPM counts only.
          Best default for visualization and label transfer.
        - ``"processed_slim"``: fully processed, raw counts only.
        - ``"processed"``: fully processed, includes intermediate layers.
        - ``"raw"``: raw counts, unprocessed.
        - ``"symphony"``: Symphony reference used for query embedding.
          Required for ``annotate_with_zmap``.

        Ignored when ``url`` is provided.
    url : str or None, default ``None``
        Explicit download URL. Overrides ``kind``. Use this to load a
        custom or external H5AD not in the ZMAP registry.
    dest_dir : path-like or None, default ``None``
        Directory where the H5AD file is saved. Defaults to
        ``/content/drive/MyDrive/zmap/h5ad`` when Google Drive is mounted,
        or ``<cwd>/zmap/h5ad`` otherwise.
    filename : str or None, default ``None``
        Override the filename used when saving to disk. Inferred from the
        registry or URL when not provided.
    write_to_disk : bool, default ``True``
        If ``False``, downloads to a temporary file that is deleted after
        loading. Useful for one-off loads when disk space is constrained.
        Forced back to ``True`` (with a notice) when ``backed`` is set,
        because a backed object needs its file to stay on disk.
    use_cache : bool, default ``True``
        If ``True``, return the cached in-memory object on repeat calls
        and store newly loaded (non-backed) objects in the cache. Set to
        ``False`` to force a fresh load from disk (e.g. after modifying
        the file externally).
    force_download : bool, default ``False``
        Re-download the file even if it already exists on disk. Also
        bypasses the in-memory cache lookup, so the returned object always
        comes from the fresh download.
    backed : bool or str, default ``False``
        Open the H5AD in backed mode. Pass ``True`` for read-only
        (``"r"``), or a mode string (e.g. ``"r+"``) for read-write.
        Backed objects are never stored in the in-memory cache.
    chunk_size : int, default ``1 << 20`` (1 MB)
        Download chunk size in bytes.
    show_progress : bool, default ``True``
        Display a ``tqdm`` progress bar while downloading.
    attempt_preprocess_tpmlog : bool, default ``True``
        If the loaded object has a ``raw_nolog`` layer but no ``tpm_log``
        layer, compute ``tpm_log`` via TPM normalization + log1p and add
        it as a layer. Has no effect if ``tpm_log`` is already present or
        if ``backed`` is set.

    Returns
    -------
    anndata.AnnData
        The loaded reference dataset. With ``use_cache=True`` the same
        object is handed out on every call, so in-place modifications are
        visible to later callers.

    Raises
    ------
    ValueError
        If no URL can be resolved from ``kind`` or ``url``.

    Examples
    --------
    >>> adata_ref = zmap.ref.load_zmap_h5ad()  # default: processed_slim_tpm
    >>> adata_ref = zmap.ref.load_zmap_h5ad(kind="symphony")  # for annotate_with_zmap
    >>> adata_ref = zmap.ref.load_zmap_h5ad(url="https://.../my.h5ad", filename="my.h5ad")
    """
    if backed and not write_to_disk:
        print("[ZMAP] backed=True requires write_to_disk=True; overriding.")
        write_to_disk = True

    # ------------------------------------------------------------------
    # 1) In-memory cache (fastest, same session)
    # ------------------------------------------------------------------
    cache_key = (kind or "custom", url, bool(backed))
    # An explicit force_download must not be answered with a stale object.
    if use_cache and not force_download and cache_key in _H5AD_CACHE:
        return _H5AD_CACHE[cache_key]

    # ------------------------------------------------------------------
    # 2) Download file if needed (no-op if already on disk)
    # ------------------------------------------------------------------
    path = download_zmap_h5ad(
        kind=kind,
        url=url,
        dest_dir=dest_dir,
        filename=filename,
        write_to_disk=write_to_disk,
        force_download=force_download,
        chunk_size=chunk_size,
        show_progress=show_progress,
    )

    try:
        # --------------------------------------------------------------
        # 3) Load
        # --------------------------------------------------------------
        if backed:
            backed_mode = "r" if backed is True else backed
            print(f"[ZMAP] Loading (backed={backed_mode!r}) from {path}")
            adata = ad.read_h5ad(path, backed=backed_mode)
        else:
            print(f"[ZMAP] Loading into memory from {path}")
            adata = ad.read_h5ad(path)

        # --------------------------------------------------------------
        # 4) Optional preprocessing
        # --------------------------------------------------------------
        if attempt_preprocess_tpmlog and not backed:
            preprocess_tpmlog(adata)
    finally:
        # --------------------------------------------------------------
        # 5) Clean up if not keeping persistent copy; runs even when
        #    loading or preprocessing failed
        # --------------------------------------------------------------
        if not write_to_disk:
            try:
                os.unlink(path)
            except OSError:
                # Best effort: a leftover temp file is not worth failing for.
                pass

    if use_cache and not backed:
        _H5AD_CACHE[cache_key] = adata

    return adata