Source code for bsv.fetch_connectivity_data

import os
import json
import numpy as np
import pandas as pd

from .fetch_connectivity_summary import fetch_connectivity_summary, load_injection_summary
from .fetch_connectivity_images import fetch_connectivity_images, PROJECTION_GRID_SIZE
from .atlas_utils import load_projection_info



[docs]
def fetch_connectivity_data(experiment_ids, save_location, file_name,
                            normalization_method, subtract_other_hemisphere,
                            grouping_method='', allen_atlas_path='',
                            load_all=False, input_regions=None, region_groups=None,
                            export_metadata=True, reload=False,
                            atlas_type='allen', atlas_resolution=10,
                            grouping_bins=5):
    """Download and cache projection density maps for a set of experiments.

    Parameters
    ----------
    experiment_ids : list of int
        Experiment IDs returned by :func:`find_connectivity_experiments`.
    save_location : str
        Local directory for cached downloads.
    file_name : str
        Base name for the cached metadata CSV. ``''`` to skip caching.
    normalization_method : str
        ``'none'`` or ``'injectionIntensity'`` (divide by injection volume).
    subtract_other_hemisphere : bool
        Subtract the contralateral hemisphere signal.
    grouping_method : str, optional
        Group experiments by ``'AP'``, ``'ML'``, ``'DV'``, or ``''`` (no
        grouping). For ``'AP'``/``'ML'``/``'DV'`` the continuous injection-site
        coordinate is binned into *grouping_bins* equal-width levels.
    grouping_bins : int, optional
        Number of equal-width bins (display rows) to split the injection
        coordinate into when *grouping_method* is ``'AP'``/``'ML'``/``'DV'``.
        Default is 5. Empty bins are dropped.
    allen_atlas_path : str, optional
        Path to the Allen CCF atlas directory.
    load_all : bool, optional
        If True, also return individual (non-averaged) projection volumes.
    input_regions : list of str, optional
        Region acronyms for region-based grouping.
    region_groups : list of int, optional
        Group assignment per region (same length as *input_regions*).
    export_metadata : bool, optional
        Write a per-experiment metadata CSV.
    reload : bool, optional
        Re-download even if cached files exist.
    atlas_type : str, optional
        Atlas type (default ``'allen'``).
    atlas_resolution : int, optional
        Atlas resolution in micrometres (10 or 20).

    Returns
    -------
    combined_projection : numpy.ndarray
        Averaged projection density array (AP x DV x ML x groups).
    combined_injection_info : dict
        Aggregated injection metadata across experiments.
    individual_projections : numpy.ndarray or None
        Per-experiment volumes when *load_all* is True, else None.
    experiment_region_info : dict
        Per-experiment region metadata.
    """
    if not grouping_method:
        grouping_method = 'NaN'
    if input_regions is None:
        input_regions = []
    if region_groups is None:
        region_groups = []

    os.makedirs(save_location, exist_ok=True)

    # Load projection info CSV
    projection_info = load_projection_info()

    n_exp = len(experiment_ids)
    primary_structure_AP = np.zeros(n_exp)
    primary_structure_DV = np.zeros(n_exp)
    primary_structure_ML = np.zeros(n_exp)
    primary_structure_ID = np.zeros(n_exp, dtype=int)
    primary_structure_abbreviation = [''] * n_exp

    # Collect all injection info
    combined_injection_info = {}

    print(f'Loading {n_exp} experiments...')

    for i, exp_id in enumerate(experiment_ids):
        save_dir = os.path.join(save_location, str(exp_id))
        os.makedirs(save_dir, exist_ok=True)

        # Fetch summary if needed
        summary_path = os.path.join(save_dir, 'injectionSummary_all.json')
        if not os.path.exists(summary_path):
            status = fetch_connectivity_summary(exp_id, save_dir)
            if not status:
                continue

        injection_info = load_injection_summary(save_dir)

        # Accumulate injection info
        for entry in injection_info:
            for key, val in entry.items():
                if key not in combined_injection_info:
                    combined_injection_info[key] = []
                combined_injection_info[key].append(val)

        # Get metadata from projection info CSV
        row = projection_info[projection_info['id'] == exp_id]
        if len(row) > 0:
            row = row.iloc[0]
            primary_structure_ID[i] = row['structure_id']
            primary_structure_abbreviation[i] = row['structure_abbrev']
            coords_str = str(row['injection_coordinates'])
            coords = [float(x) for x in coords_str.strip('[]').split(',')]
            if len(coords) >= 3:
                primary_structure_AP[i] = coords[0]
                primary_structure_DV[i] = coords[1]
                primary_structure_ML[i] = coords[2]

    # Print experiment summary
    exp_info = projection_info[projection_info['id'].isin(experiment_ids)]
    print(f'\nEXPERIMENT SUMMARY')
    print(f'Total experiments loaded: {n_exp}')

    print(f'\nMouse genotype distribution:')
    genotypes = exp_info['transgenic_line'].fillna('Wild-type').replace({'': 'Wild-type', '""': 'Wild-type'})
    for gt, count in genotypes.value_counts().items():
        print(f'  {gt}: {count} experiments')

    print(f'\nBrain region distribution:')
    for reg, count in exp_info['structure_abbrev'].value_counts().items():
        print(f'  {reg}: {count} experiments')

    # Grouping
    group_centers = None   # mean injection coordinate per group (for row labels)
    grouping_axis = None   # 'AP' / 'ML' / 'DV' when grouping by injection location
    if grouping_method in ('AP', 'ML', 'DV'):
        coord_map = {'AP': primary_structure_AP, 'ML': primary_structure_ML, 'DV': primary_structure_DV}
        vals = np.asarray(coord_map[grouping_method], dtype=float)
        # An injection coordinate of 0 means it could not be parsed from the CSV.
        finite = np.isfinite(vals) & (vals != 0)
        n_bins = max(1, int(grouping_bins))
        group_id = np.zeros(n_exp, dtype=int)
        if finite.any() and n_bins > 1 and vals[finite].max() > vals[finite].min():
            # Bin the continuous injection coordinate into equal-width levels
            # (anterior->posterior for AP) so each level becomes one display row.
            edges = np.linspace(vals[finite].min(), vals[finite].max(), n_bins + 1)
            bin_idx = np.clip(np.digitize(vals[finite], edges) - 1, 0, n_bins - 1)
            group_id[finite] = bin_idx + 1
            # Drop empty bins and renumber 1..n_groups so there are no blank rows.
            present = sorted(set(group_id[group_id > 0].tolist()))
            remap = {g: i + 1 for i, g in enumerate(present)}
            group_id = np.array([remap.get(g, 0) for g in group_id], dtype=int)
            n_groups = len(present)
        else:
            group_id[:] = 1
            n_groups = 1
        # Exact center of each group = mean injection coordinate of its experiments.
        grouping_axis = grouping_method
        group_centers = np.array([
            float(np.mean(vals[group_id == g + 1])) if np.any(group_id == g + 1) else np.nan
            for g in range(n_groups)
        ])
        print(f'Grouping by {grouping_method} into {n_groups} bin(s) '
              f'({int((group_id > 0).sum())} experiments)')
    elif grouping_method == 'NaN' or not grouping_method:
        group_id = np.ones(n_exp, dtype=int)
        n_groups = 1
    else:
        print(f'Warning: grouping method "{grouping_method}" not recognized - skipping grouping.')
        group_id = np.ones(n_exp, dtype=int)
        n_groups = 1

    # Region-based grouping override
    if input_regions and region_groups:
        unique_rg = sorted(set(region_groups))
        n_region_groups = len(unique_rg)
        rg_cell = {g: [j for j, rg in enumerate(region_groups) if rg == g] for g in unique_rg}

        region_based_group_id = np.zeros(n_exp, dtype=int)
        for i in range(n_exp):
            abbrev = primary_structure_abbreviation[i]
            region_idx = -1
            for j, reg in enumerate(input_regions):
                if abbrev == reg or abbrev.startswith(reg):
                    region_idx = j
                    break
            if region_idx >= 0:
                for g_idx, g in enumerate(unique_rg):
                    if region_idx in rg_cell[g]:
                        region_based_group_id[i] = g_idx + 1
                        break

        n_groups = n_region_groups
        group_id = region_based_group_id
        group_centers = None   # region grouping is labelled by region name, not coordinate
        grouping_axis = None
        print(f'Region-based grouping: {np.sum(group_id > 0)} experiments mapped to {n_groups} groups')

    # Initialize combined projection
    combined_projection = np.zeros((*PROJECTION_GRID_SIZE, n_groups))
    individual_projections = np.zeros((*PROJECTION_GRID_SIZE, n_exp)) if load_all else None

    # Load raw images and accumulate
    print('Getting raw images...')
    for i, exp_id in enumerate(experiment_ids):
        curr_group = group_id[i]
        if curr_group == 0:
            continue

        raw_path = os.path.join(save_location, str(exp_id), 'density.raw')
        if not os.path.exists(raw_path):
            status = fetch_connectivity_images(exp_id, os.path.join(save_location, str(exp_id)))
            if not status:
                continue

        # Read density.raw - MATLAB reads column-major (Fortran order)
        experiment_projection = np.fromfile(raw_path, dtype='<f4').reshape(PROJECTION_GRID_SIZE, order='F')

        # Normalize by injection volume (mm^3).
        # The density.raw values are already fractional (0-1), so dividing
        # by injection volume makes experiments with different injection
        # sizes comparable.
        if normalization_method in ('injectionVolume', 'injectionIntensity'):
            exp_indices = np.array(combined_injection_info.get('experimentID', [])) == exp_id
            struct_indices = exp_indices & (np.array(combined_injection_info.get('structure_id', [])) == 997)

            hem_ids = np.array(combined_injection_info.get('hemisphere_id', []))
            hem3 = struct_indices & (hem_ids == 3)

            vol_arr = np.array(combined_injection_info.get('projection_volume', []), dtype=float)
            inj_vol = vol_arr[hem3].max() if hem3.any() else 0

            if inj_vol == 0:
                print(f'Warning: Invalid injection volume for experiment {exp_id}, skipping normalization')
                inj_vol = 1

            experiment_projection = experiment_projection / inj_vol

        # Subtract other hemisphere
        if subtract_other_hemisphere:
            injection_info = load_injection_summary(os.path.join(save_location, str(exp_id)))
            max_voxel_z = injection_info[0].get('max_voxel_z', 57)
            tmp = np.zeros_like(experiment_projection)
            if max_voxel_z <= 57:  # left hemisphere
                for ml in range(57):
                    tmp[:, :, ml] = experiment_projection[:, :, ml] - experiment_projection[:, :, 113 - ml]
            else:  # right hemisphere
                for ml in range(57):
                    tmp[:, :, 113 - ml] = experiment_projection[:, :, 113 - ml] - experiment_projection[:, :, ml]
            experiment_projection = tmp

        if load_all:
            individual_projections[:, :, :, i] = experiment_projection
        combined_projection[:, :, :, curr_group - 1] += experiment_projection

    # Average by group count
    for g in range(n_groups):
        count = np.sum(group_id == g + 1)
        if count > 0:
            combined_projection[:, :, :, g] /= count

    # Build experiment region info
    experiment_region_info = {
        'experimentIDs': experiment_ids,
        'primaryStructure_ID': primary_structure_ID,
        'primaryStructure_abbreviation': primary_structure_abbreviation,
        'primaryStructure_AP': primary_structure_AP,
        'primaryStructure_DV': primary_structure_DV,
        'primaryStructure_ML': primary_structure_ML,
        'group_centers': group_centers,
        'grouping_axis': grouping_axis,
    }

    # Export metadata CSV
    if export_metadata:
        csv_base = file_name if file_name else 'connectivity_data'
        meta = pd.DataFrame()
        meta['ExperimentID'] = experiment_ids

        exp_info_ordered = projection_info[projection_info['id'].isin(experiment_ids)]
        idx_map = {eid: i for i, eid in enumerate(experiment_ids)}
        exp_info_ordered = exp_info_ordered.copy()
        exp_info_ordered['_sort'] = exp_info_ordered['id'].map(idx_map)
        exp_info_ordered = exp_info_ordered.sort_values('_sort').drop(columns='_sort')

        meta['MouseLine'] = exp_info_ordered['transgenic_line'].fillna('Wild-type').replace(
            {'': 'Wild-type', '""': 'Wild-type'}).values
        meta['InjectionRegion'] = primary_structure_abbreviation
        meta['InjectionRegionID'] = primary_structure_ID
        meta['InjectionAP'] = primary_structure_AP
        meta['InjectionDV'] = primary_structure_DV
        meta['InjectionML'] = primary_structure_ML

        # Injection volume/intensity from combined info
        inj_volumes = np.zeros(n_exp)
        inj_intensities = np.zeros(n_exp)
        exp_id_arr = np.array(combined_injection_info.get('experimentID', []))
        struct_id_arr = np.array(combined_injection_info.get('structure_id', []))
        hem_id_arr = np.array(combined_injection_info.get('hemisphere_id', []))
        pv_arr = np.array(combined_injection_info.get('projection_volume', []), dtype=float)
        spi_arr = np.array(combined_injection_info.get('sum_pixel_intensity', []), dtype=float)

        for j, eid in enumerate(experiment_ids):
            mask = (exp_id_arr == eid) & (struct_id_arr == 997) & (hem_id_arr == 3)
            if mask.any():
                inj_volumes[j] = pv_arr[mask].max()
                inj_intensities[j] = spi_arr[mask].max()

        meta['InjectionVolume'] = inj_volumes
        meta['InjectionIntensity'] = inj_intensities

        if 'sex' in exp_info_ordered.columns:
            meta['Sex'] = exp_info_ordered['sex'].values
        if 'age' in exp_info_ordered.columns:
            meta['Age'] = exp_info_ordered['age'].values

        csv_path = os.path.join(save_location, f'{csv_base}_metadata.csv')
        meta.to_csv(csv_path, index=False)
        print(f'Metadata exported to: {csv_path}')

    print('Data processing complete.')
    return combined_projection, combined_injection_info, individual_projections, experiment_region_info