# Source code for analysis.load

"""
Load data (from .rhd, .txt, .wav, etc)
"""
import numpy as np


def read_not_mat(notmat, unit="ms"):
    """
    Read from .not.mat files generated from uisonganal.m

    The file is parsed once. Onsets and offsets are taken from the first column
    of the stored arrays and converted to float64, so that unit conversion (and
    any later arithmetic by the caller) also works when the file stores integers.

    Parameters
    ----------
    notmat : path
        Name of the .not.mat file (str or pathlib.Path)
    unit : {'ms', 'second'}
        Timescale of the returned time values. The stored values are treated
        as ms; any value other than 'second' returns them unscaled.

    Returns
    -------
    onsets : np.ndarray
        time stamp for syllable onset (in ``unit``)
    offsets : np.ndarray
        time stamp for syllable offset (in ``unit``)
    intervals : np.ndarray
        temporal interval between syllables (i.e. syllable gaps) (in ``unit``);
        one element shorter than ``onsets``
    durations : np.ndarray
        durations of each syllable (in ``unit``)
    syllables : str
        song syllables (first element of the stored 'syllables' array)
    contexts : str or None
        social context ('U' for undirected and 'D' for directed), taken from the
        first letter of the last '_'-separated tag of the file name; None if the
        file name carries no such tag
    """
    from pathlib import Path

    import scipy.io

    notmat = Path(notmat)  # accept plain strings as well as Path objects
    mat = scipy.io.loadmat(str(notmat))  # load the file a single time

    # loadmat returns 2-D arrays; the first column holds the time stamps
    onsets = np.asarray(mat["onsets"], dtype=np.float64).transpose()[0]
    offsets = np.asarray(mat["offsets"], dtype=np.float64).transpose()[0]
    intervals = onsets[1:] - offsets[:-1]  # syllable gap durations (interval)
    durations = offsets - onsets  # duration of each syllable
    syllables = mat["syllables"][0]  # Load the syllable info

    # extract 'U' (undirected) or 'D' (directed) from the file name;
    # slicing (instead of indexing) keeps an empty tag from raising IndexError
    tag = notmat.name.split(".")[0].split("_")[-1]
    contexts = tag[:1].upper()
    if contexts not in ["U", "D"]:  # if the file was not tagged with Undir or Dir
        contexts = None

    # units are in ms by default, but convert to second with the argument
    if unit == "second":
        onsets = onsets / 1e3
        offsets = offsets / 1e3
        intervals = intervals / 1e3
        durations = durations / 1e3

    return onsets, offsets, intervals, durations, syllables, contexts


def read_spk_txt(spk_txt_file, *unit_nb, time_unit="second"):
    """
    Read the output .txt from the Offline Sorter.
    column header of the input .txt -> ['Channel', 'Unit', 'Timestamp']
    disregard the first column since it is always 1
    the columns after 'Timestamp' store the waveforms

    Parameters
    ----------
    spk_txt_file : str
        Name of the spk txt file (tab-delimited, one header row)
    unit_nb : int
        Number(s) of the sorted unit(s) to keep. Several unit numbers may be
        passed; spikes belonging to any of them are returned. If not specified
        (default), it will read data from all recorded units.
    time_unit : {'second', 'ms'}
        Timescale of the returned timestamps. The file values are returned
        unchanged unless 'ms' is given, in which case they are multiplied by 1e3.

    Returns
    -------
    spk_ts : np.ndarray
        Spike timestamps
    spk_waveform : np.ndarray
        Spike waveform (spk id x waveform)
    nb_spk : int
        Number of spikes
    """

    # skip header; ndmin=2 keeps the (spike x column) layout for one-spike files
    spk_info = np.loadtxt(spk_txt_file, delimiter="\t", skiprows=1, ndmin=2)

    # Select only the unit (there could be multiple isolated units in the same file)
    if unit_nb:  # if the unit number is specified
        # unit_nb is a tuple (var-positional), so test membership rather than
        # equality; this works for one or several requested units
        spk_info = spk_info[np.isin(spk_info[:, 1], unit_nb), :]

    spk_ts = spk_info[:, 2]  # analysis time stamps
    spk_waveform = spk_info[:, 3:]  # analysis waveform
    nb_spk = spk_waveform.shape[0]  # total number of spikes

    # units are in second by default, but convert to  millisecond with the argument
    if time_unit == "ms":
        spk_ts = spk_ts * 1e3

    return spk_ts, spk_waveform, nb_spk


def read_rhd(filename):
    """
    Load an Intan Technologies RHD2000 data file (as written by the evaluation
    board GUI) by delegating to the bundled Intan loader.

    Parameters
    ----------
    filename : path
        Name of the .rhd file; passed to the loader unchanged

    Returns
    -------
    The loader's result, returned as-is (a dictionary, kept that way for
    future extensibility).
    """
    from ..utils.intan.load_intan_rhd_format import read_rhd as _load_intan_rhd

    return _load_intan_rhd(filename)


def load_song(data_path, format="wav") -> dict:
    """
    Obtain event info & serialized timestamps for song & neural analysis

    Search all files in the sub-directory and read from the associated .not.mat
    files to add the info into a single dictionary.

    Audio files are laid end to end on one continuous timeline (in ms): the
    first file starts at t = 0 and every following file starts one sample
    period after the end of the previous one. Syllable onsets/offsets read
    from each .not.mat file are shifted onto that timeline.

    Parameters
    ----------
    data_path : path
        Directory to search (str or pathlib.Path). If it is itself named
        'Songs' it is used directly, otherwise every 'Songs' folder below it
        is searched.
    format : str
        file extension of the audio files, passed to ``list_files``
        (e.g., 'wav')

    Returns
    -------
    song_info : dict
        One list entry per audio file under each key:
        'files' (file stem), 'file_start' and 'file_end' (ms on the serialized
        timeline), 'onsets', 'offsets', 'durations', 'syllables' (each passed
        through ``demarcate_bout``) and 'contexts'.
    """
    from pathlib import Path

    from scipy.io import wavfile

    from ..analysis.functions import demarcate_bout
    from ..utils.functions import list_files

    data_path = Path(data_path)  # accept plain strings as well as Path objects

    # Find the folder(s) that hold song data (not calls)
    if data_path.stem == "Songs":
        song_dirs = [data_path]
    else:
        song_dirs = list(data_path.rglob("Songs"))

    # List all audio files in those folders
    audio_files = []
    for song_dir in song_dirs:
        audio_files += list_files(song_dir, format)

    # Store values in these lists
    file_list = []
    file_start_list = []
    file_end_list = []
    onset_list = []
    offset_list = []
    duration_list = []
    syllable_list = []
    context_list = []

    # Start time (ms) of the next file on the serialized timeline. Only the
    # first/last time stamp of each file is ever needed, so they are tracked as
    # float64 scalars instead of building (and repeatedly re-copying) a
    # per-sample float32 array, which also lost precision on long sessions.
    next_start = 0.0

    # Loop through audio files
    for file in audio_files:

        # Load audio files
        print("Loading... " + file.stem)
        sample_rate, data = wavfile.read(file)
        length_ms = data.shape[0] / sample_rate * 1e3  # file length in ms

        # Load the .not.mat file that sits next to the audio file
        # (<audio file name>.not.mat, whatever the audio extension is)
        notmat_file = file.with_name(file.name + ".not.mat")
        onsets, offsets, intervals, durations, syllables, contexts = read_not_mat(
            notmat_file, unit="ms"
        )

        file_start = next_start
        file_end = file_start + length_ms
        # the next file begins one sample period later; 1e3 / sample_rate keeps
        # that gap in ms like the rest of the timeline
        next_start = file_end + 1e3 / sample_rate

        # File information (name, start & end timestamp of each file)
        file_list.append(file.stem)
        file_start_list.append(file_start)  # in ms
        file_end_list.append(file_end)  # in ms

        # Shift the events onto the serialized timeline (out of place, so that
        # integer-typed arrays do not make the float offset fail)
        onsets = onsets + file_start
        offsets = offsets + file_start

        # Demarcate song bouts
        onset_list.append(demarcate_bout(onsets, intervals))
        offset_list.append(demarcate_bout(offsets, intervals))
        duration_list.append(demarcate_bout(durations, intervals))
        syllable_list.append(demarcate_bout(syllables, intervals))
        context_list.append(contexts)

    # Organize event-related info into a single dictionary object
    song_info = {
        "files": file_list,
        "file_start": file_start_list,
        "file_end": file_end_list,
        "onsets": onset_list,
        "offsets": offset_list,
        "durations": duration_list,
        "syllables": syllable_list,
        "contexts": context_list,
    }
    return song_info


def load_audio(data_path, format="wav") -> dict:
    """
    Load and concatenate all audio files (e.g., .wav) in the input dir (path)

    The samples of every file are flattened and joined into one float64 array,
    and a matching time axis (in ms, starting at 0) is generated. The result is
    also written to ``AudioData.npy`` inside ``data_path``.

    Parameters
    ----------
    data_path : path
        Directory holding the audio files (str or pathlib.Path)
    format : str
        file extension of the audio files, passed to ``list_files``
        (e.g., 'wav')

    Returns
    -------
    audio_info : dict
        'files' (file names in load order), 'timestamp' (ms, one per sample),
        'data' (concatenated samples) and 'sample_rate'.

    Raises
    ------
    FileNotFoundError
        If no audio file is found in ``data_path``.
    """

    from pathlib import Path

    from scipy.io import wavfile

    from ..utils.functions import list_files

    data_path = Path(data_path)  # accept plain strings as well as Path objects

    # List all audio files in the dir
    files = list_files(data_path, format)

    # Collect per-file samples and join them once after the loop; the leading
    # empty float64 array keeps the concatenated result float64
    chunks = [np.array([], dtype=np.float64)]
    file_list = []
    sample_rate = None

    # Loop through audio files
    for file in files:
        # Load data file
        print("Loading... " + file.stem)
        sample_rate, data = wavfile.read(file)

        chunks.append(np.ravel(data))
        file_list.append(file.name)

    if sample_rate is None:
        raise FileNotFoundError(
            "No '{}' audio files found in {}".format(format, data_path)
        )

    data_concat = np.concatenate(chunks)

    # Create timestamps (ms). An integer range divided by the rate always has
    # exactly one time stamp per sample, unlike np.arange with a float step.
    # NOTE(review): uses the sample rate of the last file read; this assumes
    # all files share one rate -- confirm.
    timestamp_concat = np.arange(data_concat.shape[0]) / sample_rate * 1e3

    # Organize data into a dictionary
    audio_info = {
        "files": file_list,
        "timestamp": timestamp_concat,
        "data": data_concat,
        "sample_rate": sample_rate,
    }
    file_name = data_path / "AudioData.npy"
    np.save(file_name, audio_info)

    return audio_info