Skip to content

Data Splitting

File discovery and train/valid/test splitting utilities.

get_hdf_files

get_hdf_files(path: Path | str, recurse: bool = True) -> list[Path]

Find .hdf5/.h5 files under path, searching subdirectories recursively unless recurse is False.

Source code in tsfast/tsdata/split.py
def get_hdf_files(path: Path | str, recurse: bool = True) -> list[Path]:
    """Collect entries under ``path`` whose suffix is in ``HDF_EXTENSIONS``.

    Args:
        path: location to search; a path that does not exist yields ``[]``
        recurse: walk the whole tree below ``path`` when True, otherwise
            inspect only the direct children of ``path``

    Returns:
        the matching paths in sorted order
    """
    root = Path(path)
    if not root.exists():
        return []
    # Pick the traversal once, then apply the same suffix filter to both modes.
    candidates = root.rglob("*") if recurse else root.iterdir()
    matches = [entry for entry in candidates if entry.suffix in HDF_EXTENSIONS]
    matches.sort()
    return matches

discover_split_files

discover_split_files(path: Path | str, train_name: str = 'train', valid_name: str = 'valid', test_name: str = 'test') -> dict[str, list[Path]]

Auto-discover train/valid/test HDF5 files by parent directory name.

Parameters:

Name Type Description Default
path Path | str

root directory containing train/valid/test subdirectories

required
train_name str

name of training subdirectory

'train'
valid_name str

name of validation subdirectory

'valid'
test_name str

name of test subdirectory

'test'
Source code in tsfast/tsdata/split.py
def discover_split_files(
    path: Path | str,
    train_name: str = "train",
    valid_name: str = "valid",
    test_name: str = "test",
) -> dict[str, list[Path]]:
    """Group the HDF5 files found under ``path`` by their parent directory name.

    Only the name of a file's immediate parent directory is compared; files
    whose parent matches none of the given names are left out.

    Args:
        path: root directory containing train/valid/test subdirectories
        train_name: name of training subdirectory
        valid_name: name of validation subdirectory
        test_name: name of test subdirectory

    Returns:
        dict with the keys ``"train"``, ``"valid"`` and ``"test"``, each
        mapping to the list of files whose parent directory has that name
    """
    found = get_hdf_files(Path(path))

    def located_in(dir_name: str) -> list[Path]:
        # Keep the discovery order of the files sitting directly in ``dir_name``.
        return [item for item in found if item.parent.name == dir_name]

    return {
        "train": located_in(train_name),
        "valid": located_in(valid_name),
        "test": located_in(test_name),
    }

split_by_parent

split_by_parent(files: list, train_name: str = 'train', valid_name: str = 'valid') -> tuple[list[int], list[int]]

Return (train_indices, valid_indices) based on parent directory names.

Parameters:

Name Type Description Default
files list

list of file paths

required
train_name str

parent directory name for training files

'train'
valid_name str

parent directory name for validation files

'valid'
Source code in tsfast/tsdata/split.py
def split_by_parent(
    files: list,
    train_name: str = "train",
    valid_name: str = "valid",
) -> tuple[list[int], list[int]]:
    """Partition file positions by the name of each file's parent directory.

    Args:
        files: list of file paths
        train_name: parent directory name for training files
        valid_name: parent directory name for validation files

    Returns:
        (train_indices, valid_indices): positions into ``files`` whose
        immediate parent directory is named ``train_name`` / ``valid_name``;
        files under any other directory appear in neither list
    """

    def positions_under(dir_name: str) -> list[int]:
        # Only the immediate parent directory is compared, not any ancestor.
        return [
            position
            for position, entry in enumerate(files)
            if Path(entry).parent.name == dir_name
        ]

    return positions_under(train_name), positions_under(valid_name)

split_by_percentage

split_by_percentage(files: list, pct: float = 0.8) -> tuple[list[int], list[int]]

Sequential percentage split: the first pct fraction of the items forms the first split and the remaining items form the second.

Parameters:

Name Type Description Default
files list

list of items to split

required
pct float

fraction of items assigned to the first split

0.8
Source code in tsfast/tsdata/split.py
def split_by_percentage(files: list, pct: float = 0.8) -> tuple[list[int], list[int]]:
    """Sequential percentage split.

    The first ``pct`` fraction of positions forms the first split and the
    remaining positions form the second; the order of ``files`` is kept and
    nothing is shuffled.

    Args:
        files: list of items to split
        pct: fraction of items assigned to the first split; values outside
            [0, 1] are clamped so every returned index is valid for ``files``

    Returns:
        (first_indices, second_indices): two lists of positions into
        ``files`` that together cover every position exactly once
    """
    n_items = len(files)
    # Tiny tolerance so float error (100 * 0.29 == 28.999999999999996) does
    # not truncate the split point one item short.
    split_idx = int(n_items * pct + 1e-9)
    # Clamp: an unclamped index would emit positions past the end of
    # ``files`` (pct > 1) or negative, duplicated positions (pct < 0).
    split_idx = max(0, min(n_items, split_idx))
    return list(range(split_idx)), list(range(split_idx, n_items))

is_dataset_directory

is_dataset_directory(path: Path | str) -> bool

Check whether path contains train, valid, and test subdirectories that each hold at least one HDF5 file.

Source code in tsfast/tsdata/split.py
def is_dataset_directory(path: Path | str) -> bool:
    """Tell whether ``path`` follows the train/valid/test dataset layout.

    Returns True only when each of the ``train``, ``valid`` and ``test``
    entries under ``path`` is a directory in which ``get_hdf_files`` finds
    at least one file; the first entry failing either check ends the scan.
    """
    # Lazy generator: later subdirectories are not touched once one fails.
    split_dirs = (os.path.join(path, name) for name in ("train", "valid", "test"))
    return all(
        os.path.isdir(split_dir) and bool(get_hdf_files(split_dir))
        for split_dir in split_dirs
    )