Skip to content

Normalization

Normalization statistics computation for time series datasets.

NormPair dataclass

NormPair(mean: ndarray, std: ndarray, min: ndarray, max: ndarray)

Per-signal normalization statistics (mean, std, min, max as 1-D numpy arrays).

Parameters:

Name Type Description Default
mean ndarray

per-feature mean values

required
std ndarray

per-feature standard deviation values

required
min ndarray

per-feature minimum values

required
max ndarray

per-feature maximum values

required

__add__

__add__(other: NormPair) -> NormPair

Concatenate two NormPairs feature-wise.

Source code in tsfast/tsdata/norm.py
def __add__(self, other: "NormPair") -> "NormPair":
    """Build a new NormPair by joining this one with ``other`` feature-wise.

    The items yielded by iterating ``self`` and ``other`` are paired up
    positionally and each pair is joined with ``np.hstack``.
    """
    joined = [np.hstack([mine, theirs]) for mine, theirs in zip(self, other)]
    return NormPair(*joined)

NormStats

Bases: NamedTuple

Normalization statistics for input and output signals.

Parameters:

Name Type Description Default
u

normalization stats for input signals

required
y

normalization stats for output signals

required

compute_stats_from_files

compute_stats_from_files(files: list, signals: list[str]) -> NormPair | None

Compute exact NormPair (mean, std, min, max) from all samples in HDF5 files.

Parameters:

Name Type Description Default
files list

paths to HDF5 files

required
signals list[str]

signal dataset names within each file

required
Source code in tsfast/tsdata/norm.py
def compute_stats_from_files(files: list, signals: list[str]) -> NormPair | None:
    """Compute exact NormPair (mean, std, min, max) from all samples in HDF5 files.

    NaN samples are ignored. All running sums are accumulated in float64 and
    the final statistics are cast to float32. The standard deviation is the
    population standard deviation (divides by the number of valid samples).

    Args:
        files: paths to HDF5 files
        signals: signal dataset names within each file

    Returns:
        A NormPair with one entry per signal, or None when ``signals`` is
        empty. A signal without any valid (non-NaN) sample gets a NaN mean and
        std, a min of +inf and a max of -inf.

    Raises:
        ValueError: if a dataset has more than one dimension.
    """
    n_signals = len(signals)
    if n_signals == 0:
        return None

    sums = np.zeros(n_signals)
    squares = np.zeros(n_signals)
    mins = np.full(n_signals, np.inf)
    maxs = np.full(n_signals, -np.inf)
    counts = np.zeros(n_signals)

    for file in files:
        with h5py.File(file, "r") as f:
            for i, signal in enumerate(signals):
                data = f[signal][:]
                if data.ndim > 1:
                    raise ValueError(f"Each dataset in a file has to be 1d. {signal} is {data.ndim}.")
                # Promote to float64 before squaring/summing so integer or
                # low-precision datasets cannot overflow or lose precision.
                valid = np.asarray(data, dtype=np.float64)
                valid = valid[~np.isnan(valid)]
                if valid.size == 0:
                    # Empty or all-NaN dataset: contributes nothing, and
                    # min()/max() on an empty array would raise.
                    continue
                sums[i] += valid.sum()
                squares[i] += np.square(valid).sum()
                mins[i] = min(mins[i], valid.min())
                maxs[i] = max(maxs[i], valid.max())
                counts[i] += valid.size

    means = sums / counts
    # E[x^2] - E[x]^2 can come out slightly negative from rounding (e.g. for a
    # constant signal); clamp at zero so the square root does not yield NaN.
    # np.maximum propagates NaN, so signals without samples stay NaN.
    variances = np.maximum(squares / counts - means**2, 0.0)
    stds = np.sqrt(variances)
    return NormPair(
        means.astype(np.float32),
        stds.astype(np.float32),
        mins.astype(np.float32),
        maxs.astype(np.float32),
    )

compute_stats

compute_stats(dl, n_batches: int = 10) -> tuple[NormPair, ...]

Estimate per-feature mean/std/min/max from training batches.

Parameters:

Name Type Description Default
dl

DataLoader to sample from

required
n_batches int

number of batches to use for estimation

10
Source code in tsfast/tsdata/norm.py
def compute_stats(dl, n_batches: int = 10) -> tuple[NormPair, ...]:
    """Estimate per-feature mean/std/min/max from training batches.

    The first ``n_batches`` batches of ``dl`` are collected. For every position
    in the batch tuple, the tensors are concatenated, all leading dimensions
    are flattened, and NaN-aware statistics are computed over the last
    (feature) dimension. The standard deviation is the population standard
    deviation.

    Args:
        dl: DataLoader to sample from
        n_batches: number of batches to use for estimation

    Returns:
        One NormPair per element of a batch, in batch order, with float32
        numpy arrays.

    Raises:
        ValueError: if ``n_batches`` is not positive or ``dl`` yields no batch.
    """
    if n_batches <= 0:
        raise ValueError(f"n_batches must be a positive integer, got {n_batches}.")

    acc = None
    for i, batch in enumerate(dl):
        if acc is None:
            acc = [[t] for t in batch]
        else:
            for j, t in enumerate(batch):
                acc[j].append(t)
        # Stop right after the last needed batch so the loader is not asked
        # to produce an extra batch that would be thrown away.
        if i + 1 >= n_batches:
            break

    if acc is None:
        raise ValueError("Cannot compute statistics: the DataLoader yielded no batches.")

    stats = []
    for tensors in acc:
        t = torch.cat(tensors).flatten(0, -2)  # [total_samples, features]
        nan_mask = torch.isnan(t)
        mean = torch.nanmean(t, 0)
        std = torch.nanmean((t - mean).pow(2), 0).sqrt()
        # masked_fill keeps the device and dtype of ``t``; NaNs are replaced by
        # the identity element of min/max so they never win the reduction.
        t_min = t.masked_fill(nan_mask, float("inf")).min(0).values
        t_max = t.masked_fill(nan_mask, float("-inf")).max(0).values
        stats.append(
            NormPair(
                mean=mean.cpu().numpy().astype(np.float32),
                std=std.cpu().numpy().astype(np.float32),
                min=t_min.cpu().numpy().astype(np.float32),
                max=t_max.cpu().numpy().astype(np.float32),
            )
        )
    return tuple(stats)