Data Processing

Basic data processing functions are provided in the dataprocessing module. These functions are used to preprocess data before training a model, or to post-process the output of a model.

Generic Functions

`plantseg.functionals.dataprocessing.dataprocessing.normalize_01(data: np.ndarray, eps=1e-12) -> np.ndarray`

Normalizes a numpy array to the range [0, 1] and converts it to float32.

Parameters:

data (ndarray) –

Input numpy array
eps (float, default: 1e-12 ) –

A small value added to the denominator for numerical stability

Returns:

normalized_data ( ndarray ) –

Normalized numpy array

Source code in plantseg/functionals/dataprocessing/dataprocessing.py

def normalize_01(data: np.ndarray, eps=1e-12) -> np.ndarray:
    """
    Normalize a numpy array between 0 and 1 and convert it to float32.

    The array is shifted by its minimum and divided by its value range
    (``max - min + eps``).

    Args:
        data (np.ndarray): Input numpy array
        eps (float): A small value added to the denominator for numerical stability
            (keeps the division defined when all values of `data` are equal)

    Returns:
        normalized_data (np.ndarray): Normalized numpy array of dtype float32
    """
    data_min = np.min(data)
    value_range = np.max(data) - data_min + eps
    # Cast the quotient, not just the denominator: casting only the denominator
    # leaves float64/int64 inputs as float64 output.
    return ((data - data_min) / value_range).astype("float32")

`plantseg.functionals.dataprocessing.dataprocessing.scale_image_to_voxelsize(image: np.ndarray, input_voxel_size: tuple[float, float, float], output_voxel_size: tuple[float, float, float], order: int = 0) -> np.ndarray`

Scale an image from a given voxel size to another voxel size.

Parameters:

image (ndarray) –

Input image to scale
input_voxel_size (tuple[float, float, float]) –

Input voxel size
output_voxel_size (tuple[float, float, float]) –

Output voxel size
order (int, default: 0 ) –

Interpolation order, must be 0 for segmentation and 1, 2 for images

Returns:

scaled_image ( ndarray ) –

Scaled image as numpy array

Source code in plantseg/functionals/dataprocessing/dataprocessing.py

def scale_image_to_voxelsize(
    image: np.ndarray,
    input_voxel_size: tuple[float, float, float],
    output_voxel_size: tuple[float, float, float],
    order: int = 0,
) -> np.ndarray:
    """
    Resample an image so that its voxel size changes from one value to another.

    The scaling factor is obtained from `compute_scaling_factor` using the two
    voxel sizes and is then applied to the image with `image_rescale`.

    Args:
        image (np.ndarray): Input image to scale
        input_voxel_size (tuple[float, float, float]): Voxel size of the input image
        output_voxel_size (tuple[float, float, float]): Voxel size the output should have
        order (int): Interpolation order, must be 0 for segmentation and 1, 2 for images

    Returns:
        scaled_image (np.ndarray): Scaled image as numpy array
    """
    return image_rescale(
        image,
        compute_scaling_factor(input_voxel_size, output_voxel_size),
        order=order,
    )

`plantseg.functionals.dataprocessing.dataprocessing.image_rescale(image: np.ndarray, factor: tuple[float, float, float], order: int) -> np.ndarray`

Scale an image by a given factor in each dimension

Parameters:

image (ndarray) –

Input image to scale
factor (tuple[float, float, float]) –

Scaling factor in each dimension
order (int) –

Interpolation order, must be 0 for segmentation and 1, 2 for images

Returns:

scaled_image ( ndarray ) –

Scaled image as numpy array

Source code in plantseg/functionals/dataprocessing/dataprocessing.py

def image_rescale(image: np.ndarray, factor: tuple[float, float, float], order: int) -> np.ndarray:
    """
    Rescale an image by a per-dimension factor.

    When the factor is exactly (1.0, 1.0, 1.0) the input array itself is returned
    (no copy and no interpolation); otherwise the image is resampled with `zoom`.

    Args:
        image (np.ndarray): Input image to scale
        factor (tuple[float, float, float]): Scaling factor in each dimension
        order (int): Interpolation order, must be 0 for segmentation and 1, 2 for images

    Returns:
        scaled_image (np.ndarray): Scaled image as numpy array
    """
    identity_factor = [1.0, 1.0, 1.0]
    if not np.array_equal(factor, identity_factor):
        return zoom(image, zoom=factor, order=order)
    # Nothing to do: hand back the very same array object.
    return image

`plantseg.functionals.dataprocessing.dataprocessing.image_median(image: np.ndarray, radius: int) -> np.ndarray`

Apply median smoothing on an image with a given radius.

Parameters:

image (ndarray) –

Input image to apply median smoothing.
radius (int) –

Radius of the median filter.

Returns:

ndarray –

np.ndarray: Median smoothed image.

Source code in plantseg/functionals/dataprocessing/dataprocessing.py

def image_median(image: np.ndarray, radius: int) -> np.ndarray:
    """
    Smooth an image with a median filter of the given radius.

    2D images are filtered with a `disk(radius)` footprint. 3D images are filtered
    with a `ball(radius)` footprint, except when the first axis has length 1: then
    the single plane is filtered with a disk and reshaped back to the input shape.

    Args:
        image (np.ndarray): Input image to apply median smoothing.
        radius (int): Radius of the median filter.

    Returns:
        np.ndarray: Median smoothed image.

    Raises:
        ValueError: If `radius` is not positive, or the image is neither 2D nor 3D.
    """
    if radius <= 0:
        raise ValueError("Radius must be a positive integer.")

    ndim = image.ndim
    if ndim not in (2, 3):
        raise ValueError("Unsupported image dimensionality. Image must be either 2D or 3D.")

    if ndim == 2:
        # Plain 2D image
        return median(image, disk(radius))

    if image.shape[0] != 1:
        # Genuine 3D volume
        return median(image, ball(radius))

    # Single-slice volume: filter the only plane in 2D, then restore the leading axis.
    smoothed_plane = median(image[0], disk(radius))
    return smoothed_plane.reshape(image.shape)

`plantseg.functionals.dataprocessing.dataprocessing.image_gaussian_smoothing(image: np.ndarray, sigma: float) -> np.ndarray`

Apply gaussian smoothing on an image with a given sigma.

Parameters:

image (ndarray) –

Input image to apply gaussian smoothing
sigma (float) –

Sigma value for gaussian smoothing

Returns:

smoothed_image ( ndarray ) –

Gaussian smoothed image as numpy array

Source code in plantseg/functionals/dataprocessing/dataprocessing.py

def image_gaussian_smoothing(image: np.ndarray, sigma: float) -> np.ndarray:
    """
    Smooth an image with a gaussian filter.

    The image is converted to float32 first. The sigma used on each axis is the
    smaller of `sigma` and ``(axis_length - 1) / 3``.

    Args:
        image (np.ndarray): Input image to apply gaussian smoothing
        sigma (float): Sigma value for gaussian smoothing

    Returns:
        smoothed_image (np.ndarray): Gaussian smoothed image as numpy array
    """
    image = image.astype("float32")
    # Per-axis upper bound for sigma; presumably keeps the kernel within the
    # image extent — TODO confirm against the gaussianSmoothing backend.
    per_axis_limit = (np.array(image.shape) - 1) / 3
    # `sigma` broadcasts against the per-axis limits.
    effective_sigma = np.minimum(per_axis_limit, sigma)
    return gaussianSmoothing(image, effective_sigma)

`plantseg.functionals.dataprocessing.dataprocessing.image_crop(image: np.ndarray, crop_str: str) -> np.ndarray`

Crop an image from a crop string like [:, 10:30:, 10:20]

Parameters:

image (ndarray) –

Input image to crop
crop_str (str) –

Crop string

Returns:

cropped_image ( ndarray ) –

Cropped image as numpy array

Source code in plantseg/functionals/dataprocessing/dataprocessing.py

def image_crop(image: np.ndarray, crop_str: str) -> np.ndarray:
    """
    Crop an image from a crop string like [:, 10:30:, 10:20]

    The string is split on commas into one entry per axis. An entry containing
    ``:`` is parsed as ``start:stop[:step]`` (empty components mean ``None``);
    any other entry is parsed as a single integer index, which drops that axis.
    Square brackets and whitespace around entries or slice components are ignored.

    Args:
        image (np.ndarray): Input image to crop
        crop_str (str): Crop string

    Returns:
        cropped_image (np.ndarray): Cropped image as numpy array

    Raises:
        ValueError: If an entry or slice component is not an integer.
    """

    def _parse_axis(spec: str):
        """Turn one comma-separated entry into a slice or an integer index."""
        spec = spec.strip()
        if ":" not in spec:
            return int(spec)
        # Strip each component so that e.g. "0: :2" is read like "0::2".
        bounds = (token.strip() for token in spec.split(":"))
        return slice(*(int(token) if token else None for token in bounds))

    axis_specs = crop_str.replace("[", "").replace("]", "").split(",")
    return image[tuple(_parse_axis(spec) for spec in axis_specs)]

`plantseg.functionals.dataprocessing.dataprocessing.process_images(image1: np.ndarray, image2: np.ndarray, operation: ImagePairOperation, normalize_input: bool = False, clip_output: bool = False, normalize_output: bool = True) -> np.ndarray`

General function for performing image operations with optional preprocessing and post-processing.

Parameters:

image1 (ndarray) –

First input image.
image2 (ndarray) –

Second input image.
operation (str) –

Operation to perform ('add', 'multiply', 'subtract', 'divide', 'max').
normalize_input (bool, default: False ) –

Whether to normalize the input images to the range [0, 1]. Default is False.
clip_output (bool, default: False ) –

Whether to clip the resulting image values to the range [0, 1]. Default is False.
normalize_output (bool, default: True ) –

Whether to normalize the output image to the range [0, 1]. Default is True.

Returns:

ndarray –

np.ndarray: The resulting image after performing the operation.

Source code in plantseg/functionals/dataprocessing/dataprocessing.py

def process_images(
    image1: np.ndarray,
    image2: np.ndarray,
    operation: ImagePairOperation,
    normalize_input: bool = False,
    clip_output: bool = False,
    normalize_output: bool = True,
) -> np.ndarray:
    """
    General function for performing image operations with optional preprocessing and post-processing.

    Args:
        image1 (np.ndarray): First input image.
        image2 (np.ndarray): Second input image.
        operation (str): Operation to perform ('add', 'multiply', 'subtract', 'divide', 'max').
        normalize_input (bool): Whether to normalize the input images to the range [0, 1]. Default is False.
        clip_output (bool): Whether to clip the resulting image values to the range [0, 1]. Default is False.
        normalize_output (bool): Whether to normalize the output image to the range [0, 1]. Default is True.

    Returns:
        np.ndarray: The resulting image after performing the operation.
    """
    # Preprocessing: Normalize input images if specified
    if normalize_input:
        image1, image2 = normalize_01(image1), normalize_01(image2)

    # Perform the specified operation
    if operation == "add":
        result = image1 + image2
    elif operation == "multiply":
        result = image1 * image2
    elif operation == "subtract":
        result = image1 - image2
    elif operation == "divide":
        result = image1 / image2
    elif operation == "max":
        result = np.maximum(image1, image2)
    else:
        raise ValueError(f"Unsupported operation: {operation}")

    # Post-processing: Clip and/or normalize output if specified
    if clip_output:
        result = np.clip(result, 0, 1)
    if normalize_output:
        result = normalize_01(result)

    return result

Segmentation Functions

`plantseg.functionals.dataprocessing.labelprocessing.relabel_segmentation(segmentation_image: np.ndarray, background: int | None = None) -> np.ndarray`

Relabels a segmentation image contiguously; non-touching instances that share the same id are given different labels. Note that measure.label is different from ndimage.label.

    1-connectivity     2-connectivity     diagonal connection close-up

         [ ]           [ ]  [ ]  [ ]             [ ]
          |               \  |  /                 |  <- hop 2
    [ ]--[x]--[ ]      [ ]--[x]--[ ]        [x]--[ ]
          |               /  |  \             hop 1
         [ ]           [ ]  [ ]  [ ]

Parameters:

segmentation_image (ndarray) –

A 2D or 3D segmentation image where connected components represent different instances.
background (int | None, default: None ) –

Label of the background. If None, the function will assume the background label is 0. Default is None.

Returns:

ndarray –

np.ndarray: A relabeled segmentation image where each connected component is assigned a unique integer label.

Source code in plantseg/functionals/dataprocessing/labelprocessing.py

def relabel_segmentation(segmentation_image: np.ndarray, background: int | None = None) -> np.ndarray:
    r"""
    Assign contiguous labels to the connected components of a segmentation image.

    Non-touching instances that share the same id receive different labels.
    Note that measure.label is different from ndimage.label.

    1-connectivity     2-connectivity     diagonal connection close-up

         [ ]           [ ]  [ ]  [ ]             [ ]
          |               \  |  /                 |  <- hop 2
    [ ]--[x]--[ ]      [ ]--[x]--[ ]        [x]--[ ]
          |               /  |  \             hop 1
         [ ]           [ ]  [ ]  [ ]

    Args:
        segmentation_image (np.ndarray): A 2D or 3D segmentation image where connected components represent
                                         different instances.
        background (int | None, optional): Label of the background. If None, the function will assume the background
                                           label is 0. Default is None.

    Returns:
        np.ndarray: A relabeled segmentation image where each connected component is assigned a unique integer label.
    """
    # `connectivity=None` leaves the neighbourhood choice to measure.label's default.
    labelling_options = {
        "background": background,
        "return_num": False,
        "connectivity": None,
    }
    labeled = measure.label(segmentation_image, **labelling_options)
    # Narrow the type: with return_num=False only the label array is expected back.
    assert isinstance(labeled, np.ndarray)
    return labeled

`plantseg.functionals.dataprocessing.labelprocessing.set_background_to_value(segmentation_image: np.ndarray, value: int = 0) -> np.ndarray`

Sets all occurrences of the background (label 0) in the segmentation image to a new value.

Parameters:

segmentation_image (ndarray) –

A 2D or 3D numpy array representing an instance segmentation.
value (int, default: 0 ) –

The value to assign to the background. Default is 0.

Returns:

ndarray –

np.ndarray: A segmentation image where all background pixels (originally 0) are set to value.

Source code in plantseg/functionals/dataprocessing/labelprocessing.py

def set_background_to_value(segmentation_image: np.ndarray, value: int = 0) -> np.ndarray:
    """
    Replace the background label (0) of a segmentation image with another value.

    A new array is returned; the input array is not modified.

    Args:
        segmentation_image (np.ndarray): A 2D or 3D numpy array representing an instance segmentation.
        value (int, optional): The value to assign to the background. Default is 0.

    Returns:
        np.ndarray: A segmentation image where all background pixels (originally 0) are set to `value`.
    """
    is_background = segmentation_image == 0
    return np.where(is_background, value, segmentation_image)

Advanced Functions

`plantseg.functionals.dataprocessing.advanced_dataprocessing.fix_over_under_segmentation_from_nuclei(cell_seg: np.ndarray, nuclei_seg: np.ndarray, threshold_merge: float, threshold_split: float, quantile_min: float, quantile_max: float, boundary: np.ndarray | None = None) -> np.ndarray`

Correct over-segmentation and under-segmentation of cells based on nuclei information.

This function uses information from nuclei segmentation to refine cell segmentation by first identifying over-segmented cells (cells mistakenly split into multiple segments) and merging them. It then corrects under-segmented cells (multiple nuclei within a single cell) by splitting them based on nuclei position and optional boundary information.

Parameters:

cell_seg (ndarray) –

A 2D or 3D array of segmented cells, where each integer represents a unique cell.
nuclei_seg (ndarray) –

A 2D or 3D array of segmented nuclei, matching the shape of cell_seg. Used to guide merging and splitting.
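threshold_merge (float) –

A value between 0 and 1. Cells with less than this fraction of nuclei overlap are considered over-segmented and will be merged. This argument is required (the signature defines no default); 0.33 is the documented reference value.
threshold_split (float) –

A value between 0 and 1. Cells with more than this fraction of nuclei overlap are considered under-segmented and will be split. This argument is required (the signature defines no default); 0.66 is the documented reference value.
quantile_min (float) –

The lower size limit for nuclei, as a fraction (0-1). Nuclei smaller than this quantile are ignored. This argument is required (the signature defines no default); 0.3 is the documented reference value.
quantile_max (float) –

The upper size limit for nuclei, as a fraction (0-1). Nuclei larger than this quantile are ignored. This argument is required (the signature defines no default); 0.99 is the documented reference value.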
boundary (ndarray | None, default: None ) –

Optional boundary map of the same shape as cell_seg. High values indicate cell boundaries and help refine splitting. If None, all regions are treated equally.

Returns:

ndarray –

np.ndarray: Corrected cell segmentation array.

Source code in plantseg/functionals/dataprocessing/advanced_dataprocessing.py

def fix_over_under_segmentation_from_nuclei(
    cell_seg: np.ndarray,
    nuclei_seg: np.ndarray,
    threshold_merge: float,
    threshold_split: float,
    quantile_min: float,
    quantile_max: float,
    boundary: np.ndarray | None = None,
) -> np.ndarray:
    """
    Correct over-segmentation and under-segmentation of cells based on nuclei information.

    The correction runs in two passes. First, over-segmented cells (cells mistakenly split into
    multiple segments) are identified from the cell/nuclei overlaps and merged. Then the overlaps
    are recomputed on the merged segmentation, and under-segmented cells (multiple nuclei within a
    single cell) are split based on nuclei position and optional boundary information.

    Args:
        cell_seg (np.ndarray): A 2D or 3D array of segmented cells, where each integer represents a unique cell.
        nuclei_seg (np.ndarray): A 2D or 3D array of segmented nuclei, matching the shape of `cell_seg`.
            Used to guide merging and splitting.
        threshold_merge (float): A value between 0 and 1. Cells with less than this fraction of nuclei overlap
            are considered over-segmented and will be merged. Required; 0.33 is the documented reference value.
        threshold_split (float): A value between 0 and 1. Cells with more than this fraction of nuclei overlap
            are considered under-segmented and will be split. Required; 0.66 is the documented reference value.
        quantile_min (float): The lower size limit for nuclei, as a fraction (0-1). Nuclei smaller than this
            quantile are ignored. Required; 0.3 is the documented reference value.
        quantile_max (float): The upper size limit for nuclei, as a fraction (0-1). Nuclei larger than this
            quantile are ignored. Required; 0.99 is the documented reference value.
        boundary (np.ndarray | None, optional): Optional boundary map of the same shape as `cell_seg`. High values
            indicate cell boundaries and help refine splitting. If None, an all-ones map is used, so all
            regions are treated equally.

    Returns:
        np.ndarray: Corrected cell segmentation array.
    """
    # Pass 1: merge over-segmented cells (the per-cell counts are not needed here).
    _, per_nucleus_counts, pair_counts = numba_find_overlaps(cell_seg, nuclei_seg)
    merge_assignments = find_potential_over_seg(per_nucleus_counts, pair_counts, threshold=threshold_merge)
    merged_seg = fix_over_segmentation(cell_seg, merge_assignments)

    # Pass 2: overlaps must be recomputed because merging changed the cell labels.
    per_cell_counts, per_nucleus_counts, pair_counts = numba_find_overlaps(merged_seg, nuclei_seg)
    split_assignments = find_potential_under_seg(
        per_nucleus_counts,
        per_cell_counts,
        pair_counts,
        threshold=threshold_split,
        quantiles_clip=(quantile_min, quantile_max),
    )

    # Without a boundary map, fall back to a uniform one.
    if boundary is None:
        boundary = np.ones_like(cell_seg)
    return fix_under_segmentation(merged_seg, nuclei_seg, boundary, split_assignments, cell_idx=None)

`plantseg.functionals.dataprocessing.advanced_dataprocessing.remove_false_positives_by_foreground_probability(segmentation: np.ndarray, foreground: np.ndarray, threshold: float) -> np.ndarray`

Removes false positive regions in a segmentation based on a foreground probability map.

Labels are not preserved.
If the mean(an instance * its own probability region) < threshold, it is removed.

Parameters:

segmentation (ndarray) –

Segmentation array where each unique non-zero value indicates a distinct region.
foreground (ndarray) –

Foreground probability map of the same shape as segmentation.
threshold (float) –

Probability threshold below which regions are considered false positives.

Returns:

ndarray –

np.ndarray: Segmentation array with false positives removed.

Source code in plantseg/functionals/dataprocessing/advanced_dataprocessing.py

def remove_false_positives_by_foreground_probability(
    segmentation: np.ndarray,
    foreground: np.ndarray,
    threshold: float,
) -> np.ndarray:
    """
    Removes false positive regions in a segmentation based on a foreground probability map.

    1. Labels are not preserved: the segmentation is relabeled sequentially before and after removal.
    2. For every instance, the foreground probability is summed over the instance's pixels and divided
       by the instance's area. If this mean probability is < threshold, the instance is set to 0.

    Args:
        segmentation (np.ndarray): Segmentation array where each unique non-zero value indicates a distinct region.
        foreground (np.ndarray): Foreground probability map of the same shape as `segmentation`.
        threshold (float): Probability threshold below which regions are considered false positives.

    Returns:
        np.ndarray: Segmentation array with false positives removed.

    Raises:
        ValueError: If the shapes differ, or if `foreground` contains values outside [0, 1].
    """
    # TODO: make a channel for removed regions for easier inspection
    # TODO: use `relabel_sequential` to recover the original labels

    if segmentation.shape != foreground.shape:
        raise ValueError("Segmentation and probability map must have the same shape.")
    # Check both ends of the range: negative values are no more valid than values above 1.
    if foreground.min() < 0 or foreground.max() > 1:
        raise ValueError("Foreground must be a probability map with values in [0, 1].")

    instances, _, _ = relabel_sequential(segmentation)  # The label 0 is assumed to denote the bg and is never remapped.
    regions = regionprops(instances)  # Labels with value 0 are ignored.

    # Index i holds the statistics of label i; index 0 is the background.
    pixel_count = np.zeros(len(regions) + 1)
    pixel_value = np.zeros(len(regions) + 1)
    pixel_count[0] = 1  # Avoid division by zero: pixel_count[0] and pixel_value[0] are fixed throughout.

    ndim = instances.ndim
    for region in tqdm.tqdm(regions):
        # bbox is (min_0, ..., min_{ndim-1}, max_0, ..., max_{ndim-1}); work on the crop only.
        bbox = region.bbox
        slices = tuple(slice(bbox[axis], bbox[axis + ndim]) for axis in range(ndim))

        # The bounding box may contain other labels, so mask to this instance.
        region_mask = instances[slices] == region.label
        prob = foreground[slices]

        pixel_count[region.label] = region.area
        pixel_value[region.label] = (region_mask * prob).sum()

    likelihood = pixel_value / pixel_count
    to_remove = likelihood < threshold

    instances[np.isin(instances, np.nonzero(to_remove)[0])] = 0
    instances, _, _ = relabel_sequential(instances)
    return instances