Source code for src.cytodataframe.frame

"""
Defines a CytoDataFrame class.
"""

import base64
import pathlib
import re
from collections.abc import Callable
from io import BytesIO, StringIO
from typing import (
    Any,
    ClassVar,
    Dict,
    List,
    Optional,
    Tuple,
    TypeVar,
    Union,
)

import numpy as np
import pandas as pd
import skimage
import skimage.io
import skimage.measure
from IPython import get_ipython
from pandas._config import (
    get_option,
)
from pandas.io.formats import (
    format as fmt,
)
from skimage.util import img_as_ubyte

from .image import (
    adjust_with_adaptive_histogram_equalization,
    draw_outline_on_image_from_mask,
    draw_outline_on_image_from_outline,
)

# Provide backwards compatibility for the Self type in earlier Python versions:
# methods annotate ``self`` with this TypeVar, which is bound to CytoDataFrame.
# see: https://peps.python.org/pep-0484/#annotating-instance-and-class-methods
CytoDataFrame_type = TypeVar("CytoDataFrame_type", bound="CytoDataFrame")


class CytoDataFrame(pd.DataFrame):
    """
    A class designed to enhance single-cell data handling by wrapping
    pandas DataFrame capabilities, providing advanced methods for quality control,
    comprehensive analysis, and image-based data processing.

    This class can initialize with either a pandas DataFrame or a file path (CSV, TSV,
    TXT, or Parquet). When initialized with a file path, it reads the data into a
    pandas DataFrame. It also includes capabilities to export data.

    Attributes:
        _metadata (ClassVar[list[str]]):
            A class-level attribute that includes custom attributes.
        _custom_attrs (dict):
            A dictionary to store custom attributes, such as data source,
            context directory, and bounding box information.
    """

    _metadata: ClassVar = ["_custom_attrs"]

    def __init__(  # noqa: PLR0913
        self: CytoDataFrame_type,
        data: Union[CytoDataFrame_type, pd.DataFrame, str, pathlib.Path],
        data_context_dir: Optional[str] = None,
        data_image_paths: Optional[pd.DataFrame] = None,
        data_bounding_box: Optional[pd.DataFrame] = None,
        data_mask_context_dir: Optional[str] = None,
        data_outline_context_dir: Optional[str] = None,
        segmentation_file_regex: Optional[Dict[str, str]] = None,
        image_adjustment: Optional[Callable] = None,
        **kwargs: Dict[str, Any],
    ) -> None:
        """
        Initializes the CytoDataFrame with either a DataFrame or a file path.

        Args:
            data (Union[CytoDataFrame_type, pd.DataFrame, str, pathlib.Path]):
                The data source, either a pandas DataFrame or a file path.
            data_context_dir (Optional[str]):
                Directory context for the image data within the DataFrame.
                When data is a file path and this is None, the parent
                directory of that file is used.
            data_image_paths (Optional[pd.DataFrame]):
                Image path data for the image files. When None, it is
                gathered from the data itself.
            data_bounding_box (Optional[pd.DataFrame]):
                Bounding box data for the DataFrame images. When None, it is
                gathered from the data itself.
            data_mask_context_dir (Optional[str]):
                Directory context for the mask data for images.
            data_outline_context_dir (Optional[str]):
                Directory context for the outline data for images.
            segmentation_file_regex (Optional[Dict[str, str]]):
                A dictionary which includes regex strings for mapping
                segmentation images (masks or outlines) to unsegmented images.
            image_adjustment (Optional[Callable]):
                A callable function which will be used to make image
                adjustments when they are processed by CytoDataFrame. The
                function should include a single parameter which takes as
                input a np.ndarray and return the same after adjustments.
                Defaults to None, which will incur an adaptive histogram
                equalization on images.
                Reference histogram equalization for more information:
                https://scikit-image.org/docs/stable/auto_examples/color_exposure/
            **kwargs:
                Additional keyword arguments to pass to the pandas read
                functions.

        Raises:
            ValueError: If data is a file path with an unsupported extension.
        """

        # Set before calling the pandas initializer so the attribute exists
        # for anything which reads it during construction.
        self._custom_attrs = {
            "data_source": None,
            "data_context_dir": data_context_dir,
            "data_image_paths": None,
            "data_bounding_box": None,
            "data_mask_context_dir": data_mask_context_dir,
            "data_outline_context_dir": data_outline_context_dir,
            "segmentation_file_regex": segmentation_file_regex,
            "image_adjustment": image_adjustment,
        }

        if isinstance(data, CytoDataFrame):
            # NOTE: values carried by the source CytoDataFrame take precedence
            # over the directory arguments passed to this initializer.
            for attr_name in (
                "data_source",
                "data_context_dir",
                "data_mask_context_dir",
                "data_outline_context_dir",
            ):
                self._custom_attrs[attr_name] = data._custom_attrs[attr_name]
            super().__init__(data)
        elif isinstance(data, (pd.DataFrame, pd.Series)):
            self._custom_attrs["data_source"] = (
                "pandas.DataFrame"
                if isinstance(data, pd.DataFrame)
                else "pandas.Series"
            )
            super().__init__(data)
        elif isinstance(data, (str, pathlib.Path)):
            data_path = pathlib.Path(data)
            self._custom_attrs["data_source"] = str(data_path)

            # default the image search context to the directory of the file
            if data_context_dir is None:
                self._custom_attrs["data_context_dir"] = str(data_path.parent)

            if data_path.suffix in {".csv", ".tsv", ".txt"} or data_path.suffixes == [
                ".csv",
                ".gz",
            ]:
                read_kwargs = dict(kwargs)
                # `export` writes .tsv files tab-separated, so read them the
                # same way unless the caller chose a delimiter explicitly.
                if data_path.suffix == ".tsv" and not (
                    {"sep", "delimiter", "delim_whitespace"} & read_kwargs.keys()
                ):
                    read_kwargs["sep"] = "\t"
                data = pd.read_csv(data_path, **read_kwargs)
            elif data_path.suffix == ".parquet":
                data = pd.read_parquet(data_path, **kwargs)
            else:
                raise ValueError("Unsupported file format for CytoDataFrame.")

            super().__init__(data)
        else:
            super().__init__(data)

        self._custom_attrs["data_bounding_box"] = (
            self.get_bounding_box_from_data()
            if data_bounding_box is None
            else data_bounding_box
        )

        self._custom_attrs["data_image_paths"] = (
            self.get_image_paths_from_data(image_cols=self.find_image_columns())
            if data_image_paths is None
            else data_image_paths
        )

    def __getitem__(self: CytoDataFrame_type, key: Union[int, str]) -> Any:  # noqa: ANN401
        """
        Returns an element or a slice of the underlying pandas DataFrame.

        Args:
            key:
                The key or slice to access the data.

        Returns:
            pd.DataFrame or any:
                The selected element or slice of data. DataFrame results are
                returned as a CytoDataFrame which carries this instance's
                custom attributes; any other result is returned unchanged.
        """

        # perform the lookup once and reuse the result
        result = super().__getitem__(key)

        if isinstance(result, pd.DataFrame):
            return self._with_context(result)

        return result

    def _with_context(self: CytoDataFrame_type, frame: pd.DataFrame) -> "CytoDataFrame":
        """
        Builds a CytoDataFrame from a pandas DataFrame using the custom
        attributes of this instance.

        Args:
            frame (pd.DataFrame):
                The data to wrap.

        Returns:
            CytoDataFrame:
                A new CytoDataFrame with this instance's context directories,
                image paths, bounding box data, segmentation regex and image
                adjustment callable.
        """

        return CytoDataFrame(
            frame,
            data_context_dir=self._custom_attrs["data_context_dir"],
            data_image_paths=self._custom_attrs["data_image_paths"],
            data_bounding_box=self._custom_attrs["data_bounding_box"],
            data_mask_context_dir=self._custom_attrs["data_mask_context_dir"],
            data_outline_context_dir=self._custom_attrs["data_outline_context_dir"],
            segmentation_file_regex=self._custom_attrs["segmentation_file_regex"],
            image_adjustment=self._custom_attrs["image_adjustment"],
        )

    def _wrap_method(
        self: CytoDataFrame_type,
        method: Callable,
        *args: List[Any],
        **kwargs: Dict[str, Any],
    ) -> Any:  # noqa: ANN401
        """
        Wraps a given method to ensure that the returned result is a
        CytoDataFrame if applicable.

        Args:
            method (Callable):
                The method to be called and wrapped.
            *args (List[Any]):
                Positional arguments to be passed to the method.
            **kwargs (Dict[str, Any]):
                Keyword arguments to be passed to the method.

        Returns:
            Any:
                The result of the method call. If the result is a pandas
                DataFrame, it is wrapped in a CytoDataFrame instance with
                additional context information (for example the data context
                directory and data bounding box).
        """

        result = method(*args, **kwargs)

        if isinstance(result, pd.DataFrame):
            result = self._with_context(result)

        return result

    def sort_values(
        self: CytoDataFrame_type, *args: List[Any], **kwargs: Dict[str, Any]
    ) -> CytoDataFrame_type:
        """
        Sorts the DataFrame by the specified column(s) and returns a
        new CytoDataFrame instance.

        Note: we wrap this method within CytoDataFrame to help ensure the
        consistent return of CytoDataFrames in the context of pd.Series (which
        are treated separately but have specialized processing within the
        context of sort_values).

        Args:
            *args (List[Any]):
                Positional arguments to be passed to the pandas
                DataFrame's `sort_values` method.
            **kwargs (Dict[str, Any]):
                Keyword arguments to be passed to the pandas
                DataFrame's `sort_values` method.

        Returns:
            CytoDataFrame_type:
                A new instance of CytoDataFrame sorted by the specified
                column(s).
        """

        return self._wrap_method(super().sort_values, *args, **kwargs)

    def get_bounding_box_from_data(
        self: CytoDataFrame_type,
    ) -> Optional[pd.DataFrame]:
        """
        Retrieves bounding box data from the DataFrame based on predefined
        column groups.

        This method identifies specific groups of columns representing bounding
        box coordinates for different cellular components (cytoplasm, nuclei,
        cells) and checks for their presence in the DataFrame. The first group
        whose columns are all present is selected.

        Returns:
            Optional[pd.DataFrame]:
                The data filtered to only the bounding box columns of the
                selected group. Returns None if no group is fully present.
        """

        # Column groups are checked in this order: cytoplasm, nuclei, cells.
        column_groups = {
            "cyto": [
                "Cytoplasm_AreaShape_BoundingBoxMaximum_X",
                "Cytoplasm_AreaShape_BoundingBoxMaximum_Y",
                "Cytoplasm_AreaShape_BoundingBoxMinimum_X",
                "Cytoplasm_AreaShape_BoundingBoxMinimum_Y",
            ],
            "nuclei": [
                "Nuclei_AreaShape_BoundingBoxMaximum_X",
                "Nuclei_AreaShape_BoundingBoxMaximum_Y",
                "Nuclei_AreaShape_BoundingBoxMinimum_X",
                "Nuclei_AreaShape_BoundingBoxMinimum_Y",
            ],
            "cells": [
                "Cells_AreaShape_BoundingBoxMaximum_X",
                "Cells_AreaShape_BoundingBoxMaximum_Y",
                "Cells_AreaShape_BoundingBoxMinimum_X",
                "Cells_AreaShape_BoundingBoxMinimum_Y",
            ],
        }

        # gather the available columns once for all membership checks
        available_columns = set(self.columns.tolist())

        for group_columns in column_groups.values():
            if all(col in available_columns for col in group_columns):
                return self.filter(items=group_columns)

        return None

    def export(
        self: CytoDataFrame_type, file_path: str, **kwargs: Dict[str, Any]
    ) -> None:
        """
        Exports the underlying pandas DataFrame to a file.

        Args:
            file_path (str):
                The path where the DataFrame should be saved.
            **kwargs:
                Additional keyword arguments to pass to the pandas to_* methods.

        Raises:
            ValueError: If the file extension is not supported.
        """

        data_path = pathlib.Path(file_path)

        # export to csv
        if ".csv" in data_path.suffixes:
            self.to_csv(file_path, **kwargs)
        # export to tsv
        elif any(elem in data_path.suffixes for elem in (".tsv", ".txt")):
            self.to_csv(file_path, sep="\t", **kwargs)
        # export to parquet
        elif data_path.suffix == ".parquet":
            self.to_parquet(file_path, **kwargs)
        else:
            raise ValueError("Unsupported file format for export.")

    @staticmethod
    def is_notebook_or_lab() -> bool:
        """
        Determines if the code is being executed in a Jupyter notebook (.ipynb)
        returning false if it is not.

        This method attempts to detect the interactive shell environment
        using IPython's `get_ipython` function. It checks the class name of the
        current IPython shell to distinguish between different execution
        environments.

        Returns:
            bool:
                - `True` if the shell class name contains
                  "ZMQInteractiveShell".
                - `False` otherwise (e.g., standard Python shell, terminal
                  IPython shell, or scripts).
        """

        try:
            # check for type of session via ipython
            shell = get_ipython().__class__.__name__
        except NameError:
            return False

        return "ZMQInteractiveShell" in shell

    def find_image_columns(self: CytoDataFrame_type) -> List[str]:
        """
        Find columns containing image file names.

        This method searches for columns in the DataFrame
        that contain image file names with extensions .tif
        or .tiff (case insensitive).

        Returns:
            List[str]:
                A list of column names that contain image file names.
        """

        # build a pattern to match image file names
        pattern = re.compile(r".*\.(tif|tiff)$", flags=re.IGNORECASE)

        # a column qualifies when at least one of its values is a string
        # which matches the pattern above.
        return [
            column
            for column in self.columns
            if self[column]
            .apply(
                lambda value: isinstance(value, str)
                and pattern.match(value) is not None
            )
            .any()
        ]

    def get_image_paths_from_data(
        self: CytoDataFrame_type, image_cols: List[str]
    ) -> Optional[pd.DataFrame]:
        """
        Gather data containing image path names
        (the directory storing the images but not the file
        names). We do this by seeking the pattern:
        Image_FileName_X --> Image_PathName_X.

        Args:
            image_cols (List[str]):
                A list of column names that contain
                image file names.

        Returns:
            Optional[pd.DataFrame]:
                The data filtered to the image path columns which exist for
                the given image file name columns, or None if there are none.
        """

        image_path_columns = [
            path_col
            for path_col in (
                col.replace("FileName", "PathName") for col in image_cols
            )
            if path_col in self.columns
        ]

        return self.filter(items=image_path_columns) if image_path_columns else None

    def find_image_path_columns(
        self: CytoDataFrame_type, image_cols: List[str], all_cols: List[str]
    ) -> Dict[str, str]:
        """
        Find columns containing image path names
        (the directory storing the images but not the file
        names). We do this by seeking the pattern:
        Image_FileName_X --> Image_PathName_X.

        Args:
            image_cols (List[str]):
                A list of column names that contain
                image file names.
            all_cols (List[str]):
                A list of all column names.

        Returns:
            Dict[str, str]:
                A mapping of image file name column to its image path name
                column. Only image columns whose path column is present in
                all_cols are included.
        """

        return {
            col: col.replace("FileName", "PathName")
            for col in image_cols
            if col.replace("FileName", "PathName") in all_cols
        }

    @staticmethod
    def _draw_segmentation(
        orig_image: np.ndarray, segmentation_path: pathlib.Path, mask: bool
    ) -> np.ndarray:
        """
        Applies a mask or outline file to an image.

        Args:
            orig_image (np.ndarray):
                The image which will have a mask or outline applied.
            segmentation_path (pathlib.Path):
                The mask or outline image file.
            mask (bool):
                True to treat the file as a mask, False as an outline.

        Returns:
            np.ndarray:
                The result of draw_outline_on_image_from_mask or
                draw_outline_on_image_from_outline.
        """

        if mask:
            return draw_outline_on_image_from_mask(
                orig_image=orig_image, mask_image_path=segmentation_path
            )

        return draw_outline_on_image_from_outline(
            orig_image=orig_image, outline_image_path=segmentation_path
        )

    def search_for_mask_or_outline(  # noqa: PLR0913, PLR0911
        self: CytoDataFrame_type,
        data_value: str,
        pattern_map: Optional[dict],
        file_dir: Optional[str],
        candidate_path: pathlib.Path,
        orig_image: np.ndarray,
        mask: bool = True,
    ) -> Optional[np.ndarray]:
        """
        Search for a mask or outline image file based on the
        provided patterns and apply it to the target image.

        Args:
            data_value (str):
                The value used to match patterns for locating
                mask or outline files.
            pattern_map (Optional[dict]):
                A dictionary of file patterns and their corresponding
                original patterns for matching. When None, files are matched
                by the stem of candidate_path instead.
            file_dir (Optional[str]):
                The directory where mask or outline files are stored
                (searched recursively).
            candidate_path (pathlib.Path):
                The path to the candidate image file to apply
                the mask or outline to.
            orig_image (np.ndarray):
                The image which will have a mask or outline applied.
            mask (bool, optional):
                Whether to search for a mask (True) or an outline
                (False). Default is True.

        Returns:
            Optional[np.ndarray]:
                The target image with the applied mask or outline,
                or None if no relevant file is found.
        """

        if file_dir is None:
            return None

        search_dir = pathlib.Path(file_dir)

        if pattern_map is None:
            # without a pattern map, use the first file whose name begins with
            # the stem of the candidate image
            stem_match = next(
                iter(search_dir.rglob(f"{pathlib.Path(candidate_path).stem}*")),
                None,
            )
            if stem_match is None:
                return None
            return self._draw_segmentation(orig_image, stem_match, mask)

        for file_pattern, original_pattern in pattern_map.items():
            # only patterns which apply to this data value are considered
            if not re.search(original_pattern, data_value):
                continue

            pattern_match = next(
                (
                    file
                    for file in search_dir.rglob("*")
                    if re.search(file_pattern, file.name)
                ),
                None,
            )
            if pattern_match is not None:
                return self._draw_segmentation(orig_image, pattern_match, mask)

        return None

    def process_image_data_as_html_display(
        self: CytoDataFrame_type,
        data_value: Any,  # noqa: ANN401
        bounding_box: Tuple[int, int, int, int],
        image_path: Optional[str] = None,
    ) -> str:
        """
        Process the image data based on the provided data value
        and bounding box, applying masks or outlines where
        applicable, and return an HTML representation of the
        cropped image for display.

        Args:
            data_value (Any):
                The value to search for in the file system or as the image
                data.
            bounding_box (Tuple[int, int, int, int]):
                The bounding box to crop the image, ordered as
                (x_min, y_min, x_max, y_max).
            image_path (Optional[str]):
                A directory which may contain the image file. Only used when
                no data context directory is set.

        Returns:
            str:
                The HTML image display string, or the unmodified data value if
                the image cannot be found or processed.

        Raises:
            ValueError: If the bounding box contains values which cannot be
                converted to integers.
            IndexError: If cropping with the bounding box raises an IndexError.
        """

        # Values which are not path-like (for example NaN within a partially
        # filled image column) or which are empty cannot reference an image.
        if not isinstance(data_value, (str, pathlib.Path)) or data_value == "":
            return data_value

        # Get the pattern map for segmentation file regex
        pattern_map = self._custom_attrs.get("segmentation_file_regex")
        context_dir = self._custom_attrs["data_context_dir"]

        # Step 1: Find the candidate file
        if pathlib.Path(data_value).is_file():
            # the data value already references an existing file
            candidate_path = pathlib.Path(data_value)
        elif (
            context_dir is None
            and image_path is not None
            and (
                existing_image_from_path := pathlib.Path(f"{image_path}/{data_value}")
            ).is_file()
        ):
            # we have a file from the path (dir) + filename
            candidate_path = existing_image_from_path
        elif context_dir is not None and (
            candidate_paths := list(pathlib.Path(context_dir).rglob(str(data_value)))
        ):
            # If a candidate file is found in the data context directory,
            # use the first one
            candidate_path = candidate_paths[0]
        else:
            # If no candidate file is found, return the original data value
            return data_value

        # read the image as an array
        orig_image_array = skimage.io.imread(candidate_path)

        # Adjust the image with image adjustment callable
        # or adaptive histogram equalization
        if self._custom_attrs["image_adjustment"] is not None:
            orig_image_array = self._custom_attrs["image_adjustment"](orig_image_array)
        else:
            orig_image_array = adjust_with_adaptive_histogram_equalization(
                orig_image_array
            )

        # Convert to unsigned 8-bit values for image saving
        orig_image_array = img_as_ubyte(orig_image_array)

        # Step 2: Search for a mask
        prepared_image = self.search_for_mask_or_outline(
            data_value=str(data_value),
            pattern_map=pattern_map,
            file_dir=self._custom_attrs["data_mask_context_dir"],
            candidate_path=candidate_path,
            orig_image=orig_image_array,
            mask=True,
        )

        # Step 3: Search for an outline if no mask was found
        if prepared_image is None:
            prepared_image = self.search_for_mask_or_outline(
                data_value=str(data_value),
                pattern_map=pattern_map,
                file_dir=self._custom_attrs["data_outline_context_dir"],
                candidate_path=candidate_path,
                orig_image=orig_image_array,
                mask=False,
            )

        # Step 4: If neither mask nor outline is found, use the original image
        if prepared_image is None:
            prepared_image = orig_image_array

        # Step 5: Crop the image based on the bounding box
        try:
            x_min, y_min, x_max, y_max = map(int, bounding_box)  # Ensure integers
            cropped_img_array = prepared_image[y_min:y_max, x_min:x_max]
        except ValueError as e:
            raise ValueError(
                f"Bounding box contains invalid values: {bounding_box}"
            ) from e
        except IndexError as e:
            raise IndexError(
                f"Bounding box {bounding_box} is out of bounds for image dimensions "
                f"{prepared_image.shape}"
            ) from e

        # Step 6: Encode the cropped image to PNG format
        try:
            # Save cropped image to buffer
            png_bytes_io = BytesIO()
            skimage.io.imsave(
                png_bytes_io, cropped_img_array, plugin="imageio", extension=".png"
            )
            png_bytes = png_bytes_io.getvalue()
        except (FileNotFoundError, ValueError) as exc:
            # Best effort: report the problem and show the original value
            print(exc)
            return data_value

        # Return HTML image display as a base64-encoded PNG
        return (
            '<img src="data:image/png;base64,'
            f'{base64.b64encode(png_bytes).decode("utf-8")}" style="width:300px;"/>'
        )

    def get_displayed_rows(self: CytoDataFrame_type) -> List[int]:
        """
        Gather the index values of the rows which are displayed under the
        current pandas display options.

        Reads `display.max_rows` and `display.min_rows`. When the data exceeds
        max_rows, the number of displayed rows is the smaller of min_rows and
        max_rows (or max_rows when min_rows is unset), split between the
        beginning and the end of the data.

        Returns:
            List[int]:
                The index values of the rows which are displayed. All index
                values are returned when max_rows is None or 0, or when the
                data has no more rows than max_rows.
        """

        # Get the current display settings
        max_rows = pd.get_option("display.max_rows")
        min_rows = pd.get_option("display.min_rows")

        # A max_rows of None (or 0) places no explicit limit here, and data
        # within the limit is displayed in full.
        if not max_rows or len(self) <= max_rows:
            return self.index.tolist()

        # min_rows may be None, in which case max_rows applies.
        shown_rows = min(min_rows, max_rows) if min_rows else max_rows
        half_shown_rows = shown_rows // 2

        if half_shown_rows < 1:
            # Avoid `index[-0:]`, which would select every row; only leading
            # rows are shown when there is no room for a head and tail.
            return self.index[:max_rows].tolist()

        # rows displayed at the beginning and end
        return (
            self.index[:half_shown_rows].tolist()
            + self.index[-half_shown_rows:].tolist()
        )

    def _repr_html_(
        self: CytoDataFrame_type, key: Optional[Union[int, str]] = None
    ) -> str:
        """
        Returns HTML representation of the underlying pandas DataFrame
        for use within Jupyter notebook environments and similar.

        Referenced with modifications from:
        https://github.com/pandas-dev/pandas/blob/v2.2.2/pandas/core/frame.py#L1216

        Modifications added to help achieve image-based output for single-cell
        data within the context of CytoDataFrame and coSMicQC.

        Mainly for Jupyter notebooks.

        Args:
            key (Optional[Union[int, str]]):
                Unused; kept for interface compatibility.

        Returns:
            str:
                The data as HTML, with image file names of displayed rows
                replaced by inline images where they could be processed.
                Returns None when the `display.notebook_repr_html` option is
                disabled.
        """

        if self._info_repr():
            buf = StringIO()
            self.info(buf=buf)
            # need to escape the <class>, should be the first line.
            val = buf.getvalue().replace("<", r"&lt;", 1)
            val = val.replace(">", r"&gt;", 1)
            return f"<pre>{val}</pre>"

        if not get_option("display.notebook_repr_html"):
            return None

        max_rows = get_option("display.max_rows")
        min_rows = get_option("display.min_rows")
        max_cols = get_option("display.max_columns")
        show_dimensions = get_option("display.show_dimensions")

        bounding_box_data = self._custom_attrs["data_bounding_box"]
        image_paths_data = self._custom_attrs["data_image_paths"]
        current_columns = set(self.columns.tolist())

        # re-add bounding box cols if they are no longer available as in cases
        # of masking or accessing various pandas attr's
        bounding_box_externally_joined = False
        if bounding_box_data is not None and not all(
            col in current_columns for col in bounding_box_data.columns.tolist()
        ):
            data = self.join(other=bounding_box_data)
            bounding_box_externally_joined = True
        else:
            data = self.copy()

        # re-add image path (dirs for images) cols if they are no
        # longer available as in cases of masking or accessing
        # various pandas attr's
        image_paths_externally_joined = False
        if image_paths_data is not None and not all(
            col in current_columns for col in image_paths_data.columns.tolist()
        ):
            data = data.join(other=image_paths_data)
            image_paths_externally_joined = True

        # Images are cropped using bounding boxes, so image columns are only
        # rendered when bounding box data is available. Otherwise the file
        # names are displayed as-is.
        image_cols = self.find_image_columns()
        if image_cols and bounding_box_data is not None:
            # attempt to find the image path columns
            image_path_cols = self.find_image_path_columns(
                image_cols=image_cols, all_cols=data.columns
            )

            # gather indices which will be displayed based on pandas
            # configuration
            display_indices = self.get_displayed_rows()

            # Bounding box columns are looked up by name to determine which
            # part of the bounding box they relate to (the list of column
            # names could be in various order).
            bounding_box_cols = bounding_box_data.columns.tolist()
            x_min_col, y_min_col, x_max_col, y_max_col = (
                next(col for col in bounding_box_cols if marker in col)
                for marker in ("Minimum_X", "Minimum_Y", "Maximum_X", "Maximum_Y")
            )

            for image_col in image_cols:
                # not every image column necessarily has a path column
                path_col = image_path_cols.get(image_col)

                data.loc[display_indices, image_col] = data.loc[
                    display_indices
                ].apply(
                    lambda row, image_col=image_col, path_col=path_col: (
                        self.process_image_data_as_html_display(
                            data_value=row[image_col],
                            bounding_box=(
                                row[x_min_col],
                                row[y_min_col],
                                row[x_max_col],
                                row[y_max_col],
                            ),
                            # set the image path based on the image_path cols.
                            image_path=(None if path_col is None else row[path_col]),
                        )
                    ),
                    axis=1,
                )

        # remove columns which were only joined to support image rendering
        if bounding_box_externally_joined:
            data = data.drop(bounding_box_data.columns.tolist(), axis=1)

        if image_paths_externally_joined:
            data = data.drop(image_paths_data.columns.tolist(), axis=1)

        formatter = fmt.DataFrameFormatter(
            data,
            columns=None,
            col_space=None,
            na_rep="NaN",
            formatters=None,
            float_format=None,
            sparsify=None,
            justify=None,
            index_names=True,
            header=True,
            index=True,
            bold_rows=True,
            # note: we avoid escapes to allow HTML rendering for images
            escape=False,
            max_rows=max_rows,
            min_rows=min_rows,
            max_cols=max_cols,
            show_dimensions=show_dimensions,
            decimal=".",
        )

        return fmt.DataFrameRenderer(formatter).to_html()