JUMP UMAP analysis with coSMicQC¶
This notebook analyzes JUMP data (cpg0000-jump-pilot) by leveraging UMAP and coSMicQC.
Outline¶
We focus on a single file from the JUMP dataset: BR00117012.sqlite.
This file is downloaded and prepared by CytoTable to form a single-cell Parquet file which includes all compartment feature data at the single-cell level.
We use coSMicQC to find and remove erroneous outlier data in order to prepare for UMAP analysis.
Afterwards, we use UMAP to demonstrate patterns within the data.
import logging
import pathlib
from typing import List, Optional
import holoviews
import hvplot.pandas
import numpy as np
import pandas as pd
import plotly.express as px
import pycytominer
import umap
from cytotable.convert import convert
from IPython.display import HTML, Image
from parsl.config import Config
from parsl.executors import ThreadPoolExecutor
from pyarrow import parquet
import cosmicqc
# plate identifier referenced throughout the notebook
example_plate = "BR00117012"

# use bokeh as the hvplot visualization backend
hvplot.extension("bokeh")

# make sure an output directory exists for exported images
pathlib.Path("./images").mkdir(exist_ok=True)

# silence bokeh's plot-export warnings
logging.getLogger("bokeh.io.export").setLevel(logging.ERROR)
%opts magic unavailable (pyparsing cannot be imported)
%compositor magic unavailable (pyparsing cannot be imported)
Define utility functions for use within this notebook¶
def generate_umap_embeddings(
    df_input: pd.DataFrame,
    cols_metadata_to_exclude: List[str],
    umap_n_components: int = 2,
    random_state: Optional[int] = None,
) -> np.ndarray:
    """
    Generates UMAP (Uniform Manifold Approximation and Projection)
    embeddings for a given input dataframe,
    excluding specified metadata columns.

    Args:
        df_input (pd.DataFrame):
            A dataframe which is expected to contain
            numeric columns to be used for UMAP fitting.
        cols_metadata_to_exclude (List[str]):
            A list of column names representing
            metadata columns that should be excluded
            from the UMAP transformation.
        umap_n_components (int):
            Number of components to use for UMAP.
            Default = 2.
        random_state (Optional[int]):
            Number to use for random state and
            optional determinism.
            Default = None (random each time)
            Note: values besides None will turn
            off parallelism for umap-learn, likely
            meaning increased processing time.

    Returns:
        np.ndarray:
            An array of UMAP embeddings with
            umap_n_components columns for each
            row in the input.
    """
    # Make sure to reinitialize UMAP instance per plate so no
    # fitted state leaks between calls.
    umap_fit = umap.UMAP(
        n_components=umap_n_components,
        random_state=random_state,
        # set the default value if we didn't set a random_state
        # otherwise set to 1 (umap-learn will override anyways).
        # this is set to avoid warnings from umap-learn during
        # processing.
        n_jobs=-1 if random_state is None else 1,
    )

    # use a set for O(1) membership tests when excluding columns
    excluded_cols = set(cols_metadata_to_exclude)

    # keep only non-metadata, non-cosmicqc ("cqc.") columns
    feature_cols = [
        col
        for col in df_input.columns
        if col not in excluded_cols and "cqc." not in col
    ]

    # Fit UMAP on the numeric feature columns and return the embeddings
    return umap_fit.fit_transform(
        # select only numeric types from the dataframe
        X=df_input[feature_cols].select_dtypes(include=[np.number])
    )
def plot_hvplot_scatter(
    embeddings: np.ndarray,
    title: str,
    filename: str,
    color_dataframe: Optional[pd.DataFrame] = None,
    color_column: Optional[str] = None,
    bgcolor: str = "black",
    cmap: str = "plasma",
    clabel: Optional[str] = None,
) -> holoviews.core.spaces.DynamicMap:
    """
    Creates an outlier-focused scatter hvplot for viewing
    UMAP embedding data with cosmicqc outliers coloration.

    Args:
        embeddings (np.ndarray):
            A numpy ndarray which includes
            embedding data to display.
        title (str):
            Title for the UMAP scatter plot.
        filename (str):
            Filename which indicates where to export the
            plot.
        color_dataframe (Optional[pd.DataFrame]):
            A dataframe which includes data used for
            color mapping within the plot. For example,
            coSMicQC .is_outlier columns.
        color_column (Optional[str]):
            Column name from color_dataframe to use
            for coloring the scatter plot.
        bgcolor (str):
            Sets the background color of the plot.
        cmap (str):
            Sets the colormap used for the plot.
            See here for more:
            https://holoviews.org/user_guide/Colormaps.html
        clabel (Optional[str]):
            Sets a label on the color map key displayed
            horizontally. Defaults to None (no label).

    Returns:
        holoviews.core.spaces.DynamicMap:
            A dynamic holoviews scatter plot which may be
            displayed in a Jupyter notebook.
    """
    # derive the color values ahead of plotting; boolean outlier
    # flags are cast to int so they map onto the colormap. None
    # leaves hvplot's default coloring in place.
    color_values = (
        color_dataframe[color_column].astype(int).values
        if color_dataframe is not None
        else None
    )

    # build a rasterized scatter plot through hvplot
    plot = pd.DataFrame(embeddings).hvplot.scatter(
        title=title,
        x="0",
        y="1",
        alpha=0.1,
        rasterize=True,
        c=color_values,
        cnorm="linear",
        cmap=cmap,
        bgcolor=bgcolor,
        height=700,
        width=800,
        clabel=clabel,
    )

    # export the plot to the provided filename
    hvplot.save(obj=plot, filename=filename, center=False)

    return plot
Merge single-cell compartment data into one table¶
# reuse previously prepared data when it already exists on disk
prepared_parquet = pathlib.Path(f"./{example_plate}.parquet")
if prepared_parquet.is_file():
    merged_single_cells = f"./{example_plate}.parquet"
else:
    # process BR00117012.sqlite using CytoTable to prepare data
    merged_single_cells = convert(
        source_path=(
            "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/workspace"
            "/backend/2020_11_04_CPJUMP1/BR00117012/BR00117012.sqlite"
        ),
        dest_path=f"./{example_plate}.parquet",
        dest_datatype="parquet",
        source_datatype="sqlite",
        chunk_size=8000,
        preset="cellprofiler_sqlite_cpg0016_jump",
        # allows AWS S3 requests without login
        no_sign_request=True,
        # use explicit cache to avoid temp cache removal
        local_cache_dir="./sqlite_s3_cache/",
        parsl_config=Config(
            executors=[ThreadPoolExecutor(label="tpe_for_jump_processing")]
        ),
        sort_output=False,
    )

# read only the metadata from parquet file
parquet.ParquetFile(merged_single_cells).metadata
<pyarrow._parquet.FileMetaData object at 0x724121c0ade0>
created_by: parquet-cpp-arrow version 16.1.0
num_columns: 5804
num_rows: 279789
num_row_groups: 14
format_version: 2.6
serialized_size: 12762532
Process merged single-cell data using coSMicQC¶
# show the first few columns for metadata column names
# (used below to decide which Metadata_/Image_ columns to keep)
schema_names = parquet.read_schema(merged_single_cells).names
schema_names[:12]
['Metadata_ImageNumber',
'Image_Metadata_Row',
'Image_Metadata_Site',
'Metadata_ObjectNumber',
'Metadata_ObjectNumber_1',
'Metadata_ObjectNumber_2',
'Metadata_Plate',
'Metadata_Well',
'Image_TableNumber',
'Cytoplasm_AreaShape_Area',
'Cytoplasm_AreaShape_BoundingBoxArea',
'Cytoplasm_AreaShape_BoundingBoxMaximum_X']
# set a list of metadata columns for use throughout
metadata_cols = [
    "Metadata_ImageNumber",
    "Image_Metadata_Row",
    "Image_Metadata_Site",
    "Metadata_ObjectNumber",
    "Metadata_Plate",
    "Metadata_Well",
    "Image_TableNumber",
]

# nuclei feature columns used for outlier detection below
feature_cols = [
    "Nuclei_AreaShape_Area",
    "Nuclei_AreaShape_FormFactor",
    "Nuclei_AreaShape_Eccentricity",
]

# read only the metadata and feature columns from the parquet file
df_merged_single_cells = pd.read_parquet(
    path=merged_single_cells,
    columns=metadata_cols + feature_cols,
)

df_merged_single_cells.head()
Metadata_ImageNumber | Image_Metadata_Row | Image_Metadata_Site | Metadata_ObjectNumber | Metadata_Plate | Metadata_Well | Image_TableNumber | Nuclei_AreaShape_Area | Nuclei_AreaShape_FormFactor | Nuclei_AreaShape_Eccentricity | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 2 | 1 | 2 | 2 | BR00117012 | A01 | 35196781680809101223867031596229856156 | 1091 | 0.895393 | 0.694154 |
1 | 2 | 1 | 2 | 3 | BR00117012 | A01 | 35196781680809101223867031596229856156 | 1007 | 0.837631 | 0.819062 |
2 | 2 | 1 | 2 | 4 | BR00117012 | A01 | 35196781680809101223867031596229856156 | 1368 | 0.833197 | 0.820257 |
3 | 2 | 1 | 2 | 5 | BR00117012 | A01 | 35196781680809101223867031596229856156 | 847 | 0.902995 | 0.345575 |
4 | 2 | 1 | 2 | 6 | BR00117012 | A01 | 35196781680809101223867031596229856156 | 983 | 0.863005 | 0.742060 |
# detect cells with abnormally large nuclei via the
# "large_nuclei" threshold set
print("Large nuclei outliers:")
df_labeled_outliers = cosmicqc.analyze.find_outliers(
    df=df_merged_single_cells,
    feature_thresholds="large_nuclei",
    metadata_columns=metadata_cols,
)
Large nuclei outliers:
Number of outliers: 1355 (0.48%)
Outliers Range:
Nuclei_AreaShape_Area Min: 1754
Nuclei_AreaShape_Area Max: 4414
Nuclei_AreaShape_FormFactor Min: 0.3367261940249281
Nuclei_AreaShape_FormFactor Max: 0.7140072671383899
# detect cells with highly eccentric nuclei via the
# "elongated_nuclei" threshold set
print("Elongated nuclei outliers:")
df_labeled_outliers = cosmicqc.analyze.find_outliers(
    df=df_merged_single_cells,
    feature_thresholds="elongated_nuclei",
    metadata_columns=metadata_cols,
)
Elongated nuclei outliers:
Number of outliers: 15 (0.01%)
Outliers Range:
Nuclei_AreaShape_Eccentricity Min: 0.9868108584805481
Nuclei_AreaShape_Eccentricity Max: 0.9995098494433163
# detect cells with small, irregularly-shaped nuclei via the
# "small_and_low_formfactor_nuclei" threshold set
print("Small and low formfactor nuclei outliers:")
df_labeled_outliers = cosmicqc.analyze.find_outliers(
    df=df_merged_single_cells,
    feature_thresholds="small_and_low_formfactor_nuclei",
    metadata_columns=metadata_cols,
)
Small and low formfactor nuclei outliers:
Number of outliers: 6548 (2.34%)
Outliers Range:
Nuclei_AreaShape_Area Min: 79
Nuclei_AreaShape_Area Max: 744
Nuclei_AreaShape_FormFactor Min: 0.0945907341645769
Nuclei_AreaShape_FormFactor Max: 0.7781815132858318
# annotate every row with outlier labels (and z-scores)
# from all cosmicqc threshold sets at once
df_labeled_outliers = cosmicqc.analyze.label_outliers(
    df=df_merged_single_cells,
    include_threshold_scores=True,
)

# preview the cosmicqc-added ("cqc.") columns
cqc_cols = [col for col in df_labeled_outliers.columns if "cqc." in col]
df_labeled_outliers[cqc_cols].head()
cqc.small_and_low_formfactor_nuclei.Z_Score.Nuclei_AreaShape_Area | cqc.small_and_low_formfactor_nuclei.Z_Score.Nuclei_AreaShape_FormFactor | cqc.small_and_low_formfactor_nuclei.is_outlier | cqc.elongated_nuclei.Z_Score.Nuclei_AreaShape_Eccentricity | cqc.elongated_nuclei.is_outlier | cqc.large_nuclei.Z_Score.Nuclei_AreaShape_Area | cqc.large_nuclei.Z_Score.Nuclei_AreaShape_FormFactor | cqc.large_nuclei.is_outlier | |
---|---|---|---|---|---|---|---|---|
0 | 0.030121 | 0.826248 | False | -0.154094 | False | 0.030121 | 0.826248 | False |
1 | -0.219592 | -0.073800 | False | 0.765830 | False | -0.219592 | -0.073800 | False |
2 | 0.853580 | -0.142903 | False | 0.774634 | False | 0.853580 | -0.142903 | False |
3 | -0.695236 | 0.944704 | False | -2.721308 | False | -0.695236 | 0.944704 | False |
4 | -0.290938 | 0.321578 | False | 0.198723 | False | -0.290938 | 0.321578 | False |
# create a column which indicates whether an erroneous outlier was detected
# from all cosmicqc outlier threshold sets. For ex. True for is_outlier in
# one threshold set out of three would show True for this column. False for
# is_outlier in all threshold sets would show False for this column.
is_outlier_cols = [
    col for col in df_labeled_outliers.columns if ".is_outlier" in col
]
df_labeled_outliers["analysis.included_at_least_one_outlier"] = (
    df_labeled_outliers[is_outlier_cols].any(axis=1)
)

# tally outlier vs non-outlier rows
outliers_counts = df_labeled_outliers[
    "analysis.included_at_least_one_outlier"
].value_counts()
outliers_counts
analysis.included_at_least_one_outlier
False 271883
True 7906
Name: count, dtype: int64
# show the percentage of the total dataset which includes outliers.
# NOTE: the ratio is outliers / total rows (not outliers / non-outliers),
# and we index value_counts by label rather than position so the result
# does not depend on the sort order of value_counts.
print(
    (outliers_counts[True] / outliers_counts.sum()) * 100,
    "%",
    "of",
    outliers_counts.sum(),
    "include erroneous outliers of some kind.",
)
2.9078684581235312 % of 271883 include erroneous outliers of some kind.
# show histograms to help visualize the data
# NOTE(review): show_report presumably comes from a pandas accessor
# registered by the cosmicqc import above — confirm against cosmicqc docs
df_labeled_outliers.show_report(); # fmt: skip