coSMicQC in a nutshell
This notebook demonstrates various capabilities of coSMicQC
using examples.
import pathlib
import pandas as pd
import cosmicqc
# Location of the parquet-based dataset used throughout this notebook
# (here: CellProfiler SQLite data which was processed by CytoTable).
data_path = (
    "../../../tests/data/cytotable/NF1_cellpainting_data/"
    "Plate_2_with_image_data.parquet"
)

# Context directories for the images and masks associated with the dataset;
# both are siblings of the parquet file (same parent directory).
image_context_dir = pathlib.Path(data_path).with_name("Plate_2_images")
mask_context_dir = pathlib.Path(data_path).with_name("Plate_2_masks")

# Build a cosmicqc CytoDataFrame (single-cell DataFrame) from the dataset
# and point it at the image and mask context directories.
scdf = cosmicqc.CytoDataFrame(
    data=data_path,
    data_context_dir=image_context_dir,
    data_mask_context_dir=mask_context_dir,
)

# Leave the dataframe as the final expression so the notebook displays it.
scdf
Metadata_ImageNumber | Image_Metadata_Plate_x | Metadata_number_of_singlecells | Image_Metadata_Site_x | Image_Metadata_Well_x | Metadata_Cells_Number_Object_Number | Metadata_Cytoplasm_Parent_Cells | Metadata_Cytoplasm_Parent_Nuclei | Metadata_Nuclei_Number_Object_Number | Cytoplasm_AreaShape_Area | ... | Image_Threshold_SumOfEntropies_Cells | Image_Threshold_SumOfEntropies_Nuclei | Image_Threshold_WeightedVariance_Cells | Image_Threshold_WeightedVariance_Nuclei | Image_URL_DAPI | Image_URL_GFP | Image_URL_RFP | Image_Width_DAPI | Image_Width_GFP | Image_Width_RFP | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Plate_2 | 44 | 1 | A12 | 1 | 1 | 2 | 2 | 21024.0 | ... | -12.181288 | -11.699993 | 0.992624 | 0.657791 | file:/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/Corrected_Images/Corrected_Plate_2/A12_01_1_1_DAPI_001.tif | file:/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/Corrected_Images/Corrected_Plate_2/A12_01_2_1_GFP_001.tif | file:/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/Corrected_Images/Corrected_Plate_2/A12_01_3_1_RFP_001.tif | 1224 | 1224 | 1224 |
1 | 1 | Plate_2 | 44 | 1 | A12 | 4 | 4 | 7 | 7 | 12754.0 | ... | -12.181288 | -11.699993 | 0.992624 | 0.657791 | file:/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/Corrected_Images/Corrected_Plate_2/A12_01_1_1_DAPI_001.tif | file:/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/Corrected_Images/Corrected_Plate_2/A12_01_2_1_GFP_001.tif | file:/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/Corrected_Images/Corrected_Plate_2/A12_01_3_1_RFP_001.tif | 1224 | 1224 | 1224 |
2 | 1 | Plate_2 | 44 | 1 | A12 | 7 | 7 | 10 | 10 | 23976.0 | ... | -12.181288 | -11.699993 | 0.992624 | 0.657791 | file:/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/Corrected_Images/Corrected_Plate_2/A12_01_1_1_DAPI_001.tif | file:/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/Corrected_Images/Corrected_Plate_2/A12_01_2_1_GFP_001.tif | file:/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/Corrected_Images/Corrected_Plate_2/A12_01_3_1_RFP_001.tif | 1224 | 1224 | 1224 |
3 | 1 | Plate_2 | 44 | 1 | A12 | 8 | 8 | 12 | 12 | 19374.0 | ... | -12.181288 | -11.699993 | 0.992624 | 0.657791 | file:/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/Corrected_Images/Corrected_Plate_2/A12_01_1_1_DAPI_001.tif | file:/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/Corrected_Images/Corrected_Plate_2/A12_01_2_1_GFP_001.tif | file:/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/Corrected_Images/Corrected_Plate_2/A12_01_3_1_RFP_001.tif | 1224 | 1224 | 1224 |
4 | 1 | Plate_2 | 44 | 1 | A12 | 9 | 9 | 13 | 13 | 27385.0 | ... | -12.181288 | -11.699993 | 0.992624 | 0.657791 | file:/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/Corrected_Images/Corrected_Plate_2/A12_01_1_1_DAPI_001.tif | file:/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/Corrected_Images/Corrected_Plate_2/A12_01_2_1_GFP_001.tif | file:/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/Corrected_Images/Corrected_Plate_2/A12_01_3_1_RFP_001.tif | 1224 | 1224 | 1224 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1709 | 128 | Plate_2 | 59 | 4 | H7 | 10 | 10 | 14 | 14 | 24942.0 | ... | -12.566582 | -11.633043 | 1.624310 | 0.545186 | file:/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/Corrected_Images/Corrected_Plate_2/H7_01_1_4_DAPI_001.tif | file:/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/Corrected_Images/Corrected_Plate_2/H7_01_2_4_GFP_001.tif | file:/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/Corrected_Images/Corrected_Plate_2/H7_01_3_4_RFP_001.tif | 1224 | 1224 | 1224 |
1710 | 128 | Plate_2 | 59 | 4 | H7 | 11 | 11 | 15 | 15 | 6627.0 | ... | -12.566582 | -11.633043 | 1.624310 | 0.545186 | file:/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/Corrected_Images/Corrected_Plate_2/H7_01_1_4_DAPI_001.tif | file:/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/Corrected_Images/Corrected_Plate_2/H7_01_2_4_GFP_001.tif | file:/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/Corrected_Images/Corrected_Plate_2/H7_01_3_4_RFP_001.tif | 1224 | 1224 | 1224 |
1711 | 128 | Plate_2 | 59 | 4 | H7 | 12 | 12 | 16 | 16 | 11216.0 | ... | -12.566582 | -11.633043 | 1.624310 | 0.545186 | file:/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/Corrected_Images/Corrected_Plate_2/H7_01_1_4_DAPI_001.tif | file:/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/Corrected_Images/Corrected_Plate_2/H7_01_2_4_GFP_001.tif | file:/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/Corrected_Images/Corrected_Plate_2/H7_01_3_4_RFP_001.tif | 1224 | 1224 | 1224 |
1712 | 128 | Plate_2 | 59 | 4 | H7 | 13 | 13 | 17 | 17 | 15279.0 | ... | -12.566582 | -11.633043 | 1.624310 | 0.545186 | file:/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/Corrected_Images/Corrected_Plate_2/H7_01_1_4_DAPI_001.tif | file:/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/Corrected_Images/Corrected_Plate_2/H7_01_2_4_GFP_001.tif | file:/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/Corrected_Images/Corrected_Plate_2/H7_01_3_4_RFP_001.tif | 1224 | 1224 | 1224 |
1713 | 128 | Plate_2 | 59 | 4 | H7 | 14 | 14 | 20 | 20 | 7106.0 | ... | -12.566582 | -11.633043 | 1.624310 | 0.545186 | file:/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/Corrected_Images/Corrected_Plate_2/H7_01_1_4_DAPI_001.tif | file:/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/Corrected_Images/Corrected_Plate_2/H7_01_2_4_GFP_001.tif | file:/home/jenna/nf1_cellpainting_data/1.cellprofiler_ic/Corrected_Images/Corrected_Plate_2/H7_01_3_4_RFP_001.tif | 1224 | 1224 | 1224 |
1714 rows × 2076 columns
# Determine, row by row, whether a value counts as an outlier under a
# threshold definition. The definition maps a column name to the z-score
# number which is treated as the limit for that column.
# NOTE(review): a negative limit presumably flags values below that z-score;
# confirm against the cosmicqc documentation.
outlier_flags = cosmicqc.analyze.identify_outliers(
    df=scdf,
    feature_thresholds={"Nuclei_AreaShape_Area": -1},
)

# Sort the flags so that the result is displayed in sorted order.
outlier_flags.sort_values()
0 False
1085 False
1083 False
1082 False
1080 False
...
572 True
571 True
567 True
280 True
856 True
Name: cqc.custom.Z_Score.Nuclei_AreaShape_Area, Length: 1714, dtype: bool
# Report the number of outliers for a column name and threshold, passed to
# the `find_outliers` function through its `feature_thresholds` parameter.
# The listed metadata columns are shown alongside the matching rows.
area_threshold = {"Nuclei_AreaShape_Area": -1}
context_columns = ["Metadata_ImageNumber", "Image_Metadata_Plate_x"]

cosmicqc.analyze.find_outliers(
    df=scdf,
    metadata_columns=context_columns,
    feature_thresholds=area_threshold,
)
Number of outliers: 328 (19.14%)
Outliers Range:
Nuclei_AreaShape_Area Min: 734.0
Nuclei_AreaShape_Area Max: 1904.0
Nuclei_AreaShape_Area Metadata_ImageNumber Image_Metadata_Plate_x
23 921.0 2 Plate_2
28 845.0 2 Plate_2
29 1024.0 2 Plate_2
32 787.0 2 Plate_2
37 1347.0 2 Plate_2
... ... ... ...
1682 1497.0 127 Plate_2
1689 1794.0 127 Plate_2
1692 1732.0 127 Plate_2
1699 1149.0 127 Plate_2
1707 1594.0 128 Plate_2
[328 rows x 3 columns]
# Produce a labeled dataset containing z-scores together with whether those
# scores are interpreted as outliers or inliers. No thresholds are passed
# here, so the pre-defined threshold sets loaded from defaults are used
# (cosmicqc can accept user-defined thresholds too!).
labeled_scdf = cosmicqc.analyze.label_outliers(df=scdf, include_threshold_scores=True)

# Number of trailing columns which were added by the label_outliers function.
added_column_count = 8

# Display every row, restricted to those trailing columns.
labeled_scdf.iloc[:, -added_column_count:]
cqc.small_and_low_formfactor_nuclei.Z_Score.Nuclei_AreaShape_Area | cqc.small_and_low_formfactor_nuclei.Z_Score.Nuclei_AreaShape_FormFactor | cqc.small_and_low_formfactor_nuclei.is_outlier | cqc.elongated_nuclei.Z_Score.Nuclei_AreaShape_Eccentricity | cqc.elongated_nuclei.is_outlier | cqc.large_nuclei.Z_Score.Nuclei_AreaShape_Area | cqc.large_nuclei.Z_Score.Nuclei_AreaShape_FormFactor | cqc.large_nuclei.is_outlier | |
---|---|---|---|---|---|---|---|---|
0 | 0.848820 | 0.219903 | False | 0.498274 | False | 0.848820 | 0.219903 | False |
1 | -0.252521 | -1.280795 | False | -0.659400 | False | -0.252521 | -1.280795 | False |
2 | -0.402491 | -0.325652 | False | 0.819165 | False | -0.402491 | -0.325652 | False |
3 | 0.329549 | -0.268920 | False | 0.961218 | False | 0.329549 | -0.268920 | False |
4 | 1.153446 | 0.028845 | False | -0.372891 | False | 1.153446 | 0.028845 | False |
... | ... | ... | ... | ... | ... | ... | ... | ... |
1709 | 0.598557 | -0.280063 | False | 0.923075 | False | 0.598557 | -0.280063 | False |
1710 | -0.716490 | 0.068293 | False | 0.650830 | False | -0.716490 | 0.068293 | False |
1711 | 1.187189 | 0.833264 | False | -0.752359 | False | 1.187189 | 0.833264 | False |
1712 | -0.699619 | 0.534479 | False | -0.747030 | False | -0.699619 | 0.534479 | False |
1713 | -0.990185 | 0.356614 | False | -1.309290 | False | -0.990185 | 0.356614 | False |
1714 rows × 8 columns
# Show histogram reports on the outliers and inliers
# for each threshold set represented in the new columns.
# NOTE(review): the trailing semicolon presumably suppresses the notebook's
# echo of the return value, and `# fmt: skip` presumably keeps code
# formatters from removing it -- confirm before changing this line.
labeled_scdf.show_report(); # fmt: skip
# Columns to review: identifying metadata, the large-nuclei outlier label,
# and the image filename columns for each channel.
review_columns = [
    "Metadata_ImageNumber",
    "Metadata_Cells_Number_Object_Number",
    "cqc.large_nuclei.is_outlier",
    "Image_FileName_GFP",
    "Image_FileName_RFP",
    "Image_FileName_DAPI",
]

# Order the rows so that those labeled as large-nuclei outliers come first.
outliers_first = labeled_scdf.sort_values(
    by="cqc.large_nuclei.is_outlier", ascending=False
)

# Show cropped images through CytoDataFrame from the dataset
# to help analyze the outliers.
outliers_first[review_columns]
Metadata_ImageNumber | Metadata_Cells_Number_Object_Number | cqc.large_nuclei.is_outlier | Image_FileName_GFP | Image_FileName_RFP | Image_FileName_DAPI | |
---|---|---|---|---|---|---|
699 | 50 | 2 | True | |||
1557 | 113 | 10 | True | |||
1677 | 126 | 9 | True | |||
457 | 34 | 6 | True | |||
882 | 61 | 6 | True | |||
... | ... | ... | ... | ... | ... | ... |
570 | 45 | 13 | False | |||
569 | 45 | 10 | False | |||
568 | 45 | 9 | False | |||
567 | 45 | 8 | False | |||
1713 | 128 | 14 | False |
1714 rows × 6 columns
# A cosmicqc.CytoDataFrame can be converted into a plain pd.DataFrame
# whenever that is needed.
df = pd.DataFrame(data=scdf)

# Print the resulting type to confirm the conversion, then display the data.
print(type(df))
df
<class 'pandas.core.frame.DataFrame'>
Metadata_ImageNumber | Image_Metadata_Plate_x | Metadata_number_of_singlecells | Image_Metadata_Site_x | Image_Metadata_Well_x | Metadata_Cells_Number_Object_Number | Metadata_Cytoplasm_Parent_Cells | Metadata_Cytoplasm_Parent_Nuclei | Metadata_Nuclei_Number_Object_Number | Cytoplasm_AreaShape_Area | ... | Image_Threshold_SumOfEntropies_Cells | Image_Threshold_SumOfEntropies_Nuclei | Image_Threshold_WeightedVariance_Cells | Image_Threshold_WeightedVariance_Nuclei | Image_URL_DAPI | Image_URL_GFP | Image_URL_RFP | Image_Width_DAPI | Image_Width_GFP | Image_Width_RFP | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Plate_2 | 44 | 1 | A12 | 1 | 1 | 2 | 2 | 21024.0 | ... | -12.181288 | -11.699993 | 0.992624 | 0.657791 | file:/home/jenna/nf1_cellpainting_data/1.cellp... | file:/home/jenna/nf1_cellpainting_data/1.cellp... | file:/home/jenna/nf1_cellpainting_data/1.cellp... | 1224 | 1224 | 1224 |
1 | 1 | Plate_2 | 44 | 1 | A12 | 4 | 4 | 7 | 7 | 12754.0 | ... | -12.181288 | -11.699993 | 0.992624 | 0.657791 | file:/home/jenna/nf1_cellpainting_data/1.cellp... | file:/home/jenna/nf1_cellpainting_data/1.cellp... | file:/home/jenna/nf1_cellpainting_data/1.cellp... | 1224 | 1224 | 1224 |
2 | 1 | Plate_2 | 44 | 1 | A12 | 7 | 7 | 10 | 10 | 23976.0 | ... | -12.181288 | -11.699993 | 0.992624 | 0.657791 | file:/home/jenna/nf1_cellpainting_data/1.cellp... | file:/home/jenna/nf1_cellpainting_data/1.cellp... | file:/home/jenna/nf1_cellpainting_data/1.cellp... | 1224 | 1224 | 1224 |
3 | 1 | Plate_2 | 44 | 1 | A12 | 8 | 8 | 12 | 12 | 19374.0 | ... | -12.181288 | -11.699993 | 0.992624 | 0.657791 | file:/home/jenna/nf1_cellpainting_data/1.cellp... | file:/home/jenna/nf1_cellpainting_data/1.cellp... | file:/home/jenna/nf1_cellpainting_data/1.cellp... | 1224 | 1224 | 1224 |
4 | 1 | Plate_2 | 44 | 1 | A12 | 9 | 9 | 13 | 13 | 27385.0 | ... | -12.181288 | -11.699993 | 0.992624 | 0.657791 | file:/home/jenna/nf1_cellpainting_data/1.cellp... | file:/home/jenna/nf1_cellpainting_data/1.cellp... | file:/home/jenna/nf1_cellpainting_data/1.cellp... | 1224 | 1224 | 1224 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1709 | 128 | Plate_2 | 59 | 4 | H7 | 10 | 10 | 14 | 14 | 24942.0 | ... | -12.566582 | -11.633043 | 1.624310 | 0.545186 | file:/home/jenna/nf1_cellpainting_data/1.cellp... | file:/home/jenna/nf1_cellpainting_data/1.cellp... | file:/home/jenna/nf1_cellpainting_data/1.cellp... | 1224 | 1224 | 1224 |
1710 | 128 | Plate_2 | 59 | 4 | H7 | 11 | 11 | 15 | 15 | 6627.0 | ... | -12.566582 | -11.633043 | 1.624310 | 0.545186 | file:/home/jenna/nf1_cellpainting_data/1.cellp... | file:/home/jenna/nf1_cellpainting_data/1.cellp... | file:/home/jenna/nf1_cellpainting_data/1.cellp... | 1224 | 1224 | 1224 |
1711 | 128 | Plate_2 | 59 | 4 | H7 | 12 | 12 | 16 | 16 | 11216.0 | ... | -12.566582 | -11.633043 | 1.624310 | 0.545186 | file:/home/jenna/nf1_cellpainting_data/1.cellp... | file:/home/jenna/nf1_cellpainting_data/1.cellp... | file:/home/jenna/nf1_cellpainting_data/1.cellp... | 1224 | 1224 | 1224 |
1712 | 128 | Plate_2 | 59 | 4 | H7 | 13 | 13 | 17 | 17 | 15279.0 | ... | -12.566582 | -11.633043 | 1.624310 | 0.545186 | file:/home/jenna/nf1_cellpainting_data/1.cellp... | file:/home/jenna/nf1_cellpainting_data/1.cellp... | file:/home/jenna/nf1_cellpainting_data/1.cellp... | 1224 | 1224 | 1224 |
1713 | 128 | Plate_2 | 59 | 4 | H7 | 14 | 14 | 20 | 20 | 7106.0 | ... | -12.566582 | -11.633043 | 1.624310 | 0.545186 | file:/home/jenna/nf1_cellpainting_data/1.cellp... | file:/home/jenna/nf1_cellpainting_data/1.cellp... | file:/home/jenna/nf1_cellpainting_data/1.cellp... | 1224 | 1224 | 1224 |
1714 rows × 2076 columns