Python API#

Top-level API#

Public package interface for iceberg_bioimage.

class iceberg_bioimage.CatalogScanOptions(columns: Sequence[str] | None = None, where: str | None = None, snapshot_id: int | None = None, limit: int | None = None)[source]#

Options for scanning a catalog-backed metadata table.

class iceberg_bioimage.ContractValidationResult(target: str, present_columns: list[str], required_columns: list[str], recommended_columns: list[str], missing_required_columns: list[str], missing_recommended_columns: list[str], warnings: list[str] = <factory>)[source]#

Serializable result for schema-level contract validation.

property is_valid: bool#

Return whether all required columns are present.

to_dict() dict[str, Any][source]#

Return a JSON-serializable representation.

to_json(**json_kwargs: Any) str[source]#

Serialize the validation result to JSON.

class iceberg_bioimage.CytominingWarehouseResult(warehouse_root: str, tables_written: list[str], row_counts: dict[str, int], manifest_path: str | None = None)[source]#

Serializable result for exporting Parquet-backed Cytomining warehouses.

to_dict() dict[str, Any][source]#

Return a JSON-serializable representation.

to_json(**json_kwargs: Any) str[source]#

Serialize the Cytomining warehouse export result to JSON.

class iceberg_bioimage.DatasetSummary(source_uri: str, format_family: str, image_asset_count: int, chunked_asset_count: int, array_paths: list[str], dtypes: list[str], shapes: list[list[int]], axes: list[str], channel_counts: list[int], storage_variants: list[str], warnings: list[str] = <factory>)[source]#

User-facing summary of a scanned dataset.

to_dict() dict[str, Any][source]#

Return a JSON-serializable representation.

to_json(**json_kwargs: Any) str[source]#

Serialize the dataset summary to JSON.

class iceberg_bioimage.ImageAsset(uri: str, shape: list[int], dtype: str, array_path: str | None = None, chunk_shape: list[int] | None = None, metadata: dict[str, Any] = <factory>, image_id: str | None = None)[source]#

Canonical representation of one discovered image asset.

to_dict() dict[str, Any][source]#

Return a JSON-serializable representation.

class iceberg_bioimage.RegistrationResult(source_uri: str, image_assets_rows_published: int, chunk_rows_published: int)[source]#

Serializable result for a metadata registration workflow.

to_dict() dict[str, int | str][source]#

Return a JSON-serializable representation.

to_json(**json_kwargs: Any) str[source]#

Serialize the registration result to JSON.

class iceberg_bioimage.ScanResult(source_uri: str, format_family: str, image_assets: list[ImageAsset], warnings: list[str] = <factory>)[source]#

Canonical scan output shared across adapters and publishers.

to_dict() dict[str, Any][source]#

Return a JSON-serializable representation.

to_json(**json_kwargs: Any) str[source]#

Serialize the scan result to JSON.

class iceberg_bioimage.WarehouseIngestResult(catalog: str, namespace: list[str], image_assets_table: str, chunk_index_table: str | None, datasets: list[RegistrationResult], warnings: list[str] = <factory>)[source]#

Serializable result for a multi-dataset warehouse ingestion workflow.

property chunk_rows_published: int#

Return the total number of published chunk-index rows.

property dataset_count: int#

Return the number of ingested datasets.

property image_assets_rows_published: int#

Return the total number of published image-assets rows.

to_dict() dict[str, Any][source]#

Return a JSON-serializable representation.

to_json(**json_kwargs: Any) str[source]#

Serialize the warehouse ingestion result to JSON.

class iceberg_bioimage.WarehouseManifest(warehouse_root: str, tables: list[WarehouseTableManifestEntry] = <factory>)[source]#

Serializable manifest describing tables stored in a warehouse root.

to_dict() dict[str, Any][source]#

Return a JSON-serializable representation.

to_json(**json_kwargs: Any) str[source]#

Serialize the warehouse manifest to JSON.

class iceberg_bioimage.WarehouseTableManifestEntry(table_name: str, role: str, format: str = 'parquet', join_keys: list[str] = <factory>, source_type: str | None = None, source_ref: str | None = None, row_count: int | None = None, columns: list[str] = <factory>)[source]#

Serializable metadata for one table in a warehouse manifest.

to_dict() dict[str, Any][source]#

Return a JSON-serializable representation.

class iceberg_bioimage.WarehouseValidationResult(warehouse_root: str, errors: list[str] = <factory>, warnings: list[str] = <factory>)[source]#

Serializable result for validating a warehouse manifest and layout.

property is_valid: bool#

Return whether the warehouse passed validation.

to_dict() dict[str, Any][source]#

Return a JSON-serializable representation.

to_json(**json_kwargs: Any) str[source]#

Serialize the warehouse validation result to JSON.

iceberg_bioimage.catalog_table_to_arrow(catalog: str | SupportsScanCatalog, namespace: str | Sequence[str], table_name: str, *, scan_options: CatalogScanOptions | None = None) Table[source]#

Load a catalog table into Arrow via PyIceberg.

iceberg_bioimage.create_duckdb_connection(database: str = ':memory:', *, read_only: bool = False) DuckDBPyConnection[source]#

Create a DuckDB connection.

DuckDB is optional for this project. This helper isolates the import so the core package remains engine-neutral unless the user explicitly opts in.

iceberg_bioimage.create_ome_arrow(data: Any, **kwargs: Any) object[source]#

Create an ome_arrow.OMEArrow object when the optional extra is installed.

iceberg_bioimage.export_catalog_to_cytomining_warehouse(catalog: str | SupportsScanCatalog, namespace: str | tuple[str, ...], warehouse_root: str | Path, *, profiles: str | Path | Table | list[dict[str, object]] | None = None, image_assets_table_name: str = 'image_assets', chunk_index_table_name: str | None = 'chunk_index', joined_table_name: str = 'joined_profiles', profile_dataset_id: str | None = None, mode: Literal['overwrite', 'append'] = 'overwrite') CytominingWarehouseResult[source]#

Materialize catalog-backed metadata into a Parquet Cytomining warehouse.

iceberg_bioimage.export_profiles_to_cytomining_warehouse(profiles: str | Path | Table | list[dict[str, object]], warehouse_root: str | Path, *, table_name: str = 'profiles', role: str = 'profiles', profile_dataset_id: str | None = None, join_keys: list[str] | None = None, source_type: str = 'profiles', source_ref: str | None = None, alias_map: Mapping[str, tuple[str, ...] | list[str]] | None = None, mode: Literal['overwrite', 'append'] = 'append') CytominingWarehouseResult[source]#

Write a Cytomining profile table into a Parquet-backed warehouse root.

iceberg_bioimage.export_scan_result_to_cytomining_warehouse(scan_result: ScanResult, warehouse_root: str | Path, *, profiles: str | Path | Table | list[dict[str, object]] | None = None, include_chunks: bool = True, image_assets_table_name: str = 'image_assets', chunk_index_table_name: str = 'chunk_index', joined_table_name: str = 'joined_profiles', profile_dataset_id: str | None = None, mode: Literal['overwrite', 'append'] = 'overwrite') CytominingWarehouseResult[source]#

Write scan-derived metadata into a Parquet-backed Cytomining warehouse.

iceberg_bioimage.export_store_to_cytomining_warehouse(uri: str, warehouse_root: str | Path, *, profiles: str | Path | Table | list[dict[str, object]] | None = None, include_chunks: bool = True, image_assets_table_name: str = 'image_assets', chunk_index_table_name: str = 'chunk_index', joined_table_name: str = 'joined_profiles', profile_dataset_id: str | None = None, mode: Literal['overwrite', 'append'] = 'overwrite') CytominingWarehouseResult[source]#

Scan a store and export its metadata into a Cytomining warehouse.

iceberg_bioimage.export_table_to_cytomining_warehouse(table: Table, warehouse_root: str | Path, *, table_name: str, role: str, join_keys: list[str] | None = None, source_type: str | None = None, source_ref: str | None = None, mode: Literal['overwrite', 'append'] = 'append') CytominingWarehouseResult[source]#

Write a generic table into a warehouse root and update the manifest.

iceberg_bioimage.ingest_scan_results_to_warehouse(scan_results: Sequence[ScanResult], catalog: str | SupportsCatalog, namespace: str | Sequence[str], *, image_assets_table: str = 'image_assets', chunk_index_table: str | None = 'chunk_index') WarehouseIngestResult[source]#

Publish many scanned datasets into a CytoTable-compatible warehouse.

iceberg_bioimage.ingest_stores_to_warehouse(uris: Sequence[str], catalog: str | SupportsCatalog, namespace: str | Sequence[str], *, image_assets_table: str = 'image_assets', chunk_index_table: str | None = 'chunk_index') WarehouseIngestResult[source]#

Scan and publish many datasets into a CytoTable-compatible warehouse.

iceberg_bioimage.join_catalog_image_assets_with_profiles(catalog: str | SupportsScanCatalog, namespace: str | Sequence[str], profiles: str | Path | Table | list[dict[str, object]], *, image_assets_table: str = 'image_assets', chunk_index_table: str | None = None, join_keys: Sequence[str] = DEFAULT_JOIN_KEYS, image_assets_scan_options: CatalogScanOptions | None = None, chunk_index_scan_options: CatalogScanOptions | None = None, profile_dataset_id: str | None = None) Table[source]#

Join catalog-backed image metadata to a profile table.

Parameters:
  • catalog – Catalog name or catalog-like object.

  • namespace – Namespace containing the metadata tables.

  • profiles – Profile rows or table to join against.

  • image_assets_table – Name of the canonical image-assets table.

  • chunk_index_table – Optional chunk-index table name.

  • join_keys – Join columns shared by image metadata and profiles.

  • image_assets_scan_options – Optional scan options for image-assets reads.

  • chunk_index_scan_options – Optional scan options for chunk-index reads.

  • profile_dataset_id – Dataset identifier to inject for profile inputs that do not carry their own dataset_id column. Defaults to None.

iceberg_bioimage.join_image_assets_with_profiles(image_assets: MetadataSource, profiles: MetadataSource, *, join_keys: Sequence[str] = DEFAULT_JOIN_KEYS, chunk_index: MetadataSource | None = None, connection: DuckDBPyConnection | None = None, profile_dataset_id: str | None = None) pa.Table[source]#

Join image metadata to a profile table using the canonical join keys.

If profile_dataset_id is provided, that value is used to populate the profile-side dataset_id when the profile input lacks one. When it is None, the profile input is expected to carry dataset_id already.

iceberg_bioimage.join_profiles_with_scan_result(scan_result: ScanResult, profiles: str | Path | Table | list[dict[str, object]], *, include_chunks: bool = False, profile_dataset_id: str | None = None) Table[source]#

Join canonical image assets from a scan result to profile rows.

This helper uses the optional DuckDB integration at runtime. Install the duckdb extra/group before calling it.

iceberg_bioimage.join_profiles_with_store(uri: str, profiles: str | Path | Table | list[dict[str, object]], *, include_chunks: bool = False, profile_dataset_id: str | None = None) Table[source]#

Scan a store and join its canonical image assets to profile rows.

This helper uses the optional DuckDB integration at runtime. Install the duckdb extra/group before calling it.

iceberg_bioimage.list_catalog_tables(catalog: str | SupportsScanCatalog, namespace: str | Sequence[str]) list[str][source]#

List canonical metadata tables available in a catalog namespace.

iceberg_bioimage.load_catalog_table(catalog: str | SupportsScanCatalog, namespace: str | Sequence[str], table_name: str) SupportsIcebergTable[source]#

Load a canonical metadata table from a catalog.

iceberg_bioimage.load_profile_column_aliases(path: str | Path) dict[str, tuple[str, ...]][source]#

Load microscopy profile column aliases from a TOML file.

iceberg_bioimage.load_warehouse_manifest(warehouse_root: str | Path) WarehouseManifest[source]#

Load a warehouse manifest if present, otherwise return an empty manifest.

iceberg_bioimage.publish_chunk_index(catalog: str | SupportsCatalog, namespace: str | Iterable[str], table_name: str, scan_result: ScanResult) int[source]#

Publish derived chunk metadata into the canonical chunk_index table.

iceberg_bioimage.publish_image_assets(catalog: str | SupportsCatalog, namespace: str | Iterable[str], table_name: str, scan_result: ScanResult) int[source]#

Publish a scan result into the canonical image_assets Iceberg table.

iceberg_bioimage.query_metadata_table(source: MetadataSource, *, columns: Sequence[str] | None = None, filters: Sequence[FilterClause] | None = None, connection: DuckDBPyConnection | None = None) pa.Table[source]#

Query a metadata table from a Parquet path, Arrow table, or row list.

iceberg_bioimage.register_store(uri: str, catalog: str | SupportsCatalog, namespace: str | Sequence[str], *, image_assets_table: str = 'image_assets', chunk_index_table: str | None = 'chunk_index') RegistrationResult[source]#

Scan a store and publish canonical metadata tables.

iceberg_bioimage.scan_ome_arrow(data: str, **kwargs: Any) object[source]#

Create a lazy ome_arrow.OMEArrow scan plan for tabular image sources.

iceberg_bioimage.scan_store(uri: str) ScanResult[source]#

Scan a supported image store and return canonical metadata.

iceberg_bioimage.summarize_scan_result(scan_result: ScanResult) DatasetSummary[source]#

Build a concise user-facing summary from a scan result.

iceberg_bioimage.summarize_store(uri: str) DatasetSummary[source]#

Scan a store and return a concise dataset summary.

iceberg_bioimage.validate_microscopy_profile_columns(columns: list[str] | tuple[str, ...], *, target: str = 'profile_table', alias_map: Mapping[str, tuple[str, ...] | list[str]] | None = None) ContractValidationResult[source]#

Validate a schema against the microscopy join contract.

iceberg_bioimage.validate_microscopy_profile_table(path: str) ContractValidationResult[source]#

Validate a local profile table file against the microscopy join contract.

iceberg_bioimage.validate_warehouse_manifest(path: str | Path) WarehouseValidationResult[source]#

Validate a manifest-backed warehouse root.

Scan API#

Public API entry points.

iceberg_bioimage.api.ingest_scan_results_to_warehouse(scan_results: Sequence[ScanResult], catalog: str | SupportsCatalog, namespace: str | Sequence[str], *, image_assets_table: str = 'image_assets', chunk_index_table: str | None = 'chunk_index') WarehouseIngestResult[source]#

Publish many scanned datasets into a CytoTable-compatible warehouse.

iceberg_bioimage.api.ingest_stores_to_warehouse(uris: Sequence[str], catalog: str | SupportsCatalog, namespace: str | Sequence[str], *, image_assets_table: str = 'image_assets', chunk_index_table: str | None = 'chunk_index') WarehouseIngestResult[source]#

Scan and publish many datasets into a CytoTable-compatible warehouse.

iceberg_bioimage.api.join_profiles_with_scan_result(scan_result: ScanResult, profiles: str | Path | Table | list[dict[str, object]], *, include_chunks: bool = False, profile_dataset_id: str | None = None) Table[source]#

Join canonical image assets from a scan result to profile rows.

This helper uses the optional DuckDB integration at runtime. Install the duckdb extra/group before calling it.

iceberg_bioimage.api.join_profiles_with_store(uri: str, profiles: str | Path | Table | list[dict[str, object]], *, include_chunks: bool = False, profile_dataset_id: str | None = None) Table[source]#

Scan a store and join its canonical image assets to profile rows.

This helper uses the optional DuckDB integration at runtime. Install the duckdb extra/group before calling it.

iceberg_bioimage.api.register_store(uri: str, catalog: str | SupportsCatalog, namespace: str | Sequence[str], *, image_assets_table: str = 'image_assets', chunk_index_table: str | None = 'chunk_index') RegistrationResult[source]#

Scan a store and publish canonical metadata tables.

iceberg_bioimage.api.scan_store(uri: str) ScanResult[source]#

Scan a supported image store and return canonical metadata.

iceberg_bioimage.api.summarize_scan_result(scan_result: ScanResult) DatasetSummary[source]#

Build a concise user-facing summary from a scan result.

iceberg_bioimage.api.summarize_store(uri: str) DatasetSummary[source]#

Scan a store and return a concise dataset summary.

Models#

Serializable canonical scan models.

class iceberg_bioimage.models.scan_result.ContractValidationResult(target: str, present_columns: list[str], required_columns: list[str], recommended_columns: list[str], missing_required_columns: list[str], missing_recommended_columns: list[str], warnings: list[str] = <factory>)[source]#

Serializable result for schema-level contract validation.

property is_valid: bool#

Return whether all required columns are present.

to_dict() dict[str, Any][source]#

Return a JSON-serializable representation.

to_json(**json_kwargs: Any) str[source]#

Serialize the validation result to JSON.

class iceberg_bioimage.models.scan_result.CytominingWarehouseResult(warehouse_root: str, tables_written: list[str], row_counts: dict[str, int], manifest_path: str | None = None)[source]#

Serializable result for exporting Parquet-backed Cytomining warehouses.

to_dict() dict[str, Any][source]#

Return a JSON-serializable representation.

to_json(**json_kwargs: Any) str[source]#

Serialize the Cytomining warehouse export result to JSON.

class iceberg_bioimage.models.scan_result.DatasetSummary(source_uri: str, format_family: str, image_asset_count: int, chunked_asset_count: int, array_paths: list[str], dtypes: list[str], shapes: list[list[int]], axes: list[str], channel_counts: list[int], storage_variants: list[str], warnings: list[str] = <factory>)[source]#

User-facing summary of a scanned dataset.

to_dict() dict[str, Any][source]#

Return a JSON-serializable representation.

to_json(**json_kwargs: Any) str[source]#

Serialize the dataset summary to JSON.

class iceberg_bioimage.models.scan_result.ImageAsset(uri: str, shape: list[int], dtype: str, array_path: str | None = None, chunk_shape: list[int] | None = None, metadata: dict[str, Any] = <factory>, image_id: str | None = None)[source]#

Canonical representation of one discovered image asset.

to_dict() dict[str, Any][source]#

Return a JSON-serializable representation.

class iceberg_bioimage.models.scan_result.RegistrationResult(source_uri: str, image_assets_rows_published: int, chunk_rows_published: int)[source]#

Serializable result for a metadata registration workflow.

to_dict() dict[str, int | str][source]#

Return a JSON-serializable representation.

to_json(**json_kwargs: Any) str[source]#

Serialize the registration result to JSON.

class iceberg_bioimage.models.scan_result.ScanResult(source_uri: str, format_family: str, image_assets: list[ImageAsset], warnings: list[str] = <factory>)[source]#

Canonical scan output shared across adapters and publishers.

to_dict() dict[str, Any][source]#

Return a JSON-serializable representation.

to_json(**json_kwargs: Any) str[source]#

Serialize the scan result to JSON.

class iceberg_bioimage.models.scan_result.WarehouseIngestResult(catalog: str, namespace: list[str], image_assets_table: str, chunk_index_table: str | None, datasets: list[RegistrationResult], warnings: list[str] = <factory>)[source]#

Serializable result for a multi-dataset warehouse ingestion workflow.

property chunk_rows_published: int#

Return the total number of published chunk-index rows.

property dataset_count: int#

Return the number of ingested datasets.

property image_assets_rows_published: int#

Return the total number of published image-assets rows.

to_dict() dict[str, Any][source]#

Return a JSON-serializable representation.

to_json(**json_kwargs: Any) str[source]#

Serialize the warehouse ingestion result to JSON.

class iceberg_bioimage.models.scan_result.WarehouseManifest(warehouse_root: str, tables: list[WarehouseTableManifestEntry] = <factory>)[source]#

Serializable manifest describing tables stored in a warehouse root.

to_dict() dict[str, Any][source]#

Return a JSON-serializable representation.

to_json(**json_kwargs: Any) str[source]#

Serialize the warehouse manifest to JSON.

class iceberg_bioimage.models.scan_result.WarehouseTableManifestEntry(table_name: str, role: str, format: str = 'parquet', join_keys: list[str] = <factory>, source_type: str | None = None, source_ref: str | None = None, row_count: int | None = None, columns: list[str] = <factory>)[source]#

Serializable metadata for one table in a warehouse manifest.

to_dict() dict[str, Any][source]#

Return a JSON-serializable representation.

class iceberg_bioimage.models.scan_result.WarehouseValidationResult(warehouse_root: str, errors: list[str] = <factory>, warnings: list[str] = <factory>)[source]#

Serializable result for validating a warehouse manifest and layout.

property is_valid: bool#

Return whether the warehouse passed validation.

to_dict() dict[str, Any][source]#

Return a JSON-serializable representation.

to_json(**json_kwargs: Any) str[source]#

Serialize the warehouse validation result to JSON.

Publishing#

Image asset publishing helpers.

class iceberg_bioimage.publishing.image_assets.SupportsAppend(*args, **kwargs)[source]#

Protocol for appendable Iceberg-like tables.

append(table: Table) None[source]#

Append a pyarrow table.

class iceberg_bioimage.publishing.image_assets.SupportsCatalog(*args, **kwargs)[source]#

Protocol for catalog objects used by the publishing layer.

create_table(identifier: tuple[str, ...], schema: object) SupportsAppend[source]#

Create and return a table.

load_table(identifier: tuple[str, ...]) SupportsAppend[source]#

Load an existing table.

class iceberg_bioimage.publishing.image_assets.SupportsLoadTable(*args, **kwargs)[source]#

Protocol for catalog objects that can load existing tables.

load_table(identifier: tuple[str, ...]) TTable[source]#

Load an existing table.

iceberg_bioimage.publishing.image_assets.publish_image_assets(catalog: str | SupportsCatalog, namespace: str | Iterable[str], table_name: str, scan_result: ScanResult) int[source]#

Publish a scan result into the canonical image_assets Iceberg table.

iceberg_bioimage.publishing.image_assets.scan_result_to_rows(scan_result: ScanResult) list[dict[str, object]][source]#

Convert a scan result into canonical image_assets rows.

Chunk index publishing helpers.

iceberg_bioimage.publishing.chunk_index.publish_chunk_index(catalog: str | SupportsCatalog, namespace: str | Iterable[str], table_name: str, scan_result: ScanResult) int[source]#

Publish derived chunk metadata into the canonical chunk_index table.

iceberg_bioimage.publishing.chunk_index.scan_result_to_chunk_rows(scan_result: ScanResult) list[dict[str, object]][source]#

Convert a scan result into canonical chunk_index rows.

Validation#

Validation helpers for canonical scan objects and join contracts.

iceberg_bioimage.validation.contracts.load_profile_column_aliases(path: str | Path) dict[str, tuple[str, ...]][source]#

Load microscopy profile column aliases from a TOML file.

iceberg_bioimage.validation.contracts.profile_column_aliases() Mapping[str, tuple[str, ...]][source]#

Return the supported microscopy profile column aliases.

iceberg_bioimage.validation.contracts.raise_for_invalid_scan_result(scan_result: ScanResult) None[source]#

Raise a ValueError when a scan result is invalid.

iceberg_bioimage.validation.contracts.resolve_microscopy_profile_columns(columns: list[str] | tuple[str, ...], *, alias_map: Mapping[str, tuple[str, ...] | list[str]] | None = None) dict[str, str | None][source]#

Resolve canonical microscopy columns from a schema with known aliases.

iceberg_bioimage.validation.contracts.validate_microscopy_profile_columns(columns: list[str] | tuple[str, ...], *, target: str = 'profile_table', alias_map: Mapping[str, tuple[str, ...] | list[str]] | None = None) ContractValidationResult[source]#

Validate a schema against the microscopy join contract.

iceberg_bioimage.validation.contracts.validate_microscopy_profile_table(path: str) ContractValidationResult[source]#

Validate a local profile table file against the microscopy join contract.

iceberg_bioimage.validation.contracts.validate_scan_result(scan_result: ScanResult) list[str][source]#

Return validation errors for a scan result.

iceberg_bioimage.validation.contracts.validate_warehouse_manifest(path: str | Path) WarehouseValidationResult[source]#

Validate a manifest-backed warehouse root.

Optional DuckDB Integration#

Optional DuckDB query helpers for canonical metadata tables.

iceberg_bioimage.integrations.duckdb.create_duckdb_connection(database: str = ':memory:', *, read_only: bool = False) DuckDBPyConnection[source]#

Create a DuckDB connection.

DuckDB is optional for this project. This helper isolates the import so the core package remains engine-neutral unless the user explicitly opts in.

iceberg_bioimage.integrations.duckdb.join_image_assets_with_profiles(image_assets: MetadataSource, profiles: MetadataSource, *, join_keys: Sequence[str] = DEFAULT_JOIN_KEYS, chunk_index: MetadataSource | None = None, connection: DuckDBPyConnection | None = None, profile_dataset_id: str | None = None) pa.Table[source]#

Join image metadata to a profile table using the canonical join keys.

If profile_dataset_id is provided, that value is used to populate the profile-side dataset_id when the profile input lacks one. When it is None, the profile input is expected to carry dataset_id already.

iceberg_bioimage.integrations.duckdb.query_metadata_table(source: MetadataSource, *, columns: Sequence[str] | None = None, filters: Sequence[FilterClause] | None = None, connection: DuckDBPyConnection | None = None) pa.Table[source]#

Query a metadata table from a Parquet path, Arrow table, or row list.

Catalog-facing helpers for reading canonical Iceberg metadata tables.

class iceberg_bioimage.integrations.catalog.CatalogScanOptions(columns: Sequence[str] | None = None, where: str | None = None, snapshot_id: int | None = None, limit: int | None = None)[source]#

Options for scanning a catalog-backed metadata table.

class iceberg_bioimage.integrations.catalog.SupportsIcebergScan(*args, **kwargs)[source]#

Protocol for pyiceberg scan objects.

to_arrow() Table[source]#

Materialize the scan as an Arrow table.

class iceberg_bioimage.integrations.catalog.SupportsIcebergTable(*args, **kwargs)[source]#

Protocol for pyiceberg table objects.

scan(row_filter: str = 'True', selected_fields: tuple[str, ...] = ('*',), case_sensitive: bool = True, snapshot_id: int | None = None, limit: int | None = None) SupportsIcebergScan[source]#

Return a scan object for the current table.

class iceberg_bioimage.integrations.catalog.SupportsScanCatalog(*args, **kwargs)[source]#

Protocol for catalogs used by the read-only integration helpers.

list_tables(namespace: tuple[str, ...]) list[tuple[str, ...]][source]#

List tables within a namespace.

load_table(identifier: tuple[str, ...]) SupportsIcebergTable[source]#

Load an existing Iceberg table.

iceberg_bioimage.integrations.catalog.catalog_table_to_arrow(catalog: str | SupportsScanCatalog, namespace: str | Sequence[str], table_name: str, *, scan_options: CatalogScanOptions | None = None) Table[source]#

Load a catalog table into Arrow via PyIceberg.

iceberg_bioimage.integrations.catalog.join_catalog_image_assets_with_profiles(catalog: str | SupportsScanCatalog, namespace: str | Sequence[str], profiles: str | Path | Table | list[dict[str, object]], *, image_assets_table: str = 'image_assets', chunk_index_table: str | None = None, join_keys: Sequence[str] = DEFAULT_JOIN_KEYS, image_assets_scan_options: CatalogScanOptions | None = None, chunk_index_scan_options: CatalogScanOptions | None = None, profile_dataset_id: str | None = None) Table[source]#

Join catalog-backed image metadata to a profile table.

Parameters:
  • catalog – Catalog name or catalog-like object.

  • namespace – Namespace containing the metadata tables.

  • profiles – Profile rows or table to join against.

  • image_assets_table – Name of the canonical image-assets table.

  • chunk_index_table – Optional chunk-index table name.

  • join_keys – Join columns shared by image metadata and profiles.

  • image_assets_scan_options – Optional scan options for image-assets reads.

  • chunk_index_scan_options – Optional scan options for chunk-index reads.

  • profile_dataset_id – Dataset identifier to inject for profile inputs that do not carry their own dataset_id column. Defaults to None.

iceberg_bioimage.integrations.catalog.list_catalog_tables(catalog: str | SupportsScanCatalog, namespace: str | Sequence[str]) list[str][source]#

List canonical metadata tables available in a catalog namespace.

iceberg_bioimage.integrations.catalog.load_catalog_table(catalog: str | SupportsScanCatalog, namespace: str | Sequence[str], table_name: str) SupportsIcebergTable[source]#

Load a canonical metadata table from a catalog.

Helpers for exporting Parquet-backed Cytomining warehouse layouts.

iceberg_bioimage.integrations.cytomining.export_catalog_to_cytomining_warehouse(catalog: str | SupportsScanCatalog, namespace: str | tuple[str, ...], warehouse_root: str | Path, *, profiles: str | Path | Table | list[dict[str, object]] | None = None, image_assets_table_name: str = 'image_assets', chunk_index_table_name: str | None = 'chunk_index', joined_table_name: str = 'joined_profiles', profile_dataset_id: str | None = None, mode: Literal['overwrite', 'append'] = 'overwrite') CytominingWarehouseResult[source]#

Materialize catalog-backed metadata into a Parquet Cytomining warehouse.

iceberg_bioimage.integrations.cytomining.export_profiles_to_cytomining_warehouse(profiles: str | Path | Table | list[dict[str, object]], warehouse_root: str | Path, *, table_name: str = 'profiles', role: str = 'profiles', profile_dataset_id: str | None = None, join_keys: list[str] | None = None, source_type: str = 'profiles', source_ref: str | None = None, alias_map: Mapping[str, tuple[str, ...] | list[str]] | None = None, mode: Literal['overwrite', 'append'] = 'append') CytominingWarehouseResult[source]#

Write a Cytomining profile table into a Parquet-backed warehouse root.

iceberg_bioimage.integrations.cytomining.export_scan_result_to_cytomining_warehouse(scan_result: ScanResult, warehouse_root: str | Path, *, profiles: str | Path | Table | list[dict[str, object]] | None = None, include_chunks: bool = True, image_assets_table_name: str = 'image_assets', chunk_index_table_name: str = 'chunk_index', joined_table_name: str = 'joined_profiles', profile_dataset_id: str | None = None, mode: Literal['overwrite', 'append'] = 'overwrite') CytominingWarehouseResult[source]#

Write scan-derived metadata into a Parquet-backed Cytomining warehouse.

iceberg_bioimage.integrations.cytomining.export_store_to_cytomining_warehouse(uri: str, warehouse_root: str | Path, *, profiles: str | Path | Table | list[dict[str, object]] | None = None, include_chunks: bool = True, image_assets_table_name: str = 'image_assets', chunk_index_table_name: str = 'chunk_index', joined_table_name: str = 'joined_profiles', profile_dataset_id: str | None = None, mode: Literal['overwrite', 'append'] = 'overwrite') CytominingWarehouseResult[source]#

Scan a store and export its metadata into a Cytomining warehouse.

iceberg_bioimage.integrations.cytomining.export_table_to_cytomining_warehouse(table: Table, warehouse_root: str | Path, *, table_name: str, role: str, join_keys: list[str] | None = None, source_type: str | None = None, source_ref: str | None = None, mode: Literal['overwrite', 'append'] = 'append') CytominingWarehouseResult[source]#

Write a generic table into a warehouse root and update the manifest.

iceberg_bioimage.integrations.cytomining.load_warehouse_manifest(warehouse_root: str | Path) WarehouseManifest[source]#

Load a warehouse manifest if present, otherwise return an empty manifest.

Optional OME-Arrow Integration#

Optional OME-Arrow integration helpers.

iceberg_bioimage.integrations.ome_arrow.create_ome_arrow(data: Any, **kwargs: Any) object[source]#

Create an ome_arrow.OMEArrow object when the optional extra is installed.

iceberg_bioimage.integrations.ome_arrow.scan_ome_arrow(data: str, **kwargs: Any) object[source]#

Create a lazy ome_arrow.OMEArrow scan plan for tabular image sources.