Processing Functions

add_b_score(plate_df, measurement_header='Raw Optical Density', row_header='Row_384', col_header='Col_384')

Expects a DataFrame comprising a whole plate (without controls!). Computes B-scores via median polish followed by scaling with the median absolute deviation (MAD).

Source code in rda_toolbox/process.py
def add_b_score(
    plate_df: pd.DataFrame,
    measurement_header: str = "Raw Optical Density",
    row_header: str = "Row_384",
    col_header: str = "Col_384",
) -> pd.DataFrame:
    """
    Expects a DataFrame comprising a **whole** plate (without controls!).
    """
    # We could also collect iterations of the median polish function and plot the results to show progress of normalization
    plate_df = median_polish_df(plate_df)
    mad_value = median_absolute_deviation(plate_df[measurement_header], scale=1.4826)
    plate_df["b_scores"] = plate_df[measurement_header] / mad_value
    return plate_df.drop(
        columns=["row_effect", "col_effect", "row_median", "col_median", measurement_header]
        ).round({"b_scores": 2})
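
A minimal usage sketch, assuming the function is importable from rda_toolbox.process (the source location above) and using a synthetic full plate in place of real data:

import numpy as np
import pandas as pd
from rda_toolbox.process import add_b_score  # import path assumed from the source location above

# Hypothetical synthetic 384-well plate (rows A-P, columns 1-24), no control wells:
rng = np.random.default_rng(0)
plate = pd.DataFrame(
    [
        {"Row_384": row, "Col_384": col, "Raw Optical Density": rng.normal(0.5, 0.05)}
        for row in "ABCDEFGHIJKLMNOP"
        for col in range(1, 25)
    ]
)

scored = add_b_score(plate)  # adds a rounded 'b_scores' column; raw OD and median-polish columns are dropped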

background_normalize_zfactor(grp, substance_id, measurement, negative_controls, blanks, norm_by_barcode)

This function is supposed to be applied to a grouped DataFrame. It does the following operations:

  • Background subtraction by subtracting the mean of the blanks per plate
  • Normalization by applying max-normalization using the 'Negative Controls'
  • Z-Factor calculation using negative controls and blanks

negative_controls are controls with organism (e.g. bacteria) and medium and are labeled in the input DataFrame as 'Negative Controls'. blanks are controls with only medium and are labeled in the input DataFrame as 'Medium'.
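
The helpers max_normalization, zfactor, and zfactor_median are not shown on this page. For orientation, here are hedged sketches of the conventional definitions they most likely follow (treat the exact scaling as an assumption):

import pandas as pd

def max_normalization_sketch(value: float, negative_control_mean: float) -> float:
    # Assumed percentage scaling: a well's value relative to the plate's negative-control mean.
    return value / negative_control_mean * 100

def zfactor_sketch(neg_controls: pd.Series, blanks: pd.Series) -> float:
    # Conventional screening-window coefficient (Zhang et al., 1999):
    # Z = 1 - 3 * (sd_neg + sd_blank) / |mean_neg - mean_blank|
    return 1 - 3 * (neg_controls.std() + blanks.std()) / abs(
        neg_controls.mean() - blanks.mean()
    )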

Source code in rda_toolbox/process.py
def background_normalize_zfactor(
    grp: pd.DataFrame,
    substance_id,
    measurement,
    negative_controls,
    blanks,
    norm_by_barcode,
) -> pd.DataFrame:
    """
    This function is supposed to be applied to a grouped DataFrame.
    It does the following operations:
    - Background subtraction by subtracting the mean of the blanks per plate
    - Normalization by applying max-normalization using the 'Negative Controls'
    - Z-Factor calculation using negative controls and blanks

    *`negative_controls` are controls with organism (e.g. bacteria) and medium*
    *and are labeled in the input DataFrame as 'Negative Controls'.*
    *`blanks` are controls with only medium and are labeled*
    *in the input DataFrame as 'Medium'.*
    """

    plate_blanks_mean = grp[grp[substance_id] == blanks][f"Raw {measurement}"].mean()
    # Subtract background noise:
    grp[f"Denoised {measurement}"] = grp[f"Raw {measurement}"] - plate_blanks_mean
    plate_denoised_negative_mean = grp[grp[substance_id] == negative_controls][
        f"Denoised {measurement}"
    ].mean()
    plate_denoised_blank_mean = grp[grp[substance_id] == blanks][
        f"Denoised {measurement}"
    ].mean()
    # Normalize:
    grp[f"Relative {measurement}"] = grp[f"Denoised {measurement}"].apply(
        lambda x: max_normalization(x, plate_denoised_negative_mean)
    )
    # Z-Factor:
    plate_neg_controls = grp[grp[substance_id] == negative_controls][
        f"Raw {measurement}"
    ]
    plate_blank_controls = grp[grp[substance_id] == blanks][f"Raw {measurement}"]

    # Check inputs :)
    if (len(plate_neg_controls) == 0):
        raise KeyError("Please check if keyword 'negative_controls' is matching with input table.")
    elif (len(plate_blank_controls) == 0):
        raise KeyError("Please check if keyword 'blanks' is matching with input table.")

    grp["Z-Factor"] = zfactor(plate_neg_controls, plate_blank_controls)

    # Robust Z-Factor using median instead of mean:
    grp["Robust Z-Factor"] = zfactor_median(plate_neg_controls, plate_blank_controls)

    return grp
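
Since all three steps are computed per plate, the function is meant to be applied to each barcode group. The call below mirrors how preprocess() (further down on this page) uses it, assuming df is the merged reader/specification table:

normalized = (
    df.groupby("Barcode")[df.columns]
    .apply(
        lambda grp: background_normalize_zfactor(
            grp,
            substance_id="ID",
            measurement="Optical Density",
            negative_controls="Negative Control",
            blanks="Blank",
            norm_by_barcode="Barcode",
        )
    )
    .reset_index(drop=True)
)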

get_thresholded_subset(df, id_column='ID', negative_controls='Negative Control', blanks='Medium', blankplate_organism='Blank', threshold=None)

Expects a DataFrame with a mic_cutoff column, unless an explicit threshold is given.

Source code in rda_toolbox/process.py
def get_thresholded_subset(
    df: pd.DataFrame,
    id_column="ID",
    negative_controls: str = "Negative Control",
    blanks: str = "Medium",
    blankplate_organism: str = "Blank",
    threshold=None,
) -> pd.DataFrame:
    """
    Expects a DataFrame with a mic_cutoff column
    """
    # TODO: hardcode fewer columns

    # Use only substance entries, no controls, no blanks etc.:
    substance_df = df.loc[
        (df[id_column] != blanks)
        & (df[id_column] != negative_controls)
        & (df["Organism"] != blankplate_organism),
        :,
    ].copy()
    # Apply threshold:
    if threshold:
        substance_df["Cutoff"] = threshold
    else:
        if "mic_cutoff" not in substance_df:
            raise KeyError("No 'mic_cutoff' column in Input.xlsx")
        # Use the per-substance cutoff from the input table:
        substance_df["Cutoff"] = substance_df["mic_cutoff"]
    selection = substance_df[
        substance_df["Relative Optical Density"] < substance_df["Cutoff"]
    ]
    # Apply mean and std in case of replicates:
    result = selection.groupby([id_column, "Organism", "Dataset"], as_index=False).agg(
        {
            "Relative Optical Density": ["mean", "std"],
            id_column: ["first", "count"],
            "Organism": "first",
            "Cutoff": "first",
            "Dataset": "first",
        }
    )
    result.columns = [
        "Relative Optical Density mean",
        "Relative Optical Density std",
        id_column,
        "Replicates",
        "Organism",
        "Cutoff",
        "Dataset",
    ]
    return result
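
A minimal usage sketch, assuming processed_df comes from preprocess() and already contains 'Relative Optical Density', 'Organism', and 'Dataset' columns:

# Fixed cutoff of 50% relative optical density for every substance:
hits = get_thresholded_subset(processed_df, id_column="ID", threshold=50)

# Or rely on a per-substance 'mic_cutoff' column carried over from Input.xlsx:
hits = get_thresholded_subset(processed_df, id_column="ID")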

mic_results(df, filepath, thresholds=[20, 50])

Expects the results from the rda.preprocess() function. Averages measurements across replicates and determines the MIC per substance and organism. Saves a long-format Excel file across all datasets plus, per dataset and threshold, an Excel file of Minimum Inhibitory Concentrations (MICs) with one column per organism.

Source code in rda_toolbox/process.py
def mic_results(df, filepath, thresholds=[20, 50]):
    """
    Expects the results from the rda.preprocess() function.
    Averages measurements across replicates and determines the MIC per substance and organism.
    Saves a long-format Excel file across all datasets plus, per dataset and threshold,
    an Excel file with the Minimum Inhibitory Concentrations (MICs) at that threshold.
    """

    df = df[(df["Dataset"] != "Negative Control") & (df["Dataset"] != "Blank")].dropna(
        subset=["Concentration"]
    )
    # the above should remove entries where Concentration == NAN

    # Pivot table to get the aggregated values:
    pivot_df = pd.pivot_table(
        df,
        values=["Relative Optical Density", "Replicate", "Z-Factor"],
        index=[
            "Internal ID",
            "External ID",
            "Organism",
            "Concentration",
            "Dataset",
        ],
        aggfunc={
            "Relative Optical Density": ["mean"],
            "Replicate": ["count"],
            "Z-Factor": ["mean", "std"],  # does this make sense? with std its usable.
            # "Z-Factor": ["std"],
        },
    ).reset_index()

    # flatten the pandas hierarchical column index into plain strings
    pivot_df.columns = [" ".join(x).strip() for x in pivot_df.columns.ravel()]

    mic_records = []
    for group_names, grp in pivot_df.groupby(
        ["Internal ID", "External ID", "Organism", "Dataset"]
    ):
        internal_id, external_id, organism, dataset = group_names
        # Sort by concentration just to be sure:
        grp = grp[
            [
                "Concentration",
                "Relative Optical Density mean",
                "Z-Factor mean",
                "Z-Factor std",
            ]
        ].sort_values(by=["Concentration"])
        # print(grp)
        # Get rows where the OD is below the given threshold:
        record = {
            "Internal ID": internal_id,
            "External ID": external_id,
            "Organism": organism,
            "Dataset": dataset,
            "Z-Factor mean": list(grp["Z-Factor mean"])[0],
            "Z-Factor std": list(grp["Z-Factor std"])[0],
        }

        for threshold in thresholds:
            values_below_threshold = grp[
                grp["Relative Optical Density mean"] < threshold
            ]
            # thx to jonathan - check if the OD at maximum concentration is below threshold (instead of any concentration)
            max_conc_below_threshold = list(
                grp[grp["Concentration"] == max(grp["Concentration"])][
                    "Relative Optical Density mean"
                ]
                < threshold
            )[0]
            if not max_conc_below_threshold:
                mic = None
            else:
                mic = values_below_threshold.iloc[0]["Concentration"]
            record[f"MIC{threshold} in µM"] = mic
        mic_records.append(record)
    mic_df = pd.DataFrame.from_records(mic_records)
    # Optionally drop entries where no MIC could be determined (currently disabled):
    # mic_df.dropna(
    #     subset=[f"MIC{threshold} in µM" for threshold in thresholds],
    #     how="all",
    #     inplace=True,
    # )
    mic_df.round(2).to_excel(
        os.path.join(filepath, "MIC_Results_AllDatasets_longformat.xlsx"), index=False
    )
    for dataset, dataset_grp in mic_df.groupby(["Dataset"]):
        pivot_multiindex_df = pd.pivot_table(
            dataset_grp,
            values=[f"MIC{threshold} in µM" for threshold in thresholds]
            + ["Z-Factor mean", "Z-Factor std"],
            index=["Internal ID", "External ID", "Dataset"],
            columns="Organism",
        ).reset_index()

        resultpath = os.path.join(filepath, dataset[0])
        pathlib.Path(resultpath).mkdir(parents=True, exist_ok=True)

        # References special case
        if dataset[0] == "Reference":
            references_mic_results(df, resultpath, thresholds=thresholds)
            continue  # skip for references

        for threshold in thresholds:
            organisms_thresholded_mics = pivot_multiindex_df[
                ["Internal ID", "External ID", f"MIC{threshold} in µM"]
            ]
            cols = list(organisms_thresholded_mics.columns.droplevel())
            cols[0] = "Internal ID"
            cols[1] = "External ID"
            organisms_thresholded_mics.columns = cols
            organisms_thresholded_mics = organisms_thresholded_mics.sort_values(
                by=list(organisms_thresholded_mics.columns)[2:],
                na_position="last",
            )
            # organisms_thresholded_mics.dropna(
            #     subset=list(organisms_thresholded_mics.columns)[2:],
            #     how="all",
            #     inplace=True,
            # )
            organisms_thresholded_mics.fillna("NA", inplace=True)
            organisms_thresholded_mics.to_excel(
                os.path.join(resultpath, f"{dataset[0]}_MIC{threshold}_results.xlsx"),
                index=False,
            )
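
A minimal usage sketch, assuming processed_df is the output of preprocess() and the target directory already exists:

mic_results(processed_df, filepath="../data/results/", thresholds=[20, 50])
# Writes MIC_Results_AllDatasets_longformat.xlsx plus, per dataset and threshold,
# <Dataset>_MIC<threshold>_results.xlsx (references are handled by references_mic_results()).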

preprocess(df, substance_id='ID', measurement='Optical Density', negative_controls='Negative Control', blanks='Blank', norm_by_barcode='Barcode')

  • raw_df: raw reader data obtained with rda.readerfiles_rawdf()
  • input_df: input specifications table with required columns:
    • Dataset (with specified references as their own dataset 'Reference')
    • ID (substance_id) (with specified blanks and negative_controls)
    • Assay Transfer Barcode
    • Row_384 (or Row_96)
    • Col_384 (or Col_96)
    • Concentration
    • Replicate (specifying replicate number)
    • Organism (scientific organism name i.e. with strain)

Processing function which expects the merged raw reader data (raw_df) and input specifications table (input_df), then normalizes, calculates the Z-Factor per plate (norm_by_barcode), and rounds to sensible decimal places.
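
A hypothetical single row of the input specification table, using the required column names listed above (all values are illustrative only):

import pandas as pd

input_df = pd.DataFrame([{
    "Dataset": "Reference",
    "ID": "Ampicillin",
    "Assay Transfer Barcode": "AT001",
    "Row_384": "A",
    "Col_384": 1,
    "Concentration": 50.0,
    "Replicate": 1,
    "Organism": "Escherichia coli DSM 1116",
}])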

Source code in rda_toolbox/process.py
def preprocess(
    df: pd.DataFrame,  # mapped inputs
    substance_id: str = "ID",
    measurement: str = "Optical Density",
    negative_controls: str = "Negative Control",
    blanks: str = "Blank",
    norm_by_barcode="Barcode",
) -> pd.DataFrame:
    """
    - raw_df: raw reader data obtained with `rda.readerfiles_rawdf()`
    - input_df: input specifications table with required columns:
        - Dataset (with specified references as their own dataset 'Reference')
        - ID (substance_id) (with specified blanks and negative_controls)
        - Assay Transfer Barcode
        - Row_384 (or Row_96)
        - Col_384 (or Col_96)
        - Concentration
        - Replicate (specifying replicate number)
        - Organism (scientific organism name i.e. with strain)
    ---
    Processing function which expects the merged raw reader data (raw_df)
    and input specifications table (input_df), then
    normalizes, calculates the Z-Factor per plate (norm_by_barcode)
    and rounds to sensible decimal places.
    """
    # merging reader data and input specifications table
    # df = pd.merge(raw_df, input_df, how="outer")
    # df = df.groupby(norm_by_barcode)[df.columns].apply(
    #     lambda plate_grp: add_b_score(
    #         plate_grp[plate_grp[""]],
    #         # measurement_header="Raw Optical Density"
    #     )
    # )
    df[substance_id] = df[substance_id].astype(str)
    df = (
        df.groupby(norm_by_barcode)[df.columns]
        .apply(
            lambda grp: background_normalize_zfactor(
                grp,
                substance_id,
                measurement,
                negative_controls,
                blanks,
                norm_by_barcode,
            )
        )
        .reset_index(drop=True)
    )

    # df[substance_id] = df[substance_id].astype(str)

    # detect and report NA values (defined in input, not in raw data)
    orgs_w_missing_data = df[df[f"Raw {measurement}"].isna()]["Organism formatted"].unique()
    if orgs_w_missing_data.size > 0:
        print(
            f"""Processed data:
      Organisms with missing data, excluded from processed data: {orgs_w_missing_data}.
      If this is not intended, please check the Input.xlsx or if raw data files are complete.
              """
        )
        df = df.dropna(subset=[f"Raw {measurement}"])
    # Report missing
    # Remove missing from "processed" dataframe
    return df.round(
        {
            "Denoised Optical Density": 2,
            "Relative Optical Density": 2,
            "Z-Factor": 2,
            "Robust Z-Factor": 2,
            # "Concentration": 2,
        }
    )
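
A minimal end-to-end sketch, assuming the package is imported as rda and that raw_df (from rda.readerfiles_rawdf()) and input_df (the specification table) are already loaded:

import pandas as pd
import rda_toolbox as rda

# Merge raw reader data with the input specification table
# (mirrors the commented-out merge in the source above):
mapped = pd.merge(raw_df, input_df, how="outer")

processed = rda.preprocess(
    mapped,
    substance_id="ID",
    measurement="Optical Density",
    negative_controls="Negative Control",
    blanks="Blank",
    norm_by_barcode="Barcode",
)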

primary_results(df, substance_id, filepath='../data/results/', thresholds=[50])

Expects the results from the rda.preprocess() function.

Source code in rda_toolbox/process.py
def primary_results(
    df: pd.DataFrame,
    substance_id,
    filepath="../data/results/",
    thresholds: list[float] = [50],
):
    """
    Expects the results from rda.preprocess() function.
    """
    df = df[
        (df["Dataset"] != "Reference")
        & (df["Dataset"] != "Positive Control")
        & (df["Dataset"] != "Blank")
    ].dropna(subset=["Concentration"])

    pivot_df = pd.pivot_table(
        df,
        values=["Relative Optical Density", "Replicate", "Z-Factor"],
        index=[
            substance_id,
            "Organism",
            "Concentration",
            "Dataset",
        ],
        aggfunc={
            "Relative Optical Density": ["mean"],
            "Replicate": ["count"],
        },
    ).reset_index()
    pivot_df.columns = [" ".join(x).strip() for x in pivot_df.columns.ravel()]

    for threshold in thresholds:
        pivot_df[f"Relative Growth < {threshold}"] = pivot_df.groupby(
            [substance_id, "Organism", "Dataset"]
        )["Relative Optical Density mean"].transform(lambda x: x < threshold)

        for dataset, dataset_grp in pivot_df.groupby(["Dataset"]):
            dataset = dataset[0]
            resultpath = os.path.join(filepath, dataset)
            pathlib.Path(resultpath).mkdir(parents=True, exist_ok=True)

            print(
                "Saving",
                os.path.join(resultpath, f"{dataset}_all_results.xlsx"),
            )
            dataset_grp.to_excel(
                os.path.join(resultpath, f"{dataset}_all_results.xlsx"),
                index=False,
            )
            print(
                "Saving",
                os.path.join(resultpath, f"{dataset}_all_results.csv"),
            )
            dataset_grp.to_csv(
                os.path.join(resultpath, f"{dataset}_all_results.csv"),
                index=False,
            )

            pivot_multiindex_df = pd.pivot_table(
                dataset_grp,
                values=[f"Relative Optical Density mean"],
                index=[substance_id, "Dataset", "Concentration"],
                columns="Organism",
            ).reset_index()
            cols = list(pivot_multiindex_df.columns.droplevel())
            cols[:3] = list(map(lambda x: x[0], pivot_multiindex_df.columns[:3]))
            pivot_multiindex_df.columns = cols

            # Apply threshold (active in any organism)
            thresholded_pivot = pivot_multiindex_df.iloc[
                list(
                    pivot_multiindex_df.iloc[:, 3:].apply(
                        lambda x: any(list(map(lambda i: i < threshold, x))), axis=1
                    )
                )
            ]

            # Sort by columns each organism after the other
            # return pivot_multiindex_df.sort_values(by=cols[3:])

            # Sort rows by mean between the organisms (lowest mean activity first)
            results_sorted_by_mean_activity = thresholded_pivot.iloc[
                thresholded_pivot.iloc[:, 3:].mean(axis=1).argsort()
            ]
            print(
                "Saving",
                os.path.join(
                    resultpath, f"{dataset}_threshold{threshold}_results.xlsx"
                ),
            )
            results_sorted_by_mean_activity.to_excel(
                os.path.join(
                    resultpath, f"{dataset}_threshold{threshold}_results.xlsx"
                ),
                index=False,
            )
            print(
                "Saving",
                os.path.join(resultpath, f"{dataset}_threshold{threshold}_results.csv"),
            )
            results_sorted_by_mean_activity.to_csv(
                os.path.join(resultpath, f"{dataset}_threshold{threshold}_results.csv"),
                index=False,
            )
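
A minimal usage sketch, assuming processed_df is the output of preprocess():

primary_results(processed_df, substance_id="ID", filepath="../data/results/", thresholds=[50])
# Per dataset this writes <Dataset>_all_results.xlsx/.csv and
# <Dataset>_threshold50_results.xlsx/.csv, sorted by mean activity across organisms.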

references_mic_results(preprocessed_data, resultpath, thresholds=[20, 50])

This function saves an Excel file for the reference substances. Because reference substances are used multiple times, they share duplicate Internal IDs and would otherwise be averaged across those duplicates. To avoid this, this function determines the MIC for each reference per (AcD) plate instead of per Internal ID.

Source code in rda_toolbox/process.py
def references_mic_results(
    preprocessed_data,
    resultpath,
    thresholds=[20, 50],
):
    """
    This function saves an excel file for the reference substances.
    Since reference substances have duplicate Internal IDs
    (because they are used multiple times), they would otherwise be averaged across duplicates.
    To avoid this, this function determines the MIC
    for each reference **per (AcD) plate** instead of per Internal ID.
    """
    only_references = preprocessed_data[preprocessed_data["Dataset"] == "Reference"]
    mic_records = []
    for group_names, grp in only_references.groupby(
        [
            "Internal ID",
            "External ID",
            "Organism",
            "Dataset",
            "AcD Barcode 384",
        ]
    ):
        internal_id, external_id, organism, dataset, acd_barcode = group_names
        grp = grp.copy().sort_values(by=["Concentration"])
        record = {
            "Internal ID": internal_id,
            "External ID": external_id,
            "Organism": organism,
            "Dataset": dataset,
            "AcD Barcode 384": acd_barcode,
            "Z-Factor": list(grp["Z-Factor"])[0],
        }
        for threshold in thresholds:
            values_below_threshold = grp[grp["Relative Optical Density"] < threshold]
            # thx to jonathan - check if the OD at maximum concentration is below threshold (instead of any concentration)
            max_conc_below_threshold = list(
                grp[grp["Concentration"] == max(grp["Concentration"])][
                    "Relative Optical Density"
                ]
                < threshold
            )[0]
            if not max_conc_below_threshold:
                mic = None
            else:
                mic = values_below_threshold.iloc[0]["Concentration"]
            record[f"MIC{threshold} in µM"] = mic
        mic_records.append(record)
    mic_df = pd.DataFrame.from_records(mic_records)
    mic_df.sort_values(by=["External ID", "Organism"]).to_excel(
        os.path.join(resultpath, "References_MIC_results_eachRefID.xlsx"), index=False
    )
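
references_mic_results() is normally called from mic_results() for the 'Reference' dataset, but it can also be used directly. A minimal sketch, assuming processed_df comes from preprocess() and resultpath already exists:

references_mic_results(
    processed_df,
    resultpath="../data/results/Reference",
    thresholds=[20, 50],
)
# Writes References_MIC_results_eachRefID.xlsx with one MIC per reference,
# organism, and AcD Barcode 384 plate.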