Parsing Functions

collect_metadata(filedicts)

Helper function to collect the metadata from all reader files into a DataFrame.

Source code in rda_toolbox/parser.py
def collect_metadata(filedicts: list[dict]) -> pd.DataFrame:
    """
    Helperfunction to collect the metadata from all reader files into a dataframe.
    """
    allmetadata_df = pd.DataFrame()
    for filedict in filedicts:
        meta_df = pd.DataFrame(filedict["metadata"], index=[0])
        meta_df["Barcode"] = filedict["Barcode"]
        allmetadata_df = pd.concat([allmetadata_df, meta_df], ignore_index=True)
    return allmetadata_df
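
A minimal usage sketch (file names are hypothetical); the filedicts are typically produced by filepaths_to_filedicts:

```Python
from rda_toolbox.parser import collect_metadata, filepaths_to_filedicts

# Hypothetical Cytation C10 text exports
filedicts = filepaths_to_filedicts(["rawdata/plate_A.txt", "rawdata/plate_B.txt"])
metadata_df = collect_metadata(filedicts)  # one row per reader file, plus a "Barcode" column
```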

collect_results(filedicts)

Collect and merge results from the reader files.

Source code in rda_toolbox/parser.py
def collect_results(filedicts: list[dict]) -> pd.DataFrame:
    """
    Collect and merge results from the readerfiles.
    """
    allresults_df = pd.DataFrame(
        {"Row": [], "Column": [], "Raw Optical Density": []}
    )  # , "Layout": [], "Concentration": []})
    platetype_s = list(set(fd["plate_type"] for fd in filedicts))
    if len(platetype_s) == 1:
        platetype = platetype_s[0]
    else:
        raise Exception(f"Different plate types used {platetype_s}")

    for filedict in filedicts:
        # long_layout_df = get_long_df("Layout")
        # long_concentrations_df = get_long_df("Concentration")
        # long_rawdata_df = get_long_df("Raw Optical Density")

        long_rawdata_df = pd.melt(
            filedict["Raw Optical Density"].reset_index(names="Row"),
            id_vars=["Row"],
            var_name="Column",
            value_name="Raw Optical Density",
        )

        long_rawdata_df["Barcode"] = filedict["Barcode"]
        # df_merged = reduce(
        #     lambda  left,right: pd.merge(left,right,on=['Row', 'Column'], how='outer'),
        #     [long_rawdata_df, long_layout_df, long_concentrations_df]
        # )
        allresults_df = pd.concat([allresults_df, long_rawdata_df], axis=0)
        platetype = filedict["plate_type"]

    allresults_df.rename(
        columns={"Row": f"Row_{platetype}", "Column": f"Col_{platetype}"}, inplace=True
    )
    return allresults_df.reset_index(drop=True)
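
A minimal sketch of how this helper is typically used (hypothetical paths); note that all parsed files must share the same plate type, otherwise an exception is raised:

```Python
from rda_toolbox.parser import collect_results, filepaths_to_filedicts

filedicts = filepaths_to_filedicts(["rawdata/plate_A.txt", "rawdata/plate_B.txt"])
results_df = collect_results(filedicts)
# Long-format columns for 384-well files: Row_384, Col_384, Raw Optical Density, Barcode
```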

filepaths_to_filedicts(filepaths)

Wrapper function to obtain a list of dictionaries, each containing a raw file's information, such as:

  • different entries of metadata
    • Plate Type
    • Barcode
    • Date
    • Time
    • etc.
  • Raw Optical Density (DataFrame)
  • Concentration (DataFrame)
  • Layout (DataFrame)
Source code in rda_toolbox/parser.py
def filepaths_to_filedicts(filepaths: list[str]) -> list[dict]:
    """
    Wrapper function to obtain a list of dictionaries, each containing a raw file's information, such as:

    - different entries of metadata
        - Plate Type
        - Barcode
        - Date
        - Time
        - etc.
    - Raw Optical Density (DataFrame)
    - Concentration (DataFrame)
    - Layout (DataFrame)
    """
    filedicts = []
    for path in filepaths:
        with open(path) as file:
            contents = StringIO(file.read())
        filedicts.append(readerfile_parser(basename(path), contents))
    return filedicts
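
A minimal sketch, assuming the reader exports sit in a hypothetical rawdata/ folder:

```Python
import glob
from rda_toolbox.parser import filepaths_to_filedicts

filedicts = filepaths_to_filedicts(sorted(glob.glob("rawdata/*.txt")))
first = filedicts[0]
first["metadata"]              # dict of metadata entries (Plate Type, Date, Time, ...)
first["Raw Optical Density"]   # DataFrame of raw optical densities
```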

parse_mappingfile(filepath, motherplate_column='Origin Plate', childplate_column='AcD Barcode 384')

Simple mapping file parser. Expects the file to start with a motherplate barcode line, followed by its corresponding childplate barcodes on a single line.

Source code in rda_toolbox/parser.py
def parse_mappingfile(
    filepath: str,
    motherplate_column: str = "Origin Plate",
    childplate_column: str = "AcD Barcode 384",
):
    """
    Simple mappingfile parser function.
    Expects to start with a "Motherplate" line followed by corresponding "Childplates" in a single line.
    """
    filedict = dict()
    with open(filepath) as file:
        filecontents = file.read().splitlines()
        key = None
        for i, line in enumerate(filecontents):
            line = line.split(";")
            if i % 2 == 0:  # if i is even (expect MPs on even lines, alternating with childplates)
            # if len(line) == 1:
                key = line[0]
            else:
                if not key:
                    raise ValueError(
                        "Motherplate barcode expected on first line."
                    )
                if key in filedict:
                    filedict[key].append(line)
                else:
                    filedict[key] = [line]
    mapping_df = pd.DataFrame(
        [
            (motherplate, childplate, rep_num, rack_nr)
            for motherplate, replicates in filedict.items()
            for rep_num, childplates in enumerate(replicates, start=1)
            for rack_nr, childplate in enumerate(childplates, start=1)
        ],
        columns=[motherplate_column, childplate_column, "Replicate", "Rack"],
    )
    return mapping_df
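
A minimal sketch with made-up barcodes. The expected file alternates a motherplate barcode line with a single semicolon-separated line of its childplate barcodes, one pair per replicate:

```Python
from rda_toolbox.parser import parse_mappingfile

# Hypothetical "mapping.csv":
# MP001
# CP001;CP002;CP003
# MP001
# CP004;CP005;CP006
mapping_df = parse_mappingfile(
    "mapping.csv",
    motherplate_column="Origin Plate",
    childplate_column="AcD Barcode 384",
)
# Columns: Origin Plate, AcD Barcode 384, Replicate, Rack
```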

parse_readerfiles(path)

Reads Cytation C10 reader files (plain text files) and merges the results into a single DataFrame. Wrapper around readerfiles_rawdf to keep backwards compatibility; improves on it by accepting a single directory path for convenience.

Source code in rda_toolbox/parser.py
def parse_readerfiles(path: str | None) -> pd.DataFrame | None:
    """
    Reads CytationC10 readerfiles (plain text files) and merges the results into a DataFrame which is returned.
    Wrapper for readerfiles_rawdf to keep backwards compatibility.
    Improves readerfiles_rawdf, provide a single path for convenience.
    """
    if not path:
        return None
    paths = [
            os.path.join(path, f)
            for f in os.listdir(path)
            if os.path.isfile(os.path.join(path, f))
    ]
    df = readerfiles_rawdf(paths)
    df["Col_384"] = df["Col_384"].astype(int)
    return df
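
A minimal sketch; rawdata/ is a hypothetical directory containing the Cytation C10 text exports:

```Python
from rda_toolbox.parser import parse_readerfiles

raw_df = parse_readerfiles("rawdata")  # parses every file directly inside the folder
none_df = parse_readerfiles(None)      # passing None (or "") simply returns None
```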

process_inputfile(file_object)

Read input Excel file which should have the following columns:
  • Barcode
  • Organism
  • Row_384
  • Col_384
  • ID

Optional columns:
  • Concentration in mg/mL (or other units)
  • Cutoff

Source code in rda_toolbox/parser.py
def process_inputfile(file_object):
    """
    Read input Excel file which should have the following columns:
        - Barcode
        - Organism
        - Row_384
        - Col_384
        - ID
    Optional columns:
        - Concentration in mg/mL (or other units)
        - Cutoff
    """
    if not file_object:
        return None
    excel_file = pd.ExcelFile(file_object)
    substance_df = pd.read_excel(excel_file, "substances")
    layout_df = pd.read_excel(excel_file, "layout")
    df = pd.merge(layout_df, substance_df, how="cross")
    # df.rename(columns={
    #     "barcode": "Barcode",
    #     "replicate": "Replicate",
    #     "organism": "Organism",
    #     "plate_row": "Row_384",
    #     "plate_column": "Col_384",
    #     "id": "ID",
    #     "concentration": "Concentration in mg/mL",
    # }, inplace=True)
    df["ID"] = df["ID"].astype(str)
    return df
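
A minimal sketch; the workbook name is hypothetical, but it must contain the two sheets "substances" and "layout" with the columns listed above:

```Python
from rda_toolbox.parser import process_inputfile

input_df = process_inputfile("MIC_input.xlsx")
# Cross join of the layout and substances sheets; the ID column is cast to str.
```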

read_platemapping(filepath, orig_barcodes)

Reads a mapping file generated by the barcode reader.

Source code in rda_toolbox/parser.py
def read_platemapping(filepath: str, orig_barcodes: list[str]):
    """
    Reads a mappingfile generated by the barcode reader.

    """
    filedict = dict()
    orig_barcodes = list(map(str, orig_barcodes))
    with open(filepath) as file:
        filecontents = file.read().splitlines()
        origin_barcode = ""
        origin_replicates = []
        for line in filecontents:
            line = line.split(";")
            if len(line) == 1 and line[0] in orig_barcodes:
                origin_barcode = line[0]
                origin_replicates.append(origin_barcode)
                # print("Origin barcode: ", origin_barcode)
                if origin_barcode not in filedict:
                    filedict[origin_barcode] = []
            else:
                filedict[origin_barcode].append(line)
        replicates_dict = {i:origin_replicates.count(i) for i in origin_replicates}
        if sorted(list(filedict.keys())) != sorted(orig_barcodes):
            raise ValueError(
                f"The origin barcodes from the mappingfile '{os.path.basename(filepath)}' and MP barcodes in MIC_input.xlsx do not coincide."
            )
        return filedict, replicates_dict
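
A minimal sketch with hypothetical barcodes; the mapping file follows the same motherplate/childplate layout as described for parse_mappingfile, and it must contain every barcode listed in orig_barcodes:

```Python
from rda_toolbox.parser import read_platemapping

filedict, replicates = read_platemapping(
    "platemapping.csv", orig_barcodes=["MP001", "MP002"]
)
# filedict:   motherplate barcode -> list of childplate barcode lines
# replicates: motherplate barcode -> number of times it occurs in the file
```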

readerfile_parser(filename, file_object, resulttable_header='Results')

Parser for files created by the BioTek Cytation C10 Confocal Imaging Reader.

Source code in rda_toolbox/parser.py
def readerfile_parser(
    filename: str, file_object: StringIO, resulttable_header: str = "Results"
) -> dict:
    """
    Parser for files created by the BioTek Cytation C10 Confocal Imaging Reader.
    """
    lines = file_object.readlines()
    lines = list(filter(None, map(lambda x: x.strip("\n").strip("\r"), lines)))
    if len(lines) == 0:
        raise ValueError(f"Empty raw file {filename}.")

    # search the file for plate type definition and use it to derive number of rows and columns
    found_plate_type = re.findall(r"Plate Type;[A-Za-z ]*([0-9]*)", "".join(lines))
    plate_type = 96  # define default plate type and let it be 96-well plate as this is what we started with
    if found_plate_type:
        plate_type = int(found_plate_type[0])

    num_rows, num_columns = get_rows_cols(plate_type)

    filedict = dict()
    metadata = dict()
    filedict["Reader Filename"] = filename
    filedict["plate_type"] = plate_type
    # Extract the barcode from the file name via regex; fall back to the full file name
    barcode_found = re.findall(
        r"\d{3}[A-Z][a-z]?[a-zA-Z]\d{2}\d{3}", filedict["Reader Filename"]
    )
    if not barcode_found:
        filedict["Barcode"] = filedict["Reader Filename"]
    else:
        filedict["Barcode"] = barcode_found[0]
    # filedict["Barcode"] = Path(filedict["Reader Filename"]).stem.split("_")[-1]

    results = np.empty([num_rows, num_columns], dtype=float)
    # using dtype=str results in unicode strings of length 1 ('U1'), therefore we use 'U25'
    layout = np.empty([num_rows, num_columns], dtype="U25")
    concentrations = np.empty([num_rows, num_columns], dtype=float)

    metadata_regex = r";?([a-zA-Z0-9 \/]*)[;:]+([a-zA-Z0-9 \/\\:_.-]*),?"
    line_num = 0
    while line_num < len(lines):
        if lines[line_num] == resulttable_header:
            line_num += 1
            header = map(
                int, lines[line_num].strip("\n").split(";")[1:]
            )  # get the header
            index = [""] * num_rows
            for _row_num in range(num_rows):  # for the next num_rows, read result data
                line_num += 1
                res_line = lines[line_num].split(";")
                # Split at ; and slice off rowlabel and excitation/emission value:
                index[_row_num] = res_line[0]
                results[_row_num] = res_line[1:-1]
            # Initialize DataFrame from results and add it to filedict
            filedict["Raw Optical Density"] = pd.DataFrame(
                data=results, index=index, columns=header
            )
            line_num += 1
        elif lines[line_num] == "Layout":  # For the next num_rows, read layout data
            line_num += 1
            header = list(
                map(int, lines[line_num].strip("\n").split(";")[1:])
            )  # Because we use header twice here, we collect it via list()
            index = [""] * num_rows
            for _row_num in range(num_rows):
                line_num += 1
                layout_line = lines[line_num].split(";")
                index[_row_num] = layout_line[0]
                layout[_row_num] = layout_line[1:-1]
                # Each second line yields a concentration layout line
                line_num += 1
                conc_line = lines[line_num].split(";")
                concentrations[_row_num] = [
                    None if not x else float(x) for x in conc_line[1:-1]
                ]
            # Add layouts to filedict
            filedict["Layout"] = pd.DataFrame(data=layout, index=index, columns=header)
            filedict["Concentration"] = pd.DataFrame(
                data=concentrations, index=index, columns=header
            )
            line_num += 1
        else:
            metadata_pairs = re.findall(metadata_regex, lines[line_num])
            line_num += 1
            if not metadata_pairs:
                continue
            else:
                for key, value in metadata_pairs:
                    if not all(
                        [key, value]
                    ):  # if any of the keys or values are empty, skip
                        continue
                    else:
                        metadata[key.strip(" :")] = value.strip(" ")
    filedict["metadata"] = metadata
    return filedict
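
A minimal sketch showing how a single file is parsed (the path is made up); the function expects the file contents as a StringIO plus the bare file name, which is also used for barcode detection:

```Python
from io import StringIO
from os.path import basename
from rda_toolbox.parser import readerfile_parser

path = "rawdata/001AbC24001.txt"  # hypothetical export; the name contains the barcode
with open(path) as fh:
    filedict = readerfile_parser(basename(path), StringIO(fh.read()))
filedict["Barcode"]               # parsed from the file name, e.g. "001AbC24001"
filedict["plate_type"]            # e.g. 96 or 384
filedict["Raw Optical Density"]   # DataFrame indexed by row letters, columns are plate columns
```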

readerfiles_metadf(paths)

Parses metadata from files declared by filepaths and merges the results into a DataFrame.

Source code in rda_toolbox/parser.py
def readerfiles_metadf(paths: list[str]) -> pd.DataFrame:
    """
    Parses metadata from files declared by filepaths and merges the results into a DataFrame.
    """
    filedicts = filepaths_to_filedicts(paths)
    return collect_metadata(filedicts)
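
A minimal sketch, mirroring the readerfiles_rawdf example below (paths are hypothetical):

```Python
import glob
from rda_toolbox.parser import readerfiles_metadf

metadata_df = readerfiles_metadf(glob.glob("path/to/raw/files/*"))
```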

readerfiles_rawdf(paths)

Parses data from files declared by filepaths and merges the results into a DataFrame.

Parameters:
  • paths (list[str]): A list of filepaths corresponding to the raw reader files generated by the Cytation C10

Returns:
  • pd.DataFrame: A DataFrame in tidy and long format with the raw readerfile contents

Example:

```Python
import glob

rawdata_df = readerfiles_rawdf(glob.glob("path/to/raw/files/*"))
```
Source code in rda_toolbox/parser.py
def readerfiles_rawdf(paths: list[str]) -> pd.DataFrame:
    """Parses data from files declared by filepaths and merges the results into a DataFrame
    :param paths: A list of filepaths corresponding to the raw reader files generated by the Cytation C10
    :type paths: list[str]
    :return: A DataFrame in tidy and long format with the raw readerfile contents
    :rtype: pd.DataFrame

    :Example:

        ```Python
        import glob

        rawdata_df = readerfiles_rawdf(glob.glob("path/to/raw/files/*"))
        ```
    """
    filedicts = filepaths_to_filedicts(paths)
    rawdata = collect_results(filedicts)
    rawdata["Col_384"] = rawdata["Col_384"].astype(str)
    rawdata.rename(columns={"Barcode": "AcD Barcode 384"}, inplace=True)
    return rawdata