Python API

Top-level package for unboxer.

extract_corpus(filenames=None, conf=None, lexicon=None, output_dir='.', cldf=False, audio=None, skip_empty_obj=False, complain=False, segments=None, inflection=None, include=None, parsing=None, languages=None)

Extract text records from a corpus.

Parameters:

Name       Type  Description                                Default
filenames  list  The paths to the corpus database files.    None
conf       dict  Configuration (see) todo: insert link      None
cldf       bool  Should a CLDF dataset be created?          False
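
A minimal usage sketch (illustrative, not taken from the package documentation): the file name, config values, and output path are assumptions, and the conf keys simply mirror those read in the source listing below.

from pathlib import Path

from unboxer import extract_corpus

# Illustrative config; every key below is read somewhere in extract_corpus.
conf = {
    "record_marker": "ref",  # records start at \ref
    "cell_separator": "\t",
    "encoding": "utf-8",
    "text_mode": "none",
    "slugify": True,
    "skip_empty_obj": False,
    "fix_clitics": False,
    "interlinear_mappings": {
        "\\tx": "Primary_Text",
        "\\mb": "Analyzed_Word",
        "\\ge": "Gloss",
    },
    "aligned_fields": ["Analyzed_Word", "Gloss"],
    "lang_id": "mylang",  # hypothetical language identifier
}

# "texts.db" is a hypothetical Toolbox database file.
records = extract_corpus(
    filenames=["texts.db"],
    conf=conf,
    output_dir=Path("output"),
    cldf=True,  # also build a CLDF dataset
)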
Source code in src/unboxer/__init__.py
def extract_corpus(
    filenames=None,
    conf=None,
    lexicon=None,
    output_dir=".",
    cldf=False,
    audio=None,
    skip_empty_obj=False,
    complain=False,
    segments=None,
    inflection=None,
    include=None,
    parsing=None,
    languages=None,
):
    """Extract text records from a corpus.

    Args:
        filenames (list): The paths to the corpus database files.
        conf (dict): Configuration (see) todo: insert link
        cldf (bool, optional): Should a CLDF dataset be created? Defaults to `False`.
    """
    file_recs = {}
    inflection = inflection or {}
    output_dir = Path(output_dir)  # accept both str and Path
    output_dir.mkdir(exist_ok=True, parents=True)
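    # Read each Toolbox/Shoebox file and split its contents into records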
    for filename in map(Path, filenames):  # coerce to Path: .stem and .name are used below
        database_file = filename
        record_marker = "\\" + conf["record_marker"]
        sep = conf["cell_separator"]

        try:
            with open(database_file, "r", encoding=conf["encoding"]) as f:
                content = f.read()
        except UnicodeDecodeError:
            log.error(
                f"""Could not open the file with the encoding [{conf["encoding"]}].
    Make sure that you are not parsing a shoebox project as toolbox or vice versa.
    You can also explicitly set the correct file encoding in your config."""
            )
            sys.exit()
        file_recs[filename] = []
        records = content.split(record_marker + " ")
        for record in records[1::]:
            res = _get_fields(
                record_marker + " " + record, record_marker, multiple=[], sep=sep
            )
            if res:
                file_recs[filename].append(res)
            # empty records are skipped silently
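    # One DataFrame of raw records per input file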
    dfs = {x: pd.DataFrame.from_dict(y) for x, y in file_recs.items()}
    all_texts = []
    for fn, df in dfs.items():
        log.info(f"Processing {fn}")
        if conf["text_mode"] != "none":
            text_path = output_dir / f"{fn.stem}_texts.csv"
            if text_path.is_file():
                texts = load(text_path)
            else:
                texts = []

        if record_marker in df and conf.get("slugify", True):
            if conf["interlinear_mappings"].get(record_marker, "") == "ID":
                conf["interlinear_mappings"].pop(record_marker)
            tqdm.pandas(desc="Creating record IDs")
            df["ID"] = df[record_marker].progress_apply(
                lambda x: humidify(x, "sentence_id", unique=True)
            )
        else:
            df["ID"] = df.index
        df["filename"] = fn.name

        if conf["text_mode"] == "record_marker":
            tmap_file = output_dir / f"{fn.stem}_textmap.yaml"
            if tmap_file.is_file():
                text_map = load(tmap_file)
            elif "ID" in df.columns:
                text_map = guess_texts(list(df["ID"]), fn)
                dump(text_map, tmap_file)
                log.info(
                    f"Created tentative record-text mapping in {tmap_file.resolve()}"
                )
            else:
                text_map = {}
            if isinstance(texts, list):
                texts.extend(text_map.keys())
                texts = pd.DataFrame(texts)
                texts.columns = ["ID"]
                for addcol in ["Name", "Description", "Comment", "Source", "Type"]:
                    texts[addcol] = ""
                dump(texts, text_path)
            reverse_map = {}
            for text_id, recs in text_map.items():
                for rec in recs:
                    reverse_map[rec] = text_id
            df["Text_ID"] = df["ID"].map(reverse_map).fillna("")
            all_texts.append(texts)
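    # Combine the per-file DataFrames and drop duplicate records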
    df = pd.concat(dfs.values())
    if not df[record_marker].is_unique:
        if complain:
            log.warning("Found duplicate IDs, will only keep first of each:")
            dupes = df[df.duplicated(record_marker)]
            print(dupes)
        df.drop_duplicates(record_marker, inplace=True)
    df.rename(columns=conf["interlinear_mappings"], inplace=True)
    if "Analyzed_Word" not in df.columns:
        raise ValueError(f"Did not find Analyzed_Word; mappings: {conf['interlinear_mappings']}")
    if conf["skip_empty_obj"]:
        old = len(df)
        df = df[df["Gloss"] != ""]
        log.info(f"Dropped {old-len(df)} unparsed records.")
    df.fillna("", inplace=True)
    df = df[df["Primary_Text"] != ""]

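    # Build the morph inventory: from the lexicon if one is given, otherwise from the corpus glosses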
    if lexicon:
        lex_df = extract_lexicon(
            lexicon, parsing=parsing, conf=conf, output_dir=output_dir
        )
        morphemes, morphs = extract_morphs(lex_df, sep)
        morphinder = Morphinder(morphs, complain=complain)
    else:
        tdf = df.copy()
        morphs = {}
        for c in ["Analyzed_Word", "Gloss"]:
            tdf[c] = df[c].apply(lambda x: re.sub(r"-\s+", "-INTERN", x))
            tdf[c] = df[c].apply(lambda x: re.sub(r"\s+-", "INTERN-", x))
            tdf[c] = df[c].apply(lambda x: re.split(r"\s+", x))
        for rec in tdf.to_dict("records"):
            for obj, gloss in zip(rec["Analyzed_Word"], rec["Gloss"]):
                if obj == "":
                    continue
                morph_id = humidify(obj + "-" + gloss, key="pairs")
                if morph_id not in morphs:
                    morphs[morph_id] = {
                        "ID": morph_id,
                        "Form": obj,
                        "Meaning": gloss.strip("-").strip("="),
                    }
        morphs = pd.DataFrame.from_dict(morphs.values())
        morphinder = Morphinder(morphs, complain=complain)
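    # Slice sentences into wordforms and wordforms into morphs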
    (
        wordforms,
        form_meanings,
        sentence_slices,
        morph_slices,
        inflections,
        stems,
        wordformstems,
        stemparts,
    ) = build_slices(df, morphinder, **inflection)
    morph_meanings = {}
    stem_meanings = {}
    for meanings in tqdm(morphs["Meaning"], desc="Morphs"):
        for meaning in meanings.split("; "):
            morph_meanings.setdefault(
                meaning, {"ID": humidify(meaning, key="meanings"), "Name": meaning}
            )

    if len(stems) > 0:
        for stem_gloss in tqdm(stems["Meaning"], desc="Stems"):
            stem_meanings.setdefault(
                stem_gloss,
                {
                    "ID": humidify(stem_gloss, key="meanings"),
                    "Name": stem_gloss,
                },
            )
    if include:
        include = load(include)
        rec_list = include
    else:
        rec_list = list(df["ID"])
    df = df[df["ID"].isin(rec_list)]

    if conf["text_mode"] != "none":
        texts = pd.concat(all_texts)
        texts = texts[texts["ID"].isin(list(df["Text_ID"]))]

    sentence_slices = sentence_slices[sentence_slices["Example_ID"].isin(rec_list)]
    for col in tqdm(df.columns, desc="Columns"):
        if col in conf["aligned_fields"]:
            df[col] = df[col].apply(_remove_spaces)
    df = df.apply(helpers.fix_glosses, axis=1)
    if conf["fix_clitics"]:
        log.info("Fixing clitics")
        for col in conf["aligned_fields"]:
            df[col] = df[col].apply(_fix_clitics)
    if "Primary_Text" in df.columns:
        df["Primary_Text"] = df["Primary_Text"].apply(lambda x: re.sub(r"\s+", " ", x))

    if len(wordforms) > 0:
        wordforms = wordforms[wordforms["Form"] != ""]

    for x in [df, wordforms, morphs]:
        x["Language_ID"] = conf.get("lang_id", "undefined")
    if lexicon:
        morphemes["Language_ID"] = conf.get("lang_id", "undefined")
    if not morphs["ID"].is_unique:
        log.warning("Duplicate IDs in morph table, only keeping first instances:")
        log.warning(morphs[morphs.duplicated(subset="ID", keep=False)])
        morphs.drop_duplicates(subset="ID", inplace=True)
    if output_dir:
        df.to_csv(
            (Path(output_dir) / database_file.name).with_suffix(".csv"), index=False
        )
        morphs.to_csv((Path(output_dir) / "morphs.csv"), index=False)
        if lexicon:
            morphemes.to_csv((Path(output_dir) / "morphemes.csv"), index=False)
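    # Assemble the tables of the CLDF dataset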
    if cldf:
        tables = {"examples.csv": df}
        tables["exampleparts.csv"] = sentence_slices
        if lexicon:
            morphemes["Name"] = morphemes["Headword"]
            morphemes["Description"] = morphemes["Meaning"]
            morphemes["Parameter_ID"] = morphemes["Meaning"].apply(
                lambda x: [morph_meanings[y]["ID"] for y in x.split("; ")]
            )
        if inflection:
            stems["Parameter_ID"] = stems["Meaning"].apply(
                lambda x: [stem_meanings[x]["ID"]]
            )

        if audio:
            tables["media.to_csv"] = pd.DataFrame.from_dict(
                [
                    {
                        "ID": f.stem,
                        "Media_Type": "audio/" + f.suffix.strip("."),
                        "Download_URL": str(f),
                    }
                    for f in audio.iterdir()
                ]
            )

        morphs["Name"] = morphs["Form"]
        if segments:
            extra = ["+", "-", "(", ")", "/", "∅", "0", "?", ",", "=", ";"]
            pdf = load(segments)
            tokenizer = Tokenizer(
                Profile(
                    *(
                        pdf.to_dict("records")
                        + [{"Grapheme": x, "IPA": x} for x in extra]
                    )
                )
            )
            log.info("Tokenizing...")
            def tokenize(x):
                return tokenizer(x.lower().replace("-", ""), column="IPA")
            for m_df in [wordforms, morphs]:
                if len(m_df) > 0:
                    for orig, repl in conf.get("replace", {}).items():
                        m_df["Form"] = m_df["Form"].replace(orig, repl, regex=True)
                    m_df["Segments"] = m_df["Form"].apply(
                        lambda x: tokenize(x).split(" ")
                    )
                    bad = m_df[m_df["Segments"].apply(lambda x: "�" in x)]
                    if len(bad) > 0:  # warn about any unsegmentable forms
                        log.warning(f"Unsegmentable: <{bad}>")
                        m_df["Segments"] = m_df["Segments"].apply(
                            lambda x: "" if "�" in x else x
                        )
        if len(morph_slices) > 0:
            morph_slices["Gloss_ID"] = morph_slices["Gloss"].apply(id_glosses)
            tables["glosses.csv"] = pd.DataFrame.from_dict(
                [{"ID": v, "Name": k} for k, v in get_values("glosses").items()]
            )
        morphs["Description"] = morphs["Meaning"]
        morphs["Parameter_ID"] = morphs["Description"].apply(
            lambda x: [morph_meanings[y]["ID"] for y in x.split("; ")]
        )
        if len(form_meanings) > 0:
            morph_meanings = pd.DataFrame.from_dict(
                [
                    x
                    for x in morph_meanings.values()
                    if x["ID"] not in list(form_meanings["ID"])
                ]
            )
            stem_meanings = pd.DataFrame.from_dict(
                [
                    x
                    for x in stem_meanings.values()
                    if x["ID"] not in list(form_meanings["ID"])
                ]
            )
            tables["parameters.csv"] = pd.concat(
                [form_meanings, morph_meanings, stem_meanings]
            )
        else:
            morph_meanings = pd.DataFrame.from_dict(morph_meanings.values())
            tables["parameters.csv"] = morph_meanings
        if len(wordforms) > 0:
            tables["wordforms.csv"] = wordforms
        tables["morphs.csv"] = morphs
        tables["wordformparts.csv"] = morph_slices
        if len(stems) > 0:
            stems["Language_ID"] = conf.get("lang_id", "undefined")
            stems["Lexeme_ID"] = stems["ID"]
            tables["stems.csv"] = stems
            tables["lexemes.csv"] = stems
            tables["stemparts.csv"] = stemparts
            tables["wordformstems.csv"] = wordformstems
            tables["inflections.csv"] = inflections
            tables["inflectionalcategories.csv"] = inflection["infl_cats"]
            tables["inflectionalvalues.csv"] = inflection["infl_vals"]
        if conf["text_mode"] != "none" and len(texts) > 0 and len(df) > 0:
            tables["texts.csv"] = texts
        if lexicon:
            lexicon, meanings = get_lexical_data(lex_df)
            tables["morphemes.csv"] = morphemes
            tables["parameters.csv"] = pd.concat([meanings, tables["parameters.csv"]])
            tables["parameters.csv"].drop_duplicates(subset="ID", inplace=True)
        create_cldf(
            tables=tables,
            conf=conf,
            output_dir=output_dir,
            cldf_name=conf.get("cldf_name", "cldf"),
            languages=languages,
            module="corpus",
        )
    return df