def extract_corpus(
filenames=None,
conf=None,
lexicon=None,
output_dir=".",
cldf=False,
audio=None,
skip_empty_obj=False,
complain=False,
segments=None,
inflection=None,
include=None,
parsing=None,
languages=None,
):
"""Extract text records from a corpus.
Args:
database_file (str): The path to the corpus database file.
conf (dict): Configuration (see) todo: insert link
cldf (bool, optional): Should a CLDF dataset be created? Defaults to `False`.
"""
    file_recs = {}
    inflection = inflection or {}
    output_dir = Path(output_dir)  # accept both str and Path
    output_dir.mkdir(exist_ok=True, parents=True)
    record_marker = "\\" + conf["record_marker"]
    sep = conf["cell_separator"]
    for filename in filenames:
        database_file = Path(filename)
try:
with open(database_file, "r", encoding=conf["encoding"]) as f:
content = f.read()
except UnicodeDecodeError:
log.error(
f"""Could not open the file with the encoding [{conf["encoding"]}].
Make sure that you are not parsing a shoebox project as toolbox or vice versa.
You can also explicitly set the correct file encoding in your config."""
)
sys.exit()
        file_recs[database_file] = []  # key by Path so .stem/.name work downstream
        records = content.split(record_marker + " ")
        for record in records[1:]:
            res = _get_fields(
                record_marker + " " + record, record_marker, multiple=[], sep=sep
            )
            if res:
                file_recs[database_file].append(res)
            # records with no parseable fields are skipped silently
dfs = {x: pd.DataFrame.from_dict(y) for x, y in file_recs.items()}
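    # Phase 2: per file, assign record IDs and optionally group records into texts.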
all_texts = []
for fn, df in dfs.items():
log.info(f"Processing {fn}")
if conf["text_mode"] != "none":
text_path = output_dir / f"{fn.stem}_texts.csv"
if text_path.is_file():
texts = load(text_path)
else:
texts = []
if record_marker in df and conf.get("slugify", True):
if conf["interlinear_mappings"].get(record_marker, "") == "ID":
conf["interlinear_mappings"].pop(record_marker)
tqdm.pandas(desc="Creating record IDs")
df["ID"] = df[record_marker].progress_apply(
lambda x: humidify(x, "sentence_id", unique=True)
)
else:
df["ID"] = df.index
df["filename"] = fn.name
if conf["text_mode"] == "record_marker":
tmap_file = output_dir / f"{fn.stem}_textmap.yaml"
if tmap_file.is_file():
text_map = load(tmap_file)
elif "ID" in df.columns:
text_map = guess_texts(list(df["ID"]), fn)
dump(text_map, tmap_file)
log.info(
f"Created tentative record-text mapping in {tmap_file.resolve()}"
)
else:
text_map = {}
if isinstance(texts, list):
texts.extend(text_map.keys())
                texts = pd.DataFrame(texts, columns=["ID"])  # also handles empty list
            for addcol in ["Name", "Description", "Comment", "Source", "Type"]:
                if addcol not in texts.columns:  # don't clobber loaded metadata
                    texts[addcol] = ""
            dump(texts, text_path)
reverse_map = {}
for text_id, recs in text_map.items():
for rec in recs:
reverse_map[rec] = text_id
df["Text_ID"] = df["ID"].map(reverse_map).fillna("")
all_texts.append(texts)
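    # Merge the per-file records and drop duplicate record IDs.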
df = pd.concat(dfs.values())
    if not df[record_marker].is_unique:
        if complain:
            log.warning("Found duplicate IDs, will only keep first of each:")
            log.warning(df[df.duplicated(record_marker)])
        df.drop_duplicates(record_marker, inplace=True)
df.rename(columns=conf["interlinear_mappings"], inplace=True)
if "Analyzed_Word" not in df.columns:
raise ValueError("Did not find Analyzed_Word:", conf["interlinear_mappings"])
if conf["skip_empty_obj"]:
old = len(df)
df = df[df["Gloss"] != ""]
log.info(f"Dropped {old-len(df)} unparsed records.")
df.fillna("", inplace=True)
df = df[df["Primary_Text"] != ""]
if lexicon:
lex_df = extract_lexicon(
lexicon, parsing=parsing, conf=conf, output_dir=output_dir
)
morphemes, morphs = extract_morphs(lex_df, sep)
morphinder = Morphinder(morphs, complain=complain)
else:
tdf = df.copy()
morphs = {}
for c in ["Analyzed_Word", "Gloss"]:
tdf[c] = df[c].apply(lambda x: re.sub(r"-\s+", "-INTERN", x))
tdf[c] = df[c].apply(lambda x: re.sub(r"\s+-", "INTERN-", x))
tdf[c] = df[c].apply(lambda x: re.split(r"\s+", x))
for rec in tdf.to_dict("records"):
for obj, gloss in zip(rec["Analyzed_Word"], rec["Gloss"]):
if obj == "":
continue
morph_id = humidify(obj + "-" + gloss, key="pairs")
if morph_id not in morphs:
morphs[morph_id] = {
"ID": morph_id,
"Form": obj,
"Meaning": gloss.strip("-").strip("="),
}
morphs = pd.DataFrame.from_dict(morphs.values())
morphinder = Morphinder(morphs, complain=complain)
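    # Slice the corpus: sentences into wordforms, wordforms into morphs,
    # plus stem and inflection tables where available.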
(
wordforms,
form_meanings,
sentence_slices,
morph_slices,
inflections,
stems,
wordformstems,
stemparts,
) = build_slices(df, morphinder, **inflection)
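    # Collect the distinct meanings of morphs and stems; they become CLDF parameters.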
morph_meanings = {}
stem_meanings = {}
for meanings in tqdm(morphs["Meaning"], desc="Morphs"):
for meaning in meanings.split("; "):
morph_meanings.setdefault(
meaning, {"ID": humidify(meaning, key="meanings"), "Name": meaning}
)
if len(stems) > 0:
for stem_gloss in tqdm(stems["Meaning"], desc="Stems"):
stem_meanings.setdefault(
stem_gloss,
{
"ID": humidify(stem_gloss, key="meanings"),
"Name": stem_gloss,
},
)
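    # Optionally restrict the output to an explicit list of record IDs.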
    if include:
        rec_list = load(include)
    else:
        rec_list = list(df["ID"])
df = df[df["ID"].isin(rec_list)]
if conf["text_mode"] != "none":
texts = pd.concat(all_texts)
texts = texts[texts["ID"].isin(list(df["Text_ID"]))]
sentence_slices = sentence_slices[sentence_slices["Example_ID"].isin(rec_list)]
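    # Clean up aligned fields: spacing, gloss normalization and, optionally, clitics.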
for col in tqdm(df.columns, desc="Columns"):
if col in conf["aligned_fields"]:
df[col] = df[col].apply(_remove_spaces)
df = df.apply(helpers.fix_glosses, axis=1)
if conf["fix_clitics"]:
log.info("Fixing clitics")
for col in conf["aligned_fields"]:
df[col] = df[col].apply(_fix_clitics)
if "Primary_Text" in df.columns:
df["Primary_Text"] = df["Primary_Text"].apply(lambda x: re.sub(r"\s+", " ", x))
if len(wordforms) > 0:
wordforms = wordforms[wordforms["Form"] != ""]
for x in [df, wordforms, morphs]:
x["Language_ID"] = conf.get("lang_id", "undefined")
if lexicon:
morphemes["Language_ID"] = conf.get("lang_id", "undefined")
if not morphs["ID"].is_unique:
log.warning("Duplicate IDs in morph table, only keeping first instances:")
log.warning(morphs[morphs.duplicated(subset="ID", keep=False)])
morphs.drop_duplicates(subset="ID", inplace=True)
if output_dir:
df.to_csv(
(Path(output_dir) / database_file.name).with_suffix(".csv"), index=False
)
morphs.to_csv((Path(output_dir) / "morphs.csv"), index=False)
if lexicon:
morphemes.to_csv((Path(output_dir) / "morphemes.csv"), index=False)
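    # Phase 3: assemble the tables of the CLDF dataset.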
if cldf:
tables = {"examples.csv": df}
tables["exampleparts.csv"] = sentence_slices
if lexicon:
morphemes["Name"] = morphemes["Headword"]
morphemes["Description"] = morphemes["Meaning"]
morphemes["Parameter_ID"] = morphemes["Meaning"].apply(
lambda x: [morph_meanings[y]["ID"] for y in x.split("; ")]
)
if inflection:
stems["Parameter_ID"] = stems["Meaning"].apply(
lambda x: [stem_meanings[x]["ID"]]
)
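        # Register audio files in a CLDF media table.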
if audio:
tables["media.to_csv"] = pd.DataFrame.from_dict(
[
{
"ID": f.stem,
"Media_Type": "audio/" + f.suffix.strip("."),
"Download_URL": str(f),
}
for f in audio.iterdir()
]
)
morphs["Name"] = morphs["Form"]
if segments:
extra = ["+", "-", "(", ")", "/", "∅", "0", "?", ",", "=", ";"]
pdf = load(segments)
tokenizer = Tokenizer(
Profile(
*(
pdf.to_dict("records")
+ [{"Grapheme": x, "IPA": x} for x in extra]
)
)
)
log.info("Tokenizing...")
tokenize = lambda x: tokenizer(x.lower().replace("-", ""), column="IPA")
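            # e.g., assuming a profile that maps each grapheme to its IPA value,
            # tokenize("ba-na") yields "b a n a" (space-separated segments)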
for m_df in [wordforms, morphs]:
if len(m_df) > 0:
for orig, repl in conf.get("replace", {}).items():
m_df["Form"] = m_df["Form"].replace(orig, repl, regex=True)
m_df["Segments"] = m_df["Form"].apply(
lambda x: tokenize(x).split(" ")
)
bad = m_df[m_df["Segments"].apply(lambda x: "�" in x)]
if len(bad) > 1:
log.warning(f"Unsegmentable: <{bad}>")
m_df["Segments"] = m_df["Segments"].apply(
lambda x: "" if "�" in x else x
)
if len(morph_slices) > 0:
morph_slices["Gloss_ID"] = morph_slices["Gloss"].apply(id_glosses)
tables["glosses.csv"] = pd.DataFrame.from_dict(
[{"ID": v, "Name": k} for k, v in get_values("glosses").items()]
)
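        # Link morphs to their meanings via Parameter_ID and build the parameter
        # table, skipping meanings already covered by wordform meanings.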
morphs["Description"] = morphs["Meaning"]
morphs["Parameter_ID"] = morphs["Description"].apply(
lambda x: [morph_meanings[y]["ID"] for y in x.split("; ")]
)
if len(form_meanings) > 0:
morph_meanings = pd.DataFrame.from_dict(
[
x
for x in morph_meanings.values()
if x["ID"] not in list(form_meanings["ID"])
]
)
stem_meanings = pd.DataFrame.from_dict(
[
x
for x in stem_meanings.values()
if x["ID"] not in list(form_meanings["ID"])
]
)
tables["parameters.csv"] = pd.concat(
[form_meanings, morph_meanings, stem_meanings]
)
else:
morph_meanings = pd.DataFrame.from_dict(morph_meanings.values())
tables["parameters.csv"] = morph_meanings
if len(wordforms) > 0:
tables["wordforms.csv"] = wordforms
tables["morphs.csv"] = morphs
tables["wordformparts.csv"] = morph_slices
if len(stems) > 0:
stems["Language_ID"] = conf.get("lang_id", "undefined")
stems["Lexeme_ID"] = stems["ID"]
tables["stems.csv"] = stems
tables["lexemes.csv"] = stems
tables["stemparts.csv"] = stemparts
tables["wordformstems.csv"] = wordformstems
tables["inflections.csv"] = inflections
tables["inflectionalcategories.csv"] = inflection["infl_cats"]
tables["inflectionalvalues.csv"] = inflection["infl_vals"]
if conf["text_mode"] != "none" and len(texts) > 0 and len(df) > 0:
tables["texts.csv"] = texts
if lexicon:
lexicon, meanings = get_lexical_data(lex_df)
tables["morphemes.csv"] = morphemes
tables["parameters.csv"] = pd.concat([meanings, tables["parameters.csv"]])
tables["parameters.csv"].drop_duplicates(subset="ID", inplace=True)
create_cldf(
tables=tables,
conf=conf,
output_dir=output_dir,
cldf_name=conf.get("cldf_name", "cldf"),
languages=languages,
module="corpus",
)
return df