fastcldf

Quickly create CLDF datasets.

Main fastcldf module

create_cldf(tables, sources, spec=None, metadata=None, columns=None, foreignkeys=None, cldf_tables=None, validate=True)

Creates a CLDF dataset.

Parameters:

tables : dict
    A dict linking table names ("languages" etc.) to lists of records
    ([{"id": "lg-1", "name": "Language 1"} etc.]). Required.
sources : str
    A path to a .bib file. Required.
spec : dict
    A dict representing a cldfbench (https://github.com/cldf/cldfbench) spec.
    Default: None.
metadata : dict
    A dict containing metadata about the dataset. Default: None.
columns : dict
    A dict mapping table names to custom column specifications. Default: None.
foreignkeys : dict
    A dict mapping table names to foreign key specifications. Default: None.
cldf_tables : list
    Additional CLDF components to add to the dataset (passed to pycldf's
    add_component). Default: None.
validate : bool
    Whether to validate the dataset after writing. Default: True.

Returns:

pycldf.Dataset
    A pycldf dataset; see https://pycldf.readthedocs.io/en/latest/dataset.html
    for details.
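
A minimal usage sketch (the .bib path and record contents are placeholders):

from fastcldf import create_cldf

ds = create_cldf(
    tables={"languages": [{"id": "lg-1", "name": "Language 1"}]},
    sources="sources.bib",  # placeholder; a missing file is simply skipped
)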

Source code in src/fastcldf/__init__.py
def create_cldf(
    tables,
    sources,
    spec=None,
    metadata=None,
    columns=None,
    foreignkeys=None,
    cldf_tables=None,
    validate=True,
):
    """Creates a CLDF dataset.

    Parameters
    ----------
    tables : dict
      A dict linking table names ("languages" etc.) to
      lists of records ([{"id": "lg-1", "name": "Language 1"} etc.]).
    sources : str
      A path to a .bib file
    metadata: dict
      A dict containing metadata about the dataset.
    spec : dict
      A dict representing a [cldfbench](https://github.com/cldf/cldfbench) spec
    Returns
    -------
    pycldf.dataset
        A pycldf dataset, see
        [here](https://pycldf.readthedocs.io/en/latest/dataset.html)
        for details
    """
    metadata = metadata or {}
    # default spec: a Generic CLDF dataset written to ./cldf
    spec = spec or {
        "dir": "./cldf",
        "module": "Generic",
        "metadata_fname": "metadata.json",
    }
    columns = columns or {}
    foreignkeys = foreignkeys or {}
    cldf_tables = cldf_tables or []
    with CLDFWriter(CLDFSpec(**spec)) as writer:
        # add any extra CLDF components requested by the caller
        for component in cldf_tables:
            writer.cldf.add_component(component)
        component_names, component_data, cldf_col_data = load_cldf_data()
        for table, data in tables.items():
            df = pd.DataFrame.from_dict(data).fillna("")
            # native CLDF components are mapped to their standard columns
            if table in component_names:
                url, df = process_native_table(
                    table,
                    df,
                    component_names,
                    cldf_col_data,
                    writer,
                    user_columns=columns.get(table, {}),
                    foreignkeys=foreignkeys.get(table, {}),
                )
            else:
                url, df = process_nonnative_table(
                    table,
                    df,
                    cldf_col_data,
                    writer,
                    user_columns=columns.get(table, {}),
                    foreignkeys=foreignkeys.get(table, {}),
                )
            for rec in df.to_dict("records"):
                writer.objects[url].append(rec)

        # parse the BibTeX file (if it exists) and register its entries
        source_path = Path(sources)
        sources = None
        if source_path.is_file():
            sources = pybtex.database.parse_file(source_path)
            writer.cldf.add_sources(
                *[Source.from_entry(k, e) for k, e in sources.entries.items()]
            )
        writer.cldf.write()
        ds = writer.cldf
    if validate:
        ds.validate()
    return ds
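
Since spec is expanded directly into a cldfbench CLDFSpec, the defaults shown
above can be overridden; a sketch with placeholder values (whether a
non-Generic module validates depends on the tables provided):

from fastcldf import create_cldf

ds = create_cldf(
    tables={"languages": [{"id": "lg-1", "name": "Language 1"}]},
    sources="sources.bib",
    spec={
        "dir": "./my_dataset",  # output directory (placeholder)
        "module": "Generic",  # CLDF module, e.g. "Wordlist" or "StructureDataset"
        "metadata_fname": "metadata.json",
    },
)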

load_cldf(metadata_file)

Load data from a CLDF dataset

Parameters:

metadata_file : str
    A path to a `.json` metadata file.

Returns:

data : dict
    A dict where
    • e.g. `"examples.csv"` contains the example table records (list)
    • `"metadata"` contains the metadata (dict)
    • `"sources"` contains the bibfile (str)
Source code in src/fastcldf/__init__.py
def load_cldf(metadata_file):
    """Load data from a CLDF dataset

        Parameters
        -----------
        metadata_file : str
            A path to a `.json` metadata file.

        Returns
        -------
        data : dict
            A dict where
    * e.g. `"examples.csv"` contains the example table records (list)
    * `"metadata"` contains the metadata (dict)
    * `"sources"` contains the bibfile (str)
    """
    ds = Dataset.from_metadata(metadata_file)
    data = {}
    # collect each table's rows under the table's URL (e.g. "examples.csv")
    for table in ds.tables:
        res = []
        for rec in ds.iter_rows(table.url):
            res.append(rec)
        data[str(table.url)] = res
    data["metadata"] = ds.metadata_dict
    data["sources"] = load(ds.bibpath)
    return data
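
A short round-trip sketch, assuming a dataset written with the default spec
above (table keys depend on the tables actually present):

from fastcldf import load_cldf

data = load_cldf("cldf/metadata.json")
languages = data.get("languages.csv", [])  # row dicts for the language table
print(data["metadata"])  # the dataset's metadata dict
print(data["sources"])  # the raw .bib file contents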