Skip to content

catalog

Catalog

Catalog(
    filename: str | PathLike,
    key_column_name: str,
    columns_types: dict,
    rename_dict: dict,
    units_dict: dict,
    catalog_type: CatalogType = 'default_catalog',
    remove_duplicates: bool = True,
    user_filepath: str | PathLike = DATA_BASE_PATH,
    separator: str = ',',
    decimal: str = '.',
)

Generic wrapper for tabular data read from a csv file, indexed by a key column.

For now, we only support csv input files located in the data/ folder of the source code.

Please note that the responsibility of ensuring the "uniqueness" of the key column is left to the user: no integrity check is performed on input data.

Parameters:

Name Type Description Default

filename

str | PathLike

filename of the csv data source

required

key_column_name

str

name of the column used as key (i.e. row identifier)

required

catalog_type

Literal['default_catalog', 'cable_catalog']

type of the catalog. Used in the get_as_object method to convert the catalog to a specific object type.

'default_catalog'

columns_types

dict | None

dictionary of column names and their types.

required

rename_dict

dict | None

dictionary of column names to rename. The key is the original name, the value is the new name.

required

remove_duplicates

bool

whether to remove duplicate rows. Defaults to True.

True

user_filepath

str | PathLike

path to the folder containing the csv file. Defaults to internal data.

DATA_BASE_PATH
Source code in src/mechaphlowers/data/catalog/catalog.py
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
def __init__(
    self,
    filename: str | PathLike,
    key_column_name: str,
    columns_types: dict,
    rename_dict: dict,
    units_dict: dict,
    catalog_type: CatalogType = 'default_catalog',
    remove_duplicates: bool = True,
    user_filepath: str | PathLike = DATA_BASE_PATH,
    separator: str = ",",
    decimal: str = ".",
) -> None:
    """Initialize catalog from a csv file.

    The csv file is read with pandas using `key_column_name` as index (forced to `str`),
    the non-boolean column types are validated, the columns are renamed and, if requested,
    rows with a duplicated key are removed.

    Please note that apart from the optional duplicate removal, no integrity check is
    performed on the `key` column of the input data.

    Args:
        filename (str | PathLike): filename of the csv data source
        key_column_name (str): name of the column used as key (i.e. row identifier)
        columns_types (dict | None): dictionary of column names and their types. None is treated as an empty dictionary.
        rename_dict (dict | None): dictionary of column names to rename. The key is the original name, the value is the new name. None is treated as an empty dictionary.
        units_dict (dict): dictionary stored on the instance and passed to `add_units` by `get_as_object`.
        catalog_type (Literal['default_catalog', "cable_catalog"]): type of the catalog. Used in the `get_as_object` method to convert the catalog to a specific object type.
        remove_duplicates (bool): whether to remove duplicate rows. Defaults to True.
        user_filepath (str | PathLike): path to the folder containing the csv file. Defaults to internal data.
        separator (str): field separator of the csv file, forwarded to `pd.read_csv` as `sep`. Defaults to ",".
        decimal (str): decimal mark of the csv file, forwarded to `pd.read_csv`. Defaults to ".".

    Raises:
        FileNotFoundError: if `filename` does not exist inside `user_filepath`
    """
    self.catalog_type = catalog_type
    self.units_dict = units_dict

    # The documented contract accepts None for both dictionaries: treat it as "nothing to do"
    # instead of failing later on `.items()` / `in`.
    if columns_types is None:
        columns_types = {}
    if rename_dict is None:
        rename_dict = {}

    if user_filepath is None:
        filepath = DATA_BASE_PATH / filename  # type: ignore[operator]
    else:
        filepath = Path(user_filepath) / filename  # type: ignore[operator]
    # the check is made on the full csv path, not only on the folder
    if not filepath.exists():
        raise FileNotFoundError(
            f"{user_filepath=} seems to be not a valid path. Please provide a valid path."
        )

    # Warning: booleans are not treated correctly in order to avoid issues with empty values.
    # TODO: Maybe remove this filter if we consider that empty values on boolean columns does not exist, or fix this
    dtype_dict_without_bool = {
        key: value
        for (key, value) in columns_types.items()
        if value is not bool
    }

    # forcing key index to be a str. Key index should not be in types_dict
    dtype_dict_with_key = dtype_dict_without_bool.copy()
    dtype_dict_with_key[key_column_name] = str

    self._data = pd.read_csv(
        filepath,
        index_col=key_column_name,
        dtype=dtype_dict_with_key,
        sep=separator,
        decimal=decimal,
    )
    # validating the pandera schema. Useful for checking missing fields
    self.validate_types(dtype_dict_without_bool)
    # renaming happens after validation, so `columns_types` must use the original csv names
    self.rename_columns(key_column_name, rename_dict)
    if remove_duplicates is True:
        self.remove_duplicates(filename)

check_wrong_rows

check_wrong_rows(clean_catalog: bool = True) -> Set

Check whether any rows cause pandera SchemaErrors when running get_as_object(), and optionally remove them.

Parameters:

Name Type Description Default

clean_catalog

bool

If True, removes invalid rows from the catalog. Defaults to True.

True

Returns:

Name Type Description
Set Set

Set of indices of rows that are invalid according to the pandera schema.

Source code in src/mechaphlowers/data/catalog/catalog.py
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
def check_wrong_rows(self, clean_catalog: bool = True) -> Set:
    """Check whether rows cause pandera SchemaErrors in get_as_object(), and optionally remove them.

    All the keys of the catalog are converted with `get_as_object()`. If pandera raises
    `SchemaErrors`, the indices of the failing rows are collected, a warning is emitted and,
    when `clean_catalog` is True, these rows are dropped in place from the catalog data.

    Args:
        clean_catalog (bool): If True, removes invalid rows from the catalog. Defaults to True.

    Returns:
        Set: Set of indices of rows that are invalid according to the pandera schema.
            Empty if no SchemaErrors was raised.
    """
    wrong_rows = []
    try:
        self.get_as_object(self.keys())
    except pa.errors.SchemaErrors as error:
        # get the index of the rows that caused an error, and remove the duplicates.
        # Failures that are not tied to a row carry a null index: they are skipped, otherwise
        # they would end up in the returned set and make `drop` fail on a missing label.
        wrong_rows = error.failure_cases["index"].dropna().unique()
        if clean_catalog:
            self._data.drop(wrong_rows, inplace=True)
            warnings.warn(
                f"The following rows have incorrect data, and are removed from dataframe: {wrong_rows}"
            )
        else:
            warnings.warn(
                f"The following rows have incorrect data, but are NOT removed from dataframe: {wrong_rows}"
            )
    return set(wrong_rows)

get

get(keys: list | str) -> DataFrame

Get rows from a list of keys.

If a key is present several times in the keys argument, the returned dataframe will contain the corresponding row as many times as requested.

If any of the requested keys were to match several rows, all matching rows would be returned.

Raises:

Type Description
KeyError

if any of the requested keys doesn't match any row in the input data

Parameters:

Name Type Description Default

keys

list

list of keys, or a single key given as a string

required

Returns:

Type Description
DataFrame

pd.DataFrame: requested rows

Source code in src/mechaphlowers/data/catalog/catalog.py
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
def get(self, keys: list | str) -> pd.DataFrame:
    """Get rows from a list of keys.

    If a key is present several times in the `keys` argument, the returned dataframe
    will contain the corresponding row as many times as requested.

    If any of the requested `keys` were to match several rows, all matching rows would
    be returned.

    Raises:
            KeyError: if any of the requested `keys` doesn't match any row in the input data

    Args:
            keys (list): list of keys

    Returns:
            pd.DataFrame: requested rows
    """
    if isinstance(keys, str):
        keys = [keys]
    elif not isinstance(keys, list):
        raise TypeError(
            f"Expected a list or str as argument for 'keys', got {type(keys)}"
        )
    try:
        return self._data.loc[keys]
    except KeyError as e:
        raise KeyError(
            f"Error when requesting catalog: {e.args[0]}. Try the .keys() method to gets the available keys?"
        ) from e

get_as_object

get_as_object(keys: list) -> Any

Get rows from a list of keys.

If a key is present several times in the keys argument, the returned dataframe will contain the corresponding row as many times as requested.

If any of the requested keys were to match several rows, all matching rows would be returned.

The type of the object returned depends on catalog_type. The mapping between catalog_type and object type is made by dictionary catalog_to_object

Raises:

Type Description
KeyError

if any of the requested keys doesn't match any row in the input data

Parameters:

Name Type Description Default

keys

list

list of keys

required

Returns:

Name Type Description
object Any

requested object, that depends on catalog_type

Source code in src/mechaphlowers/data/catalog/catalog.py
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
def get_as_object(self, keys: list) -> Any:
    """Get the rows matching `keys`, converted to the object type of this catalog.

    The rows are fetched with `get()`, so repeated keys yield repeated rows and a key
    matching several rows yields all of them.

    The conversion is looked up in the `catalog_to_object` dictionary using `catalog_type`.
    If the converted object is an `ElementArray`, the units of the catalog are attached to it
    through `add_units`.

    Args:
        keys (list): list of keys

    Returns:
        object: requested object, that depends on `catalog_type`

    Raises:
        KeyError: if `catalog_type` is "default_catalog" or is missing from `catalog_to_object`,
            or if any of the requested `keys` doesn't match any row in the input data
    """
    convertible = (
        self.catalog_type != "default_catalog"
        and self.catalog_type in catalog_to_object
    )
    if not convertible:
        raise KeyError(
            f"Catalog type '{self.catalog_type}' is not supported for get_as_object(). "
            "Supported types are: "
            f"{list(catalog_to_object.keys())[1:]}"
        )
    to_object = catalog_to_object[self.catalog_type]
    converted = to_object(self.get(keys))
    if isinstance(converted, ElementArray):
        converted.add_units(self.units_dict)
    return converted

keys

keys() -> list

Get the keys available in the catalog

Source code in src/mechaphlowers/data/catalog/catalog.py
251
252
253
def keys(self) -> list:
    """Return the index labels of the catalog data as a list."""
    catalog_index = self._data.index
    return catalog_index.to_list()

remove_duplicates

remove_duplicates(filename: str | PathLike) -> None

Remove duplicate rows, and warn if any duplicates found.

Parameters:

Name Type Description Default

filename

str | PathLike

filename of the csv data source (used only for logging a warning)

required
Source code in src/mechaphlowers/data/catalog/catalog.py
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
def remove_duplicates(self, filename: str | PathLike) -> None:
    """Remove duplicate rows, and warn if any duplicates found.

    Args:
        filename (str | PathLike): filename of the csv data source (used only for logging a warning)
    """
    # removing duplicate rows, and warn if any duplicates found
    duplicated = self._data.index.duplicated()
    if duplicated.any():
        self._data = self._data[~duplicated]
        logger.warning(
            f'Duplicate key indices found for catalog {filename}'
        )
        warnings.warn(
            f'Duplicate key indices found for catalog {filename}',
            DataWarning,
        )

rename_columns

rename_columns(
    key_column_name: str, rename_dict: dict
) -> None

Rename the columns and the index of the catalog

Parameters:

Name Type Description Default

key_column_name

str

name of the key index

required

rename_dict

dict

dictionary of all column names that need to be renamed. This can include the key index.

required
Source code in src/mechaphlowers/data/catalog/catalog.py
128
129
130
131
132
133
134
135
136
137
138
def rename_columns(self, key_column_name: str, rename_dict: dict) -> None:
    """Apply `rename_dict` to the column names, and to the index name when it is listed.

    Args:
        key_column_name (str): name of the key index
        rename_dict (dict): dictionary of all column names that need to be renamed. This can include the key index.
    """
    self._data = self._data.rename(columns=rename_dict)
    # the key column is the index, so it is not covered by the column renaming above
    try:
        new_index_name = rename_dict[key_column_name]
    except KeyError:
        return
    self._data.index.names = [new_index_name]

validate_types

validate_types(dtype_dict: dict) -> None

Validate the types of the dataframe. Boolean columns are not checked.

Parameters:

Name Type Description Default

dtype_dict

dict

dictionary of column names and their types, without the key index and the boolean columns.

required
Source code in src/mechaphlowers/data/catalog/catalog.py
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
def validate_types(self, dtype_dict: dict) -> None:
    """Validate the catalog data against a pandera schema built from `dtype_dict`.

    Every listed column is declared nullable. Values are coerced for `str`, `int` and
    `float` columns, but not for `bool` columns. The index is expected to be `str`.

    Args:
        dtype_dict (dict): dictionary of column names and their types, without the key index and the boolean columns.
    """
    coerce_by_type = {str: True, int: True, float: True, bool: False}

    column_schemas = {}
    for column_name, column_type in dtype_dict.items():
        column_schemas[column_name] = pa.Column(
            column_type,
            nullable=True,
            coerce=coerce_by_type[column_type],
        )

    schema = pa.DataFrameSchema(column_schemas, index=pa.Index(str))
    schema.validate(self._data)

build_catalog_from_yaml

build_catalog_from_yaml(
    yaml_filename: str | PathLike,
    rename=True,
    remove_duplicates=True,
    user_filepath: str | PathLike = DATA_BASE_PATH,
    separator: str = ',',
    decimal: str = '.',
) -> Catalog

Build a catalog from a yaml file.

Parameters:

Name Type Description Default

yaml_filename

str | PathLike

path to the yaml file

required

rename

bool

whether to rename columns according to the yaml file. Defaults to True.

True

remove_duplicates

bool

whether to remove duplicate rows. Defaults to True.

True

user_filepath

str | PathLike

path to the folder containing the yaml file. Defaults to internal data.

DATA_BASE_PATH

Returns:

Name Type Description
Catalog Catalog

a catalog instance with the data from the yaml file

Source code in src/mechaphlowers/data/catalog/catalog.py
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
def build_catalog_from_yaml(
    yaml_filename: str | PathLike,
    rename=True,
    remove_duplicates=True,
    user_filepath: str | PathLike = DATA_BASE_PATH,
    separator: str = ",",
    decimal: str = ".",
) -> Catalog:
    """Build a catalog from a yaml file.

    The yaml file provides the `csv_name`, `key_column_name` and `catalog_type` entries,
    and optionally the `columns` entry (list of `{column name: type}` items). Renaming and
    units are read with `fetch_dict_from_yaml` from the `columns_renaming` and
    `columns_units` entries. The csv file is looked up in the same `user_filepath` folder.

    Args:
        yaml_filename (str | PathLike): name of the yaml file, relative to `user_filepath`
        rename (bool): whether to rename columns according to the yaml file. Defaults to True.
        remove_duplicates (bool): whether to remove duplicate rows. Defaults to True.
        user_filepath (str | PathLike): path to the folder containing the yaml file. Defaults to internal data.
        separator (str): field separator of the csv file, forwarded to `Catalog`. Defaults to ",".
        decimal (str): decimal mark of the csv file, forwarded to `Catalog`. Defaults to ".".

    Returns:
        Catalog: a catalog instance with the data from the yaml file

    Raises:
        FileNotFoundError: if the yaml file does not exist in `user_filepath`
        KeyError: if a mandatory yaml entry is missing, or if a column type is not one of
            str, int, float or bool
    """
    # Path() makes a plain str folder work, as the annotation promises and as Catalog does
    yaml_filepath = Path(user_filepath) / yaml_filename  # type: ignore[operator]

    try:
        with open(yaml_filepath, "r") as file:
            raw_data_yaml: dict = yaml.safe_load(file)
    except FileNotFoundError as error:
        raise FileNotFoundError(f"File {yaml_filepath} not found") from error

    # types can be written as names in the yaml file, or already be python types
    string_to_type_converters = {
        "str": str,
        "int": int,
        "float": float,
        "bool": bool,
        str: str,
        int: int,
        float: float,
        bool: bool,
    }
    # fetch data for type validation
    if "columns" in raw_data_yaml:
        columns_types = {
            key: string_to_type_converters[value]
            for list_item in raw_data_yaml["columns"]
            for (key, value) in list_item.items()
        }
    else:
        columns_types = {}
    # fetch data for renaming columns
    # TODO: add test case no rename
    rename_dict = (
        fetch_dict_from_yaml("columns_renaming", raw_data_yaml)
        if rename
        else {}
    )
    # fetch data for input units
    units_dict = fetch_dict_from_yaml("columns_units", raw_data_yaml)
    catalog_type = raw_data_yaml["catalog_type"]
    return Catalog(
        raw_data_yaml["csv_name"],
        raw_data_yaml["key_column_name"],
        columns_types,
        rename_dict,
        units_dict,
        catalog_type,
        remove_duplicates,
        user_filepath,
        separator=separator,
        decimal=decimal,
    )

write_yaml_catalog_template

write_yaml_catalog_template(
    user_filepath: str | PathLike,
    template: Literal[
        'cable_catalog', 'support_catalog'
    ] = 'support_catalog',
) -> None

Write a yaml catalog template file in the user_filepath folder provided.

Parameters:

Name Type Description Default

user_filepath

str | PathLike

path to the existing folder where the yaml file will be written.

required

template

Literal['cable_catalog', 'support_catalog']

type of the catalog template. Defaults to "support_catalog".

'support_catalog'
Source code in src/mechaphlowers/data/catalog/catalog.py
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
def write_yaml_catalog_template(
    user_filepath: str | PathLike,
    template: Literal["cable_catalog", "support_catalog"] = "support_catalog",
) -> None:
    """Write a yaml catalog template file in the user_filepath folder provided.

    Args:
        user_filepath (str | PathLike): path to the folder where the yaml file will be written. Defaults to internal data.
        template (Literal['cable_catalog', 'support_catalog']): type of the catalog template. Defaults to "support_catalog".
    """
    map_catalog = {
        "cable_catalog": "sample_cable_database.yaml",
        "support_catalog": "sample_pylon_database.yaml",
    }

    try:
        filename = map_catalog[template]
    except KeyError as e:
        raise KeyError(
            f"Template '{template}' is not supported. Supported templates are: {list(map_catalog.keys())}"
        ) from e

    if isinstance(user_filepath, str):
        filepath = Path(user_filepath)
    elif isinstance(user_filepath, Path):
        filepath = user_filepath
    else:
        raise TypeError(
            f"Expected a str or Path as argument for 'user_filepath', got {type(user_filepath)}"
        )

    if not filepath.exists():
        raise FileNotFoundError(
            f"{user_filepath=} seems to be not a valid path. Please provide a valid path."
        )

    filepath = filepath / filename  # type: ignore[operator]

    with open(filepath, 'w') as file:
        yaml.dump(template, file)