Skip to content

Utils

get_date_from_file(filename)

Extract date from filename.

Parameters:

Name Type Description Default
filename str

The filename to extract the date from.

required

Returns:

Type Description
datetime

datetime: The date extracted from the filename.

Source code in rs_tools/_src/datamodule/utils.py
def get_date_from_file(filename: str) -> datetime:
    """
    Extract date from filename.

    The timestamp is taken from the part of the filename that precedes the
    first underscore and must be formatted as ``%Y%m%d%H%M%S``
    (e.g. ``20200101123045_<anything>``). The string is parsed as given, so
    pass a bare filename rather than a path with directory components.

    Args:
        filename (str): The filename to extract the date from.

    Returns:
        datetime: The date extracted from the filename.

    Raises:
        ValueError: If the leading part of the filename does not match
            ``%Y%m%d%H%M%S``.
    """
    # Everything before the first "_" is the timestamp.
    timestamp = filename.split("_")[0]
    return datetime.strptime(timestamp, "%Y%m%d%H%M%S")

get_dates_from_files(filenames)

Extract dates from a list of filenames.

Parameters:

Name Type Description Default
filenames List[str]

A list of filenames.

required

Returns:

Type Description
List[datetime]

List[datetime]: A list of dates extracted from the filenames.

Source code in rs_tools/_src/datamodule/utils.py
def get_dates_from_files(filenames: List[str]) -> List[datetime]:
    """
    Extract dates from a list of filenames.

    Each filename must start with a ``%Y%m%d%H%M%S`` timestamp followed by an
    underscore (or nothing else), e.g. ``20200101123045_<anything>``.

    Args:
        filenames (List[str]): A list of filenames.

    Returns:
        List[datetime]: The dates extracted from the filenames, in the same
            order as the input. An empty input yields an empty list.

    Raises:
        ValueError: If the leading part of any filename does not match
            ``%Y%m%d%H%M%S``.
    """
    date_format = "%Y%m%d%H%M%S"
    # Everything before the first "_" in each filename is the timestamp.
    return [datetime.strptime(name.split("_")[0], date_format) for name in filenames]

get_split(files, split_dict)

Split files based on dataset specification.

Parameters:

Name Type Description Default
files List

A list of files to be split.

required
split_dict DictConfig

A dictionary-like object containing the dataset specification.

required

Returns:

Type Description
List

List: The files whose dates match the split specification.

Source code in rs_tools/_src/datamodule/utils.py
def get_split(files: List,
              split_dict: DictConfig) -> List:
    """
    Select the files whose dates match a split specification.

    The date of each file is parsed from its basename (the text after the
    last ``/``) via ``get_dates_from_files``. A file is kept when its year,
    month and day are each contained in the corresponding ``years``,
    ``months`` and ``days`` entries of ``split_dict``. An entry that is
    absent or ``None`` places no restriction on that date component.

    ``split_dict`` is only read; it is not modified.

    Args:
        files (List): A list of file paths to be filtered.
        split_dict (DictConfig): A dictionary-like object that may contain
            the keys ``"years"``, ``"months"`` and ``"days"``, each holding a
            collection of allowed integer values (or ``None``).

    Returns:
        List: The entries of ``files`` that satisfy the specification, in
            their original order.

    Raises:
        ValueError: If ``files`` is empty or no file matches the
            specification.
    """
    # An empty input would otherwise fail later with an opaque AttributeError
    # from the ``.dt`` accessor on a non-datetime column.
    if len(files) == 0:
        raise ValueError("No files found. Check split specification.")

    # Extract dates from filenames
    filenames = [file.split("/")[-1] for file in files]
    dates = get_dates_from_files(filenames)
    # Convert to dataframe for easier manipulation
    df = pd.DataFrame({"filename": filenames, "files": files, "date": dates})

    date_components = {
        "years": df.date.dt.year,
        "months": df.date.dt.month,
        "days": df.date.dt.day,
    }

    # Build the selection mask one date component at a time. An unspecified
    # component matches every file, which is equivalent to allowing all of
    # its unique values, without writing those defaults back into the
    # caller's configuration.
    condition = pd.Series(True, index=df.index)
    for unit, values in date_components.items():
        if unit not in split_dict.keys() or split_dict[unit] is None:
            logger.info(f"No {unit} specified for split. Using all {unit}.")
            continue
        condition &= values.isin(split_dict[unit])

    # Extract files based on conditions
    split_files = df[condition].files.tolist()

    # Check if files are allocated properly
    if len(split_files) == 0:
        raise ValueError("No files found. Check split specification.")

    return split_files

split_train_val(files, split_spec)

Split files into training and validation sets based on dataset specification.

Parameters:

Name Type Description Default
files List

A list of files to be split.

required
split_spec DictConfig

A dictionary-like object containing the dataset specification.

required

Returns:

Type Description
Tuple[List, List]

Tuple[List, List]: A tuple containing two lists: the training set and the validation set.

Source code in rs_tools/_src/datamodule/utils.py
def split_train_val(files: List, split_spec: DictConfig) -> Tuple[List, List]:
    """
    Partition files into a training set and a validation set.

    The ``"train"`` and ``"val"`` entries of ``split_spec`` are each passed,
    together with the full ``files`` list, to ``get_split``; the two results
    are returned as a pair.

    Args:
        files (List): A list of files to be split.
        split_spec (DictConfig): A dictionary-like object that must contain
            the keys ``"train"`` and ``"val"``, each holding the
            specification handed to ``get_split``.

    Returns:
        Tuple[List, List]: ``(train_files, val_files)``.

    Raises:
        ValueError: If ``split_spec`` lacks a ``"train"`` or ``"val"`` key.
    """
    required_keys = ("train", "val")
    available_keys = split_spec.keys()
    if not all(key in available_keys for key in required_keys):
        raise ValueError("split_spec must contain 'train' and 'val' keys")

    # Evaluated in order: the training split first, then the validation split.
    train_files, val_files = (get_split(files, split_spec[key]) for key in required_keys)
    return train_files, val_files