Source code for sevnpy.io.regexutility

"""
==============================================================
Regex utility, (:mod:`sevnpy.io.regexutility`)
==============================================================

This module contains methods and utilities to use regex

"""

from typing import Union, Type, List, Optional, Any, Dict
import pandas as pd
import numpy as np
import re
#from ..utility import utility as ut
from .. import utility as ut


def regex_dataframe_from_string(string: str,
                                matchexpr: str,
                                columns: List[str],
                                columns_type: Optional[List[str]] = None) -> pd.DataFrame:
    """
    Apply a regex match to a string and store the captured values to a pandas DataFrame

    Parameters
    ----------
    string:
        string to read
    matchexpr:
        regex expression to match (passed to :func:`re.findall`)
    columns:
        name of the capturing groups from the regex match, one per captured item
    columns_type:
        type of the capturing groups from the regex match, one per column.
        If None the captured values are left as returned by the regex match.

    Returns
    -------
    matching_value: pandas DataFrame
        Return a pandas DataFrame storing the values matched by the regex pattern.
        If nothing matches, an empty DataFrame with the requested columns is
        returned and ``columns_type`` is not applied.

    Raises
    ------
    ValueError
        If there is at least one match and the number of columns differs from
        the number of captured items, or the number of column types differs
        from the number of columns.

    """
    matches = re.findall(matchexpr, string)

    # Nothing matched: the number of captured items cannot be checked here,
    # so just return an empty table with the requested column names.
    if len(matches) == 0:
        return pd.DataFrame({name: [] for name in columns})

    values = np.array(matches)
    # re.findall returns a flat list of strings (not tuples) when the pattern
    # has zero or one capturing group; promote it to a single-column table so
    # that the shape check below works instead of raising IndexError.
    if values.ndim == 1:
        values = values.reshape(-1, 1)

    if len(columns) != values.shape[1]:
        raise ValueError(
            f"The dimension of the columns in input {columns} is not consistent "
            f"with the dimension of the captured items")

    if columns_type is not None and len(columns_type) != len(columns):
        raise ValueError(
            f"The number of the column types in input {columns_type} is not consistent "
            f"with the number of the columns")

    _df = pd.DataFrame(values, columns=columns)
    if columns_type is not None:
        _df = _df.astype(dict(zip(columns, columns_type)))

    return _df
def regex_from_file(filename: str,
                    matchexpr: str,
                    columns: List[str],
                    columns_type: Optional[List[str]] = None) -> pd.DataFrame:
    """
    Apply a regex match to the content of a file and store the captured values to a pandas DataFrame

    The whole file is read in text mode and handed to
    :func:`regex_dataframe_from_string`.

    Parameters
    ----------
    filename:
        file to read
    matchexpr:
        regex expression to match
    columns:
        name of the capturing groups from the regex match
    columns_type:
        type of the capturing groups from the regex match

    Returns
    -------
    matching_value: pandas DataFrame
        Return a pandas DataFrame storing the values matched by the regex pattern.

    """
    with open(filename, "r") as handle:
        content = handle.read()

    return regex_dataframe_from_string(string=content,
                                       matchexpr=matchexpr,
                                       columns=columns,
                                       columns_type=columns_type)
def capturing(value) -> str:
    """Return the string (value) where value is the input"""
    return f"({value})"


def notcapturing(value) -> str:
    """Return the string (?:value) where value is the input"""
    return f"(?:{value})"
class ReTypeMatch:
    r"""
    This class is a pure static class used to retrieve the regex matching pattern for various types.
    At the moment the allowed types are:

        - **'int'**, **'type'** or **int**: matching pattern for an integer number
        - **'id'**: matching pattern for an ID type, i.e. a positive int
        - **'float'** or **float**: matching pattern for a float number
        - **'str'**, **'name'** or **str**: matching pattern for a name-like string

    Examples
    --------
    ReTypeMatch can be used directly as dictionary providing the type of matching pattern we
    want to retrieve, e.g.

    >>> ReTypeMatch["str"]
    '(?:[0-9|A-Za-z]*\\_)?[0-9]*'
    >>> ReTypeMatch[float]
    '[+|-]?[0-9]+\\.?[0-9]*(?i:e)?[+|-]?[0-9]*|(?i:nan)'

    It is also possible to guess the matching pattern for a given generic input using the
    class method guess

    >>> ReTypeMatch.guess("hello world")
    '(?:[0-9|A-Za-z]*\\_)?[0-9]*'
    >>> ReTypeMatch.guess("3e10")
    '[+|-]?[0-9]+\\.?[0-9]*(?i:e)?[+|-]?[0-9]*|(?i:nan)'
    >>> ReTypeMatch.guess(13.2)
    '[+|-]?[0-9]+\\.?[0-9]*(?i:e)?[+|-]?[0-9]*|(?i:nan)'
    >>> ReTypeMatch.guess(2)
    '[+|-]?\\d+'

    """

    # NOTE(review): inside a character class "|" is a literal character, so
    # "[+|-]" matches "+", "|" or "-". Patterns are kept as they are because
    # changing them would change what callers match.
    matchtype = r"[+|-]?\d+"
    # This pattern has a top-level alternation ("number" or "nan"): wrap it in
    # a group (see capturing/notcapturing) before embedding it in a larger regex.
    matchnum = r"[+|-]?[0-9]+\.?[0-9]*(?i:e)?[+|-]?[0-9]*|(?i:nan)"
    matchid = r"[0-9]+"
    matchname = r"(?:[0-9|A-Za-z]*\_)?[0-9]*"

    # Lookup table used by ReTypeMatch[key]; keys are either names or builtin types
    transform_dict = {
        "int": matchtype,
        "float": matchnum,
        "id": matchid,
        "type": matchtype,
        "str": matchname,
        "name": matchname,
        int: matchtype,
        float: matchnum,
        str: matchname,
    }

    @classmethod
    def guess(cls, guess_value: Union[int, float, str]) -> str:
        """
        Guess the regex pattern to match in the input value

        Parameters
        ----------
        guess_value: int|float|str
            Input value for which we want to guess the related pattern matching.
            A string made only of digits is treated as an int, a string that
            ``ut.str_is_float`` accepts is treated as a float, any other string
            is treated as a str.

        Returns
        -------
        regex_pattern: str
            The regex pattern matching for the input value

        Raises
        ------
        ValueError
            If the input value is not an int, a float or a str

        """
        if isinstance(guess_value, int):
            return cls.transform_dict[int]
        if isinstance(guess_value, float):
            return cls.transform_dict[float]
        if isinstance(guess_value, str):
            # Order matters: the digits-only check has to come before the
            # float check, then everything else falls back to the str pattern.
            if guess_value.isdigit():
                return cls.transform_dict[int]
            if ut.str_is_float(guess_value):
                return cls.transform_dict[float]
            return cls.transform_dict[str]

        raise ValueError(f"Regex pattern match for input value {guess_value} cannot be guessed")

    def __class_getitem__(cls, key: Union[str, Type[int], Type[float], Type[str]]) -> str:
        """
        Return the matching pattern stored for key, so that ReTypeMatch[key] works

        Raises
        ------
        KeyError
            If key is not one of the keys of transform_dict

        """
        try:
            return cls.transform_dict[key]
        except KeyError:
            # "from None" hides the inner bare KeyError: the message below
            # already says everything the caller needs.
            raise KeyError(
                f"key '{key}' not available. Available keys are {list(cls.transform_dict.keys())}"
            ) from None

    @classmethod
    def capturing(cls, key: Union[str, Type[int], Type[float], Type[str]]) -> str:
        r"""
        Get the regex matching pattern considering a capturing group

        Parameters
        ----------
        key: str or type int,str,float
            key for which we want to get the matching pattern

        Returns
        -------
        matching_pattern: str
            return the matching pattern for a capturing group (match_pattern)

        Examples
        --------
        >>> ReTypeMatch.capturing(float)
        '([+|-]?[0-9]+\\.?[0-9]*(?i:e)?[+|-]?[0-9]*|(?i:nan))'

        """
        # Inside the method body the bare name resolves to the module-level
        # helper, not to this classmethod.
        return capturing(cls[key])

    @classmethod
    def notcapturing(cls, key: Union[str, Type[int], Type[float], Type[str]]) -> str:
        r"""
        Get the regex matching pattern considering a non-capturing group

        Parameters
        ----------
        key: str or type int,str,float
            key for which we want to get the matching pattern

        Returns
        -------
        matching_pattern: str
            return the matching pattern for a non-capturing group (?:match_pattern)

        Examples
        --------
        >>> ReTypeMatch.notcapturing(float)
        '(?:[+|-]?[0-9]+\\.?[0-9]*(?i:e)?[+|-]?[0-9]*|(?i:nan))'

        """
        # Inside the method body the bare name resolves to the module-level
        # helper, not to this classmethod.
        return notcapturing(cls[key])