from ucimlrepo import fetch_ucirepo # источник данных, тут могла быть sqlalchemyimport pandas as pd # библиотека для работы с датафреймамиimport ipytest # запуск тестов в ноутбукахfrom pathlib import Path # работа с путямиimport logging # логированиеfrom datetime import datetime # работа с датамиimport ipynbname # для получения названия ноутбукаfrom typing import Optional # для аннотации типов
# Configure logging for the notebook: timestamped records at INFO and above.
logging.basicConfig(
    level=logging.INFO,  # minimum severity that is emitted
    format='%(asctime)s %(levelname)s %(message)s',  # timestamp, level, message
    datefmt='%Y-%m-%d %H:%M:%S',  # rendering of %(asctime)s
)

# Logger named after the current module/notebook namespace.
logger = logging.getLogger(__name__)
logger.info("Logger has been configured successfully.")
# Output: 2024-11-04 19:36:07 INFO Logger has been configured successfully.
# Project layout, resolved relative to the current working directory.
# NOTE(review): assumes the notebook is executed from a folder two levels
# below the project root - confirm against the repository layout.
ROOT = Path.cwd().parent.parent  # absolute path to the project folder
DATA = ROOT.joinpath("data")  # data folder
DATA_RAW = DATA.joinpath("raw")  # storage for raw data
def get_timestamp() -> str:
    """Return the current time formatted as ``YYYY-MM-DD-HH-MM-SS``.

    The value comes from ``datetime.now()`` without a timezone argument.

    Returns:
        str: The formatted timestamp.
    """
    # A datetime format spec inside an f-string is forwarded to strftime.
    return f"{datetime.now():%Y-%m-%d-%H-%M-%S}"
def get_filename(
    fname: Optional[str] = None,
    extension: str = 'parquet',
) -> str:
    """Build a timestamped file name of the form ``<base>_<timestamp>.<extension>``.

    Args:
        fname (Optional[str]): Base name of the file. When it is ``None`` or an
            empty string, the name reported by ``ipynbname.name()`` is used.
        extension (str): Extension appended after a dot. Default is 'parquet'.

    Returns:
        str: The base name, the value of ``get_timestamp()`` and the extension
        joined as ``<base>_<timestamp>.<extension>``.
    """
    # Any falsy fname (None or "") falls back to the notebook name.
    stem: str = fname or ipynbname.name()
    return "{}_{}.{}".format(stem, get_timestamp(), extension)
%%ipytestdef test_no_duplicates():assert df.duplicated().sum() == 0
# Save the dataframe to the raw-data folder under a timestamped file name.
fname = get_filename()  # file name
logger.info("Saving data to a file: %s", fname)
try:
    df.to_parquet(DATA_RAW / fname)  # write the parquet file
except Exception:
    # Best-effort save: the notebook keeps running, but the traceback is
    # recorded so the failure can be diagnosed. Catching Exception (not a bare
    # except) lets KeyboardInterrupt / SystemExit propagate as usual.
    logger.exception("Error. Data was not saved.")
else:
    # Only reached when the write succeeded.
    logger.info("Data saved to %s", DATA_RAW)
# Output: 2024-11-04 19:36:13 INFO Saving data to a file: wine_data_2024-11-04-19-36-13.parquet
# Output: 2024-11-04 19:36:14 INFO Data saved to E:\edu\wine_project\data\raw
# Reproducible split: a 70% random sample becomes the training part,
# every row whose index label was not sampled becomes the test part.
train = df.sample(frac=0.7, random_state=123).copy()
test = df.loc[~df.index.isin(train.index)].copy()
def add_coefficients(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` extended with pairwise ratio features.

    For every selected ``(numerator, denominator)`` pair a column named
    ``<numerator>_DIV_<denominator>`` is added, holding the element-wise
    quotient of the two source columns. The pairs are processed in a fixed
    order, so the order of the new columns is the same on every run.

    Args:
        df (pd.DataFrame): Frame containing all columns named in the pairs.
            It is not modified.

    Returns:
        pd.DataFrame: A copy of ``df`` with the ratio columns appended.

    Raises:
        KeyError: If a column referenced by one of the pairs is missing.
    """
    # An ordered tuple (not a set) keeps the column order deterministic;
    # iterating a set of string tuples depends on per-process hash seeding.
    selected_pairs = (
        ('fixed_acidity', 'alcohol'),
        ('volatile_acidity', 'alcohol'),
        ('chlorides', 'pH'),
        ('chlorides', 'alcohol'),
        ('total_sulfur_dioxide', 'sulphates'),
        ('density', 'alcohol'),
        ('pH', 'alcohol'),
        ('residual_sugar', 'chlorides'),
        ('residual_sugar', 'density'),
        ('residual_sugar', 'sulphates'),
        ('total_sulfur_dioxide', 'density'),
        ('pH', 'sulphates'),
        ('fixed_acidity', 'free_sulfur_dioxide'),
        ('volatile_acidity', 'free_sulfur_dioxide'),
    )

    enriched = df.copy()  # never mutate the caller's frame
    for numerator, denominator in selected_pairs:
        # No special handling of zero denominators is done here.
        enriched[f"{numerator}_DIV_{denominator}"] = (
            enriched[numerator] / enriched[denominator]
        )
    return enriched
# Export the coefficient metrics table to a timestamped Excel file.
# NOTE(review): 'coefficients_mentrcs' looks like a typo of
# 'coefficients_metrics'; it is kept as is because it determines the name of
# the file written to disk.
fname = get_filename(fname='coefficients_mentrcs', extension='xlsx')
coef_metrics.to_excel(DATA_CLEAN.joinpath(fname))
logger.info("%s saved to data/clean", fname)
# Write the prepared dataframe to the processed-data folder; with no
# arguments get_filename() derives the base name from the notebook name.
fname = get_filename()
df_to_save.to_parquet(DATA_PROCESSED.joinpath(fname))
logger.info("%s saved to data/processed", fname)
# Load a specific processed snapshot and split it by the stored is_train flag
# (1 selects training rows, 0 selects test rows).
df = pd.read_parquet(
    DATA_PROCESSED.joinpath('enrich_features_2024-11-06-11-28-24.parquet')
)
train = df.loc[df["is_train"] == 1].copy()
test = df.loc[df["is_train"] == 0].copy()
# Model inputs: the base measurements followed by the derived *_DIV_* ratios.
x_columns = [
    'fixed_acidity',
    'volatile_acidity',
    'citric_acid',
    'residual_sugar',
    'chlorides',
    'free_sulfur_dioxide',
    'total_sulfur_dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
    'density_DIV_alcohol',
    'residual_sugar_DIV_density',
    'volatile_acidity_DIV_free_sulfur_dioxide',
    'fixed_acidity_DIV_alcohol',
    'residual_sugar_DIV_sulphates',
    'pH_DIV_sulphates',
    'chlorides_DIV_alcohol',
    'total_sulfur_dioxide_DIV_density',
    'fixed_acidity_DIV_free_sulfur_dioxide',
    'chlorides_DIV_pH',
    'pH_DIV_alcohol',
    'total_sulfur_dioxide_DIV_sulphates',
    'residual_sugar_DIV_chlorides',
    'volatile_acidity_DIV_alcohol',
]

# Feature matrices and target vectors; the target is the `quality` column.
X_train = train.loc[:, x_columns].copy()
y_train = train["quality"].values.copy()
X_test = test.loc[:, x_columns].copy()
y_test = test["quality"].values.copy()

# Shallow decision tree with a fixed seed so the fit is reproducible.
pipe = DecisionTreeClassifier(
    max_depth=4,
    min_impurity_decrease=0.001,
    random_state=123,
)
pipe.fit(X_train, y_train)
# Persist the fitted model under a timestamped file name in ARTIFACTS.
# NOTE(review): `dump` is presumably joblib.dump (the import is not visible
# in this part of the file) - confirm.
fname = get_filename(extension='joblib')
dump(pipe, ARTIFACTS.joinpath(fname))
logger.info('saved to %s', fname)
# Output: 2024-12-01 17:56:41 INFO saved to classifier_2024-12-01-17-56-41.joblib