Machine learning: full pipeline walkthrough
05:50 26 Mar 2026
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

Loading the data

# 1. Load and quick check
# NOTE(review): `data_path` is never defined anywhere in this script —
# set it to your CSV file path before running.
df = pd.read_csv(data_path)

# Bare expressions like `df.head()` only display output in a notebook/REPL;
# in a plain script they are silently discarded, so print them explicitly.
print(df.head())
df.info()  # info() writes directly to stdout, no print needed
print(df.describe())
print(df.isnull().sum())

These are the code sections of the pipeline we are building.

# 2. Data cleaning

# Numeric columns: impute missing values with the column median
# (robust to outliers, unlike the mean).
numeric_columns = df.select_dtypes(include=[np.number]).columns
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].median())

# Categorical (object-dtype) columns: impute with the most frequent value.
object_columns = df.select_dtypes(include=['object']).columns
df[object_columns] = df[object_columns].apply(lambda s: s.fillna(s.mode()[0]))

Separating features and target

# 4. Separate features and target
# Drop the ID column (no predictive value) along with the target itself.
X = df.drop(['customer_id', 'target_column'], axis=1)
y = df['target_column']

numerical_cols = X.select_dtypes(include=[np.number]).columns.tolist()
# BUG FIX: `categorical-cols` is not a valid Python identifier (a hyphen is
# parsed as subtraction and raises a SyntaxError); use an underscore.
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

Pipeline creation

# 5. Pipeline creation

# Numerical pipeline: median imputation followed by standardization.
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical pipeline: constant-fill imputation followed by one-hot encoding.
# BUG FIX: the SimpleImputer keyword is `fill_value`, not `fill_values`
# (the latter raises a TypeError).
categoric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# BUG FIX: the numeric column list is defined as `numerical_cols`;
# `numeric_cols` was an undefined name (NameError at runtime).
preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numerical_cols),
    ('cat', categoric_pipeline, categorical_cols)
])

Train test split code

# 6. Train/test split, model fitting, and evaluation

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Heuristic: few distinct target values -> classification, otherwise
# regression. NOTE(review): the original always fit a classifier but then
# sometimes scored it with regression metrics; the estimator choice must
# match the task, so decide the task first.
is_classification = len(set(y)) < 10

# BUG FIX: the class is `RandomForestClassifier` (capital F);
# `RandomforestClassifier` is a NameError.
estimator = (
    RandomForestClassifier(n_estimators=100, random_state=42)
    if is_classification
    else RandomForestRegressor(n_estimators=100, random_state=42)
)

model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', estimator)
])

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

if is_classification:
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
else:
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    # BUG FIX: `(r2:.4f)` used parentheses and printed literally;
    # f-string replacement fields need braces.
    print(f"r2: {r2:.4f}, RMSE: {rmse:.4f}")
1. Handling missing values (mean/median/mode imputation, dropping)
2. Encoding categorical values (one-hot, label)
3. Feature scaling
4. New feature creation
5. Handling outliers
6. Train/test split

The above gives the full pathway of the code.
How do I choose the right model?

# Model selection
# BUG FIX: both dicts were assigned to the same name `models`, so the
# regression dict silently overwrote the classification one. Give each a
# distinct name.
# NOTE(review): LogisticRegression, DecisionTreeClassifier and
# DecisionTreeRegressor must be imported (sklearn.linear_model /
# sklearn.tree) — they were missing from the original import list.

# Candidate classifiers to compare.
classification_models = {
    'logistic': LogisticRegression(max_iter=1000),
    'knn': KNeighborsClassifier(n_neighbors=5),
    'decision_tree': DecisionTreeClassifier(max_depth=5),
    'random_forest': RandomForestClassifier(n_estimators=100),
}

# Candidate regressors to compare.
regression_models = {
    'linear': LinearRegression(),
    'ridge': Ridge(alpha=1.0),
    'decision_tree': DecisionTreeRegressor(max_depth=5),
    'random_forest': RandomForestRegressor(n_estimators=100),
}

# Backward-compatible alias: after the original code ran, `models` held the
# regression dict (the last assignment), so keep that name pointing there.
models = regression_models

Please review the above code: where and how should I add more steps, and what additional feature engineering could I apply?

python machine-learning scikit-learn data-science sklearn-pandas