Machine learning — full end-to-end pipeline (load → clean → preprocess → train → evaluate)
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso, LinearRegression, LogisticRegression, Ridge
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
Step 1 — loading the data and running quick sanity checks
# 1. Load and quick check.
# NOTE(review): `data_path` is not defined anywhere in this file — it must
# be set before this point (e.g. a constant or CLI argument); confirm.
df = pd.read_csv(data_path)

# Bare expressions like `df.head()` only display output in a REPL/notebook;
# wrap in print() so the checks are also visible when run as a script.
print(df.head())          # first rows
df.info()                 # dtypes and non-null counts (prints directly)
print(df.describe())      # summary statistics for numeric columns
print(df.isnull().sum())  # missing-value count per column
These are the code parts of what we are building.
# 2. Data cleaning (imputes missing values in `df` in place).
# Numerical columns: fill missing values with each column's median.
num_cols = df.select_dtypes(include=[np.number]).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Categorical columns: fill missing values with the most frequent value.
cat_cols = df.select_dtypes(include=['object']).columns
for col in cat_cols:
    # mode() returns an empty Series for an all-NaN column, so indexing
    # [0] unconditionally would raise IndexError — guard first.
    col_mode = df[col].mode()
    if not col_mode.empty:
        df[col] = df[col].fillna(col_mode[0])
Separating the features from the target
# 4. Separate features and target.
# NOTE(review): 'customer_id' and 'target_column' look like placeholder
# names — replace them with the real dataset columns before running.
X = df.drop(['customer_id', 'target_column'], axis=1)
y = df['target_column']

# Column lists consumed by the ColumnTransformer below.
numerical_cols = X.select_dtypes(include=[np.number]).columns.tolist()
# fixed: was `categorical-cols`, which is `categorical - cols` (a
# subtraction expression), i.e. a SyntaxError on assignment.
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
Pipeline creation
# 5. Preprocessing pipelines.
# Numerical pipeline: median imputation, then standardization.
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical pipeline: constant-fill missing values, then one-hot encode.
categoric_pipeline = Pipeline([
    # fixed: the SimpleImputer keyword is `fill_value`, not `fill_values`
    # (the original raised TypeError).
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    # handle_unknown='ignore': categories unseen at fit time encode to
    # an all-zeros row instead of raising.
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column list through its pipeline.
preprocessor = ColumnTransformer([
    # fixed: was `numeric_cols` (undefined NameError); the list built in
    # the feature/target step is `numerical_cols`.
    ('num', numeric_pipeline, numerical_cols),
    ('cat', categoric_pipeline, categorical_cols),
])
Train test split code
# 6. Train/test split, model fit, and evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Crude task detection: few distinct target values -> classification.
# NOTE(review): this heuristic is fragile — confirm the actual task type.
is_classification = y.nunique() < 10

model = Pipeline([
    ('preprocessor', preprocessor),
    # fixed: was `RandomforestClassifier` (NameError: wrong capitalization);
    # also, the original always fitted a classifier even when the metric
    # branch below treated the task as regression — pick the estimator to
    # match the task.
    ('classifier',
     RandomForestClassifier(n_estimators=100, random_state=42)
     if is_classification
     else RandomForestRegressor(n_estimators=100, random_state=42)),
])
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

if is_classification:
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
else:
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    # fixed: the original f-string used parentheses `(r2:.4f)`, which
    # printed the literal text instead of the value.
    print(f"r2: {r2:.4f}, RMSE: {rmse:.4f}")
Feature-engineering checklist:
1. Handling missing values (mean/median/mode imputation, or dropping)
2. Encoding categorical values (one-hot, label)
3. Feature scaling
4. New feature creation
5. Handling outliers
6. Train/test split
The above gives the full pathway of the code.
How do we choose the right model?
# Model selection — candidate classifiers to compare (e.g. via
# cross-validation).
# NOTE(review): this dict was originally bound to `models`, the same name
# the regression dict below rebinds, so the classifiers were silently
# shadowed — give it a distinct name.
classification_models = {
    'logistic': LogisticRegression(max_iter=1000),
    'knn': KNeighborsClassifier(n_neighbors=5),
    'decision_tree': DecisionTreeClassifier(max_depth=5),
    'random_forest': RandomForestClassifier(n_estimators=100),
}
models = classification_models  # original name kept for compatibility
# Candidate regressors to compare.
# NOTE(review): originally this rebound the name `models`, silently
# overwriting the classification dict above — give it a distinct name.
regression_models = {
    'linear': LinearRegression(),
    'ridge': Ridge(alpha=1.0),
    'decision_tree': DecisionTreeRegressor(max_depth=5),
    'random_forest': RandomForestRegressor(n_estimators=100),
}
models = regression_models  # original name kept for compatibility
Review the above code: where and how should we add more steps, and what additional feature engineering could we apply?