Spaceship Titanic

Shows the usage of the aiking library on a Kaggle dataset

Import Public Packages

import fastcore
import pandas as pd
import pathlib
from fastcore.all import *
from fastcore.imports import *
import os
import sys
import sklearn
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold, train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, plot_confusion_matrix, confusion_matrix
from IPython.display import display
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
Import Private Packages
is_kaggle = 'kaggle_secrets' in sys.modules
if is_kaggle:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    os.environ['KAGGLE_USERNAME'] = user_secrets.get_secret("kaggle_username")
    if not os.environ['KAGGLE_USERNAME']: raise Exception("Please insert your Kaggle username and key into Kaggle secrets")
    os.environ['KAGGLE_KEY'] = user_secrets.get_secret("kaggle_key")
    github_pat = user_secrets.get_secret("GITHUB_PAT")
    !pip install -Uqq git+https://{github_pat}@github.com/Rahuketu86/aiking
else:
    from aiking.data.external import *
    path = untar_data("kaggle_competitions::spaceship-titanic"); path.ls()

from aiking.ml.structured import *
from aiking.integrations.kaggle import push2kaggle
Read the Dataset
data_dir = pathlib.Path(os.getenv('DATA_DIR', "/kaggle/input"))
path = data_dir/"spaceship-titanic"
path.ls()
(#3) [Path('/kaggle/input/spaceship-titanic/sample_submission.csv'),Path('/kaggle/input/spaceship-titanic/test.csv'),Path('/kaggle/input/spaceship-titanic/train.csv')]
# !rm -rf {path}
= pd.read_csv(path/"train.csv"); df_train.head().T df_train
|  | 0 | 1 | 2 | 3 | 4 |
|---|---|---|---|---|---|
PassengerId | 0001_01 | 0002_01 | 0003_01 | 0003_02 | 0004_01 |
HomePlanet | Europa | Earth | Europa | Europa | Earth |
CryoSleep | False | False | False | False | False |
Cabin | B/0/P | F/0/S | A/0/S | A/0/S | F/1/S |
Destination | TRAPPIST-1e | TRAPPIST-1e | TRAPPIST-1e | TRAPPIST-1e | TRAPPIST-1e |
Age | 39.0 | 24.0 | 58.0 | 33.0 | 16.0 |
VIP | False | False | True | False | False |
RoomService | 0.0 | 109.0 | 43.0 | 0.0 | 303.0 |
FoodCourt | 0.0 | 9.0 | 3576.0 | 1283.0 | 70.0 |
ShoppingMall | 0.0 | 25.0 | 0.0 | 371.0 | 151.0 |
Spa | 0.0 | 549.0 | 6715.0 | 3329.0 | 565.0 |
VRDeck | 0.0 | 44.0 | 49.0 | 193.0 | 2.0 |
Name | Maham Ofracculy | Juanna Vines | Altark Susent | Solam Susent | Willy Santantines |
Transported | False | True | False | False | True |
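The raw training columns contain some missing values; the pipeline built below imputes them with SimpleImputer. A quick per-column count:

# Missing values per raw training column; these are handled later by the pipeline's imputer.
df_train.isna().sum()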
= pd.read_csv(path/"test.csv"); df_test.head().T df_test
|  | 0 | 1 | 2 | 3 | 4 |
|---|---|---|---|---|---|
PassengerId | 0013_01 | 0018_01 | 0019_01 | 0021_01 | 0023_01 |
HomePlanet | Earth | Earth | Europa | Europa | Earth |
CryoSleep | True | False | True | False | False |
Cabin | G/3/S | F/4/S | C/0/S | C/1/S | F/5/S |
Destination | TRAPPIST-1e | TRAPPIST-1e | 55 Cancri e | TRAPPIST-1e | TRAPPIST-1e |
Age | 27.0 | 19.0 | 31.0 | 38.0 | 20.0 |
VIP | False | False | False | False | False |
RoomService | 0.0 | 0.0 | 0.0 | 0.0 | 10.0 |
FoodCourt | 0.0 | 9.0 | 0.0 | 6652.0 | 0.0 |
ShoppingMall | 0.0 | 0.0 | 0.0 | 0.0 | 635.0 |
Spa | 0.0 | 2823.0 | 0.0 | 181.0 | 0.0 |
VRDeck | 0.0 | 0.0 | 0.0 | 585.0 | 0.0 |
Name | Nelly Carsoning | Lerome Peckers | Sabih Unhearfus | Meratz Caltilter | Brence Harperez |
Feature Engineering
def split_col(X, splitter=" "): return X.squeeze().str.split(splitter, expand=True).apply(pd.to_numeric, errors='ignore', downcast='integer')
split_col(df_train[['PassengerId']], splitter='_')
|  | 0 | 1 |
|---|---|---|
0 | 1 | 1 |
1 | 2 | 1 |
2 | 3 | 1 |
3 | 3 | 2 |
4 | 4 | 1 |
... | ... | ... |
8688 | 9276 | 1 |
8689 | 9278 | 1 |
8690 | 9279 | 1 |
8691 | 9280 | 1 |
8692 | 9280 | 2 |
8693 rows × 2 columns
passenger_transformer = ColExpanderTransform(names=['Passenger_gggg', 'Passenger_nn'], func=split_col, func_kw_args={"splitter":"_"})
display(passenger_transformer.fit_transform(df_train[['PassengerId']]), passenger_transformer.get_feature_names())
|  | 0 | 1 |
|---|---|---|
0 | 1 | 1 |
1 | 2 | 1 |
2 | 3 | 1 |
3 | 3 | 2 |
4 | 4 | 1 |
... | ... | ... |
8688 | 9276 | 1 |
8689 | 9278 | 1 |
8690 | 9279 | 1 |
8691 | 9280 | 1 |
8692 | 9280 | 2 |
8693 rows × 2 columns
['Passenger_gggg', 'Passenger_nn']
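ColExpanderTransform here simply applies the split_col helper defined above, which is essentially str.split(expand=True) plus numeric coercion. For comparison, a plain-pandas sketch of the same expansion (illustration only; the transformer is what plugs into the pipeline below):

# Sketch: the equivalent expansion with plain pandas; split_col additionally coerces the parts to numeric.
expanded = df_train['PassengerId'].str.split('_', expand=True)
expanded.columns = ['Passenger_gggg', 'Passenger_nn']
expanded.head()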
def calc_service_cost(X, cols=[]):
    return X[cols].sum(axis=1).to_frame()

cols_sc = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
calc_service_cost(df_train, cols_sc).head()
|  | 0 |
|---|---|
0 | 0.0 |
1 | 736.0 |
2 | 10383.0 |
3 | 5176.0 |
4 | 1091.0 |
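Each value is just the row-wise sum of the five spend columns; for example, row 2 above is 43 + 3576 + 0 + 6715 + 49 = 10383. A quick spot-check:

# Spot-check: the computed total for passenger 2 equals the raw spend columns summed directly.
assert calc_service_cost(df_train, cols_sc).iloc[2, 0] == df_train.loc[2, cols_sc].sum() == 10383.0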
layer_spec_preprocess = (gen_feature_layer,
                         {'feature_specs':{
                             'PassengerId':(ColExpanderTransform, {'names':['gggg', 'nn'], 'func':split_col, 'func_kw_args':{"splitter":"_"}}),
                             'Cabin':(ColExpanderTransform, {'names':['deck', 'num', 'side'], 'func':split_col, 'func_kw_args':{"splitter":"/"}}),
                             'Name':(ColExpanderTransform, {'names':['first', 'last'], 'func':split_col, 'func_kw_args':{"splitter":" "}}),
                         }})

layer_spec_calc = (gen_feature_layer,
                   {'feature_specs':{
                       str(cols_sc):(calc_service_cost, {'cols':cols_sc}, {"alias":'ServiceCost'})
                   }})

layer_specs = [layer_spec_preprocess, layer_spec_calc]
proc = Proc(layer_specs=layer_specs)
proc.fit_transform(df_train)
|  | RoomService_FoodCourt_ShoppingMall_Spa_VRDeck | PassengerId_gggg | PassengerId_nn | Cabin_deck | Cabin_num | Cabin_side | Name_first | Name_last | HomePlanet | CryoSleep | Destination | Age | VIP | RoomService | FoodCourt | ShoppingMall | Spa | VRDeck | Transported |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 1 | 1 | B | 0.0 | P | Maham | Ofracculy | Europa | False | TRAPPIST-1e | 39.0 | False | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | False |
1 | 736.0 | 2 | 1 | F | 0.0 | S | Juanna | Vines | Earth | False | TRAPPIST-1e | 24.0 | False | 109.0 | 9.0 | 25.0 | 549.0 | 44.0 | True |
2 | 10383.0 | 3 | 1 | A | 0.0 | S | Altark | Susent | Europa | False | TRAPPIST-1e | 58.0 | True | 43.0 | 3576.0 | 0.0 | 6715.0 | 49.0 | False |
3 | 5176.0 | 3 | 2 | A | 0.0 | S | Solam | Susent | Europa | False | TRAPPIST-1e | 33.0 | False | 0.0 | 1283.0 | 371.0 | 3329.0 | 193.0 | False |
4 | 1091.0 | 4 | 1 | F | 1.0 | S | Willy | Santantines | Earth | False | TRAPPIST-1e | 16.0 | False | 303.0 | 70.0 | 151.0 | 565.0 | 2.0 | True |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
8688 | 8536.0 | 9276 | 1 | A | 98.0 | P | Gravior | Noxnuther | Europa | False | 55 Cancri e | 41.0 | True | 0.0 | 6819.0 | 0.0 | 1643.0 | 74.0 | False |
8689 | 0.0 | 9278 | 1 | G | 1499.0 | S | Kurta | Mondalley | Earth | True | PSO J318.5-22 | 18.0 | False | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | False |
8690 | 1873.0 | 9279 | 1 | G | 1500.0 | S | Fayey | Connon | Earth | False | TRAPPIST-1e | 26.0 | False | 0.0 | 0.0 | 1872.0 | 1.0 | 0.0 | True |
8691 | 4637.0 | 9280 | 1 | E | 608.0 | S | Celeon | Hontichre | Europa | False | 55 Cancri e | 32.0 | False | 0.0 | 1049.0 | 0.0 | 353.0 | 3235.0 | False |
8692 | 4826.0 | 9280 | 2 | E | 608.0 | S | Propsh | Hontichre | Europa | False | TRAPPIST-1e | 44.0 | False | 126.0 | 4688.0 | 0.0 | 0.0 | 12.0 | True |
8693 rows × 19 columns
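The feature layers above only expand and add columns, so the transformed frame should keep all 8,693 training rows; a quick check (re-running fit_transform just for the assertion):

# Sanity check: the Proc feature layers should not drop any rows.
assert proc.fit_transform(df_train).shape[0] == df_train.shape[0] == 8693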
Modeling and Evaluation
def get_pipeline(max_n_cat=0,
                 cat_dict=None,
                 scale_var_cat=False,
                 scale_dict={'class': StandardScaler},
                 cat_num_dict={'class':NumericalEncoder, 'categories':None},
                 cat_dummy_dict={'class':OneHotEncoder, 'handle_unknown':'ignore'},
                 imputer_dict={'class':SimpleImputer, 'strategy':'median'}):

    layer_spec_preprocess = (gen_feature_layer,
                             {'feature_specs':{
                                 'PassengerId':(ColExpanderTransform, {'names':['gggg', 'nn'], 'func':split_col, 'func_kw_args':{"splitter":"_"}}),
                                 'Cabin':(ColExpanderTransform, {'names':['deck', 'num', 'side'], 'func':split_col, 'func_kw_args':{"splitter":"/"}}),
                                 'Name':(ColExpanderTransform, {'names':['first', 'last'], 'func':split_col, 'func_kw_args':{"splitter":" "}}),
                             }})

    layer_spec_default = (get_default_feature_def,
                          {'skip_flds':None,
                           'ignored_flds':None,
                           'max_n_cat':max_n_cat,
                           'na_exclude_cols':[],
                           'scale_var_num':True,
                           'scale_var_cat':scale_var_cat,
                           'scale_dict':scale_dict,
                           'cat_num_dict':cat_num_dict,
                           'cat_dummy_dict':cat_dummy_dict,
                           'imputer_dict':imputer_dict,
                           'include_time_cols':True,
                           'keep_dt_cols':False,
                           'cat_dict':cat_dict
                           })

    layer_spec_calc = (gen_feature_layer,
                       {'feature_specs':{
                           str(cols_sc):(calc_service_cost, {'cols':cols_sc}, {"alias":'ServiceCost'})
                       }})

    layer_specs = [layer_spec_preprocess, layer_spec_default, layer_spec_calc]
    # layer_specs = [layer_spec_preprocess]
    proc = Proc(layer_specs=layer_specs)  # proc.fit_transform(X)
    model = RandomForestClassifier()
    pipeline = make_pipeline(proc, model)
    return pipeline
X = df_train.drop('Transported', axis=1)
y = df_train[['Transported']]
display(X.head(), y.head())
|  | PassengerId | HomePlanet | CryoSleep | Cabin | Destination | Age | VIP | RoomService | FoodCourt | ShoppingMall | Spa | VRDeck | Name |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0001_01 | Europa | False | B/0/P | TRAPPIST-1e | 39.0 | False | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Maham Ofracculy |
1 | 0002_01 | Earth | False | F/0/S | TRAPPIST-1e | 24.0 | False | 109.0 | 9.0 | 25.0 | 549.0 | 44.0 | Juanna Vines |
2 | 0003_01 | Europa | False | A/0/S | TRAPPIST-1e | 58.0 | True | 43.0 | 3576.0 | 0.0 | 6715.0 | 49.0 | Altark Susent |
3 | 0003_02 | Europa | False | A/0/S | TRAPPIST-1e | 33.0 | False | 0.0 | 1283.0 | 371.0 | 3329.0 | 193.0 | Solam Susent |
4 | 0004_01 | Earth | False | F/1/S | TRAPPIST-1e | 16.0 | False | 303.0 | 70.0 | 151.0 | 565.0 | 2.0 | Willy Santantines |
|  | Transported |
|---|---|
0 | False |
1 | True |
2 | False |
3 | False |
4 | True |
pipeline = get_pipeline(max_n_cat=20, scale_var_cat=False)
scores = cross_val_score(pipeline, X, y, scoring='accuracy'); scores
array([0.73260495, 0.73950546, 0.80448534, 0.82220944, 0.76524741])
sklearn.__version__
'0.24.2'
y_pred = cross_val_predict(pipeline, X, y)
cm = confusion_matrix(y, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay>
print(f"Expected Scores {scores.mean() - 3*scores.std():.2%} to {scores.mean() + 3*scores.std():.2%} with mean as {scores.mean():.2%}")
Expected Scores 66.69% to 87.87% with mean as 77.28%
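For classifiers, cross_val_score defaults to 5-fold stratified splits without shuffling, so the fold-to-fold spread above may partly reflect the ordering of the training file. If reproducible shuffled folds are preferred, a splitter can be passed explicitly; a minimal sketch using the KFold import from above (the random_state value is arbitrary):

# Sketch: shuffled, reproducible folds instead of the default unshuffled splitter.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
shuffled_scores = cross_val_score(get_pipeline(max_n_cat=20, scale_var_cat=False), X, y.squeeze(), cv=cv, scoring='accuracy')
shuffled_scores.mean()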
Predictions
Retrain Pipeline on complete training data
pipeline = get_pipeline(max_n_cat=20, scale_var_cat=False)
pipeline.fit(X, y)
y_pred = pipeline.predict(X)
cm = confusion_matrix(y, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
display(disp.plot(), accuracy_score(y, y_pred))
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay>
1.0
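The 1.0 above is accuracy on the same rows the forest was just fit on, so it mostly reflects memorization; the cross-validated ~77% remains the realistic estimate. For a single held-out check instead of full CV, train_test_split (imported above but otherwise unused) would work; a minimal sketch:

# Sketch: one stratified hold-out split as a cheaper alternative to cross-validation.
X_tr, X_val, y_tr, y_val = train_test_split(X, y.squeeze(), test_size=0.2, random_state=42, stratify=y.squeeze())
holdout_pipeline = get_pipeline(max_n_cat=20, scale_var_cat=False)
holdout_pipeline.fit(X_tr, y_tr)
accuracy_score(y_val, holdout_pipeline.predict(X_val))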
Calculations for Test Set and Submission
predictions = pd.DataFrame(pipeline.predict(df_test), columns=[y.squeeze().name]); predictions
|  | Transported |
|---|---|
0 | False |
1 | False |
2 | True |
3 | True |
4 | False |
... | ... |
4272 | True |
4273 | False |
4274 | True |
4275 | True |
4276 | False |
4277 rows × 1 columns
submission = pd.concat([df_test['PassengerId'], predictions], axis=1); submission
|  | PassengerId | Transported |
|---|---|---|
0 | 0013_01 | False |
1 | 0018_01 | False |
2 | 0019_01 | True |
3 | 0021_01 | True |
4 | 0023_01 | False |
... | ... | ... |
4272 | 9266_02 | True |
4273 | 9269_01 | False |
4274 | 9271_01 | True |
4275 | 9273_01 | True |
4276 | 9277_01 | False |
4277 rows × 2 columns
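Before saving, it is worth confirming that the frame matches the format of the provided sample_submission.csv (same columns, same row count); a quick check:

# Optional sanity check against the competition's sample submission file.
sample = pd.read_csv(path/"sample_submission.csv")
assert list(submission.columns) == list(sample.columns)
assert len(submission) == len(sample)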
if is_kaggle: submission.to_csv('submission.csv', index=False)
else: push2kaggle('index.ipynb')
Kernel version 13 successfully pushed. Please check progress at https://www.kaggle.com/code/rahuketu86/spaceship-titanic