Spaceship Titanic

Shows the usage of the aiking library on a Kaggle dataset

Import Public Packages

import fastcore
import pandas as pd
import pathlib
from fastcore.all import *
from fastcore.imports import *
import os
import sys
import sklearn
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold, train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, plot_confusion_matrix, confusion_matrix
from IPython.display import display
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
Import Private Packages
is_kaggle = 'kaggle_secrets' in sys.modules
if is_kaggle:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    os.environ['KAGGLE_USERNAME'] = user_secrets.get_secret("kaggle_username")
    if not os.environ['KAGGLE_USERNAME']: raise Exception("Please insert your Kaggle username and key into Kaggle secrets")
    os.environ['KAGGLE_KEY'] = user_secrets.get_secret("kaggle_key")
    github_pat = user_secrets.get_secret("GITHUB_PAT")
    !pip install -Uqq git+https://{github_pat}@github.com/Rahuketu86/aiking
else:
    from aiking.data.external import *
    path = untar_data("kaggle_competitions::spaceship-titanic"); path.ls()

from aiking.ml.structured import *
from aiking.integrations.kaggle import push2kaggle
Read the Dataset
data_dir = pathlib.Path(os.getenv('DATA_DIR', "/kaggle/input"))
path = data_dir/"spaceship-titanic"
path.ls()
(#3) [Path('/kaggle/input/spaceship-titanic/sample_submission.csv'),Path('/kaggle/input/spaceship-titanic/test.csv'),Path('/kaggle/input/spaceship-titanic/train.csv')]
# !rm -rf {path}
= pd.read_csv(path/"train.csv"); df_train.head().T df_train
|  | 0 | 1 | 2 | 3 | 4 |
|---|---|---|---|---|---|
PassengerId | 0001_01 | 0002_01 | 0003_01 | 0003_02 | 0004_01 |
HomePlanet | Europa | Earth | Europa | Europa | Earth |
CryoSleep | False | False | False | False | False |
Cabin | B/0/P | F/0/S | A/0/S | A/0/S | F/1/S |
Destination | TRAPPIST-1e | TRAPPIST-1e | TRAPPIST-1e | TRAPPIST-1e | TRAPPIST-1e |
Age | 39.0 | 24.0 | 58.0 | 33.0 | 16.0 |
VIP | False | False | True | False | False |
RoomService | 0.0 | 109.0 | 43.0 | 0.0 | 303.0 |
FoodCourt | 0.0 | 9.0 | 3576.0 | 1283.0 | 70.0 |
ShoppingMall | 0.0 | 25.0 | 0.0 | 371.0 | 151.0 |
Spa | 0.0 | 549.0 | 6715.0 | 3329.0 | 565.0 |
VRDeck | 0.0 | 44.0 | 49.0 | 193.0 | 2.0 |
Name | Maham Ofracculy | Juanna Vines | Altark Susent | Solam Susent | Willy Santantines |
Transported | False | True | False | False | True |
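The raw training columns contain some missing values; the pipeline built below imputes them with SimpleImputer. A quick per-column count:

# Missing values per raw training column; these are handled later by the pipeline's imputer.
df_train.isna().sum()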
= pd.read_csv(path/"test.csv"); df_test.head().T df_test
|  | 0 | 1 | 2 | 3 | 4 |
|---|---|---|---|---|---|
PassengerId | 0013_01 | 0018_01 | 0019_01 | 0021_01 | 0023_01 |
HomePlanet | Earth | Earth | Europa | Europa | Earth |
CryoSleep | True | False | True | False | False |
Cabin | G/3/S | F/4/S | C/0/S | C/1/S | F/5/S |
Destination | TRAPPIST-1e | TRAPPIST-1e | 55 Cancri e | TRAPPIST-1e | TRAPPIST-1e |
Age | 27.0 | 19.0 | 31.0 | 38.0 | 20.0 |
VIP | False | False | False | False | False |
RoomService | 0.0 | 0.0 | 0.0 | 0.0 | 10.0 |
FoodCourt | 0.0 | 9.0 | 0.0 | 6652.0 | 0.0 |
ShoppingMall | 0.0 | 0.0 | 0.0 | 0.0 | 635.0 |
Spa | 0.0 | 2823.0 | 0.0 | 181.0 | 0.0 |
VRDeck | 0.0 | 0.0 | 0.0 | 585.0 | 0.0 |
Name | Nelly Carsoning | Lerome Peckers | Sabih Unhearfus | Meratz Caltilter | Brence Harperez |
Feature Engineering
def split_col(X, splitter=" "): return X.squeeze().str.split(splitter, expand=True).apply(pd.to_numeric, errors='ignore', downcast='integer')
split_col(df_train[['PassengerId']], splitter='_')
|  | 0 | 1 |
|---|---|---|
0 | 1 | 1 |
1 | 2 | 1 |
2 | 3 | 1 |
3 | 3 | 2 |
4 | 4 | 1 |
... | ... | ... |
8688 | 9276 | 1 |
8689 | 9278 | 1 |
8690 | 9279 | 1 |
8691 | 9280 | 1 |
8692 | 9280 | 2 |
8693 rows × 2 columns
passenger_transformer = ColExpanderTransform(names=['Passenger_gggg', 'Passenger_nn'], func=split_col, func_kw_args={"splitter":"_"})
display(passenger_transformer.fit_transform(df_train[['PassengerId']]), passenger_transformer.get_feature_names())
|  | 0 | 1 |
|---|---|---|
0 | 1 | 1 |
1 | 2 | 1 |
2 | 3 | 1 |
3 | 3 | 2 |
4 | 4 | 1 |
... | ... | ... |
8688 | 9276 | 1 |
8689 | 9278 | 1 |
8690 | 9279 | 1 |
8691 | 9280 | 1 |
8692 | 9280 | 2 |
8693 rows × 2 columns
['Passenger_gggg', 'Passenger_nn']
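ColExpanderTransform here simply applies the split_col helper defined above, which is essentially str.split(expand=True) plus numeric coercion. For comparison, a plain-pandas sketch of the same expansion (illustration only; the transformer is what plugs into the pipeline below):

# Sketch: the equivalent expansion with plain pandas; split_col additionally coerces the parts to numeric.
expanded = df_train['PassengerId'].str.split('_', expand=True)
expanded.columns = ['Passenger_gggg', 'Passenger_nn']
expanded.head()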
def calc_service_cost(X, cols=[]):
    return X[cols].sum(axis=1).to_frame()

cols_sc = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
calc_service_cost(df_train, cols_sc).head()
|  | 0 |
|---|---|
0 | 0.0 |
1 | 736.0 |
2 | 10383.0 |
3 | 5176.0 |
4 | 1091.0 |
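Each value is just the row-wise sum of the five spend columns; for example, row 2 above is 43 + 3576 + 0 + 6715 + 49 = 10383. A quick spot-check:

# Spot-check: the computed total for passenger 2 equals the raw spend columns summed directly.
assert calc_service_cost(df_train, cols_sc).iloc[2, 0] == df_train.loc[2, cols_sc].sum() == 10383.0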
layer_spec_preprocess = (gen_feature_layer,
                         {'feature_specs':{
                             'PassengerId':(ColExpanderTransform, {'names':['gggg', 'nn'], 'func':split_col, 'func_kw_args':{"splitter":"_"}}),
                             'Cabin':(ColExpanderTransform, {'names':['deck', 'num', 'side'], 'func':split_col, 'func_kw_args':{"splitter":"/"}}),
                             'Name':(ColExpanderTransform, {'names':['first', 'last'], 'func':split_col, 'func_kw_args':{"splitter":" "}}),
                         }})

layer_spec_calc = (gen_feature_layer,
                   {'feature_specs':{
                       str(cols_sc):(calc_service_cost, {'cols':cols_sc}, {"alias":'ServiceCost'})
                   }})

layer_specs = [layer_spec_preprocess, layer_spec_calc]
proc = Proc(layer_specs=layer_specs)
proc.fit_transform(df_train)
|  | RoomService_FoodCourt_ShoppingMall_Spa_VRDeck | PassengerId_gggg | PassengerId_nn | Cabin_deck | Cabin_num | Cabin_side | Name_first | Name_last | HomePlanet | CryoSleep | Destination | Age | VIP | RoomService | FoodCourt | ShoppingMall | Spa | VRDeck | Transported |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 1 | 1 | B | 0.0 | P | Maham | Ofracculy | Europa | False | TRAPPIST-1e | 39.0 | False | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | False |
1 | 736.0 | 2 | 1 | F | 0.0 | S | Juanna | Vines | Earth | False | TRAPPIST-1e | 24.0 | False | 109.0 | 9.0 | 25.0 | 549.0 | 44.0 | True |
2 | 10383.0 | 3 | 1 | A | 0.0 | S | Altark | Susent | Europa | False | TRAPPIST-1e | 58.0 | True | 43.0 | 3576.0 | 0.0 | 6715.0 | 49.0 | False |
3 | 5176.0 | 3 | 2 | A | 0.0 | S | Solam | Susent | Europa | False | TRAPPIST-1e | 33.0 | False | 0.0 | 1283.0 | 371.0 | 3329.0 | 193.0 | False |
4 | 1091.0 | 4 | 1 | F | 1.0 | S | Willy | Santantines | Earth | False | TRAPPIST-1e | 16.0 | False | 303.0 | 70.0 | 151.0 | 565.0 | 2.0 | True |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
8688 | 8536.0 | 9276 | 1 | A | 98.0 | P | Gravior | Noxnuther | Europa | False | 55 Cancri e | 41.0 | True | 0.0 | 6819.0 | 0.0 | 1643.0 | 74.0 | False |
8689 | 0.0 | 9278 | 1 | G | 1499.0 | S | Kurta | Mondalley | Earth | True | PSO J318.5-22 | 18.0 | False | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | False |
8690 | 1873.0 | 9279 | 1 | G | 1500.0 | S | Fayey | Connon | Earth | False | TRAPPIST-1e | 26.0 | False | 0.0 | 0.0 | 1872.0 | 1.0 | 0.0 | True |
8691 | 4637.0 | 9280 | 1 | E | 608.0 | S | Celeon | Hontichre | Europa | False | 55 Cancri e | 32.0 | False | 0.0 | 1049.0 | 0.0 | 353.0 | 3235.0 | False |
8692 | 4826.0 | 9280 | 2 | E | 608.0 | S | Propsh | Hontichre | Europa | False | TRAPPIST-1e | 44.0 | False | 126.0 | 4688.0 | 0.0 | 0.0 | 12.0 | True |
8693 rows × 19 columns
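The feature layers above only expand and add columns, so the transformed frame should keep all 8,693 training rows; a quick check (re-running fit_transform just for the assertion):

# Sanity check: the Proc feature layers should not drop any rows.
assert proc.fit_transform(df_train).shape[0] == df_train.shape[0] == 8693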
Modeling and Evaluation
def get_pipeline(max_n_cat=0,
                 cat_dict=None,
                 scale_var_cat=False,
                 scale_dict={'class': StandardScaler},
                 cat_num_dict={'class':NumericalEncoder, 'categories':None},
                 cat_dummy_dict={'class':OneHotEncoder, 'handle_unknown':'ignore'},
                 imputer_dict={'class':SimpleImputer, 'strategy':'median'}):

    layer_spec_preprocess = (gen_feature_layer,
                             {'feature_specs':{
                                 'PassengerId':(ColExpanderTransform, {'names':['gggg', 'nn'], 'func':split_col, 'func_kw_args':{"splitter":"_"}}),
                                 'Cabin':(ColExpanderTransform, {'names':['deck', 'num', 'side'], 'func':split_col, 'func_kw_args':{"splitter":"/"}}),
                                 'Name':(ColExpanderTransform, {'names':['first', 'last'], 'func':split_col, 'func_kw_args':{"splitter":" "}}),
                             }})

    layer_spec_default = (get_default_feature_def,
                          {'skip_flds':None,
                           'ignored_flds':None,
                           'max_n_cat':max_n_cat,
                           'na_exclude_cols':[],
                           'scale_var_num':True,
                           'scale_var_cat':scale_var_cat,
                           'scale_dict':scale_dict,
                           'cat_num_dict':cat_num_dict,
                           'cat_dummy_dict':cat_dummy_dict,
                           'imputer_dict':imputer_dict,
                           'include_time_cols':True,
                           'keep_dt_cols':False,
                           'cat_dict':cat_dict
                           })

    layer_spec_calc = (gen_feature_layer,
                       {'feature_specs':{
                           str(cols_sc):(calc_service_cost, {'cols':cols_sc}, {"alias":'ServiceCost'})
                       }})

    layer_specs = [layer_spec_preprocess, layer_spec_default, layer_spec_calc]
    # layer_specs = [layer_spec_preprocess]
    proc = Proc(layer_specs=layer_specs)  # proc.fit_transform(X)
    model = RandomForestClassifier()
    pipeline = make_pipeline(proc, model)
    return pipeline
X = df_train.drop('Transported', axis=1)
y = df_train[['Transported']]
display(X.head(), y.head())
|  | PassengerId | HomePlanet | CryoSleep | Cabin | Destination | Age | VIP | RoomService | FoodCourt | ShoppingMall | Spa | VRDeck | Name |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0001_01 | Europa | False | B/0/P | TRAPPIST-1e | 39.0 | False | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Maham Ofracculy |
1 | 0002_01 | Earth | False | F/0/S | TRAPPIST-1e | 24.0 | False | 109.0 | 9.0 | 25.0 | 549.0 | 44.0 | Juanna Vines |
2 | 0003_01 | Europa | False | A/0/S | TRAPPIST-1e | 58.0 | True | 43.0 | 3576.0 | 0.0 | 6715.0 | 49.0 | Altark Susent |
3 | 0003_02 | Europa | False | A/0/S | TRAPPIST-1e | 33.0 | False | 0.0 | 1283.0 | 371.0 | 3329.0 | 193.0 | Solam Susent |
4 | 0004_01 | Earth | False | F/1/S | TRAPPIST-1e | 16.0 | False | 303.0 | 70.0 | 151.0 | 565.0 | 2.0 | Willy Santantines |
|  | Transported |
|---|---|
0 | False |
1 | True |
2 | False |
3 | False |
4 | True |
pipeline = get_pipeline(max_n_cat=20, scale_var_cat=False)
scores = cross_val_score(pipeline, X, y, scoring='accuracy'); scores
array([0.73260495, 0.73950546, 0.80448534, 0.82220944, 0.76524741])
sklearn.__version__
'0.24.2'
y_pred = cross_val_predict(pipeline, X, y)
cm = confusion_matrix(y, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay>
print(f"Expected Scores {scores.mean() - 3*scores.std():.2%} to {scores.mean() + 3*scores.std():.2%} with mean as {scores.mean():.2%}")
Expected Scores 66.69% to 87.87% with mean as 77.28%
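For classifiers, cross_val_score defaults to 5-fold stratified splits without shuffling, so the fold-to-fold spread above may partly reflect the ordering of the training file. If reproducible shuffled folds are preferred, a splitter can be passed explicitly; a minimal sketch using the KFold import from above (the random_state value is arbitrary):

# Sketch: shuffled, reproducible folds instead of the default unshuffled splitter.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
shuffled_scores = cross_val_score(get_pipeline(max_n_cat=20, scale_var_cat=False), X, y.squeeze(), cv=cv, scoring='accuracy')
shuffled_scores.mean()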
Predictions
Retrain Pipeline on complete training data
pipeline = get_pipeline(max_n_cat=20, scale_var_cat=False)
pipeline.fit(X, y)
y_pred = pipeline.predict(X)
cm = confusion_matrix(y, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
display(disp.plot(), accuracy_score(y, y_pred))
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay>
1.0
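The 1.0 above is accuracy on the same rows the forest was just fit on, so it mostly reflects memorization; the cross-validated ~77% remains the realistic estimate. For a single held-out check instead of full CV, train_test_split (imported above but otherwise unused) would work; a minimal sketch:

# Sketch: one stratified hold-out split as a cheaper alternative to cross-validation.
X_tr, X_val, y_tr, y_val = train_test_split(X, y.squeeze(), test_size=0.2, random_state=42, stratify=y.squeeze())
holdout_pipeline = get_pipeline(max_n_cat=20, scale_var_cat=False)
holdout_pipeline.fit(X_tr, y_tr)
accuracy_score(y_val, holdout_pipeline.predict(X_val))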
Calculations for Test Set and Submission
predictions = pd.DataFrame(pipeline.predict(df_test), columns=[y.squeeze().name]); predictions
|  | Transported |
|---|---|
0 | False |
1 | False |
2 | True |
3 | True |
4 | False |
... | ... |
4272 | True |
4273 | False |
4274 | True |
4275 | True |
4276 | False |
4277 rows × 1 columns
submission = pd.concat([df_test['PassengerId'], predictions], axis=1); submission
|  | PassengerId | Transported |
|---|---|---|
0 | 0013_01 | False |
1 | 0018_01 | False |
2 | 0019_01 | True |
3 | 0021_01 | True |
4 | 0023_01 | False |
... | ... | ... |
4272 | 9266_02 | True |
4273 | 9269_01 | False |
4274 | 9271_01 | True |
4275 | 9273_01 | True |
4276 | 9277_01 | False |
4277 rows × 2 columns
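Before saving, it is worth confirming that the frame matches the format of the provided sample_submission.csv (same columns, same row count); a quick check:

# Optional sanity check against the competition's sample submission file.
sample = pd.read_csv(path/"sample_submission.csv")
assert list(submission.columns) == list(sample.columns)
assert len(submission) == len(sample)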
if is_kaggle: submission.to_csv('submission.csv', index=False)
else: push2kaggle('index.ipynb')
Kernel version 13 successfully pushed. Please check progress at https://www.kaggle.com/code/rahuketu86/spaceship-titanic