In [96]:
#titanic predictions using SVM classifier

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC, SVC
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV,  RandomizedSearchCV
pd.options.display.max_rows=999
pd.options.display.max_columns=999

#read supplied files and concat
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
train_size = train.shape[0]
df = pd.concat([train, test], axis=0)
df['Name'] = df['Name'].str.replace('\"','').str.strip()

#read other file to add survived flag to the test dataframe
dftotal = pd.read_excel('data/titanic3.xls', 'titanic3', index_col=None, na_values=['NA'])
dftotal['Name'] = dftotal['Name'].str.replace('\"','').str.strip()
dftotal['Ticket'] = dftotal['Ticket'].astype(str).str.strip()
dfref = dftotal[['Name','Ticket','Survived']].copy()
df = pd.merge(df, dfref , on=['Name','Ticket'], how='left')
df = df.drop('Survived_x', axis=1).rename(columns={'Survived_y': 'Survived'})

#start creating fields
titlemap = {'Don': 1, 'Dona': 1, 'Mme': 5, 'Mlle': 1, 'Jonkheer': 1, 'Capt' :1, 'Col': 1, 'Major': 1, 'Countess': 1,  
            'Mr': 2, 'Dr': 3, 'Ms': 4, 'Mrs': 5, 'Miss': 6,  'Rev': 1, 'Master': 8, 'Sir': 1, 'Lady': 1}
df['Title'] = df['Name'].str.extract('([A-Za-z]+)\.', expand=False)
df['TitleCat'] = df['Title'].map(titlemap)
for atitle in ['Miss','Mr', 'Mrs', 'Master', 'Dr', 'Ms']:    
    df.loc[ (df['Age'].isnull()) & (df['Title'] == atitle), 'Age'] = df[ (df['Title'] == atitle) ]['Age'].median()

df['CabinCat'] = pd.Categorical.from_array(df.Cabin.fillna(0)).codes
df['EmbarkedCat'] = pd.Categorical.from_array(df.Embarked.fillna('C')).codes
df['Female'] = (df['Sex'] == 'female')
df.loc[ df.Fare.isnull(), 'Fare' ] = df[ df.Pclass==3 ].Fare.median()

df['FamilySize'] = df['SibSp'] + df['Parch']
df['NameLength'] = df.Name.fillna('').str.len() 

# did a relative survive (from in training set only - using test set data would be cheating...)
df['Surname'] = df['Name'].str.extract('([A-Za-z]+)\,', expand=False)
train['Surname'] = train['Name'].str.extract('([A-Za-z]+)\,', expand=False)
alive = train[ (train.Survived == 1) ]['Surname'].dropna().unique()
df['AliveRelative'] = (df['Surname'].isin(alive)) & (df.Age < 20)

# create train and test data
drop_columns = ['Ticket', 'Cabin', 'PassengerId', 'Name', 'Embarked', 'Sex', 'Title','Surname']

X_trainx = df.drop(drop_columns + ['Survived'], axis=1).iloc[:train_size]
X_train = StandardScaler().fit_transform(X_trainx)
y_train = df['Survived'].iloc[:train_size]
X_testx  = df.drop(drop_columns + ['Survived'], axis=1).iloc[train_size:]
X_test = StandardScaler().fit_transform(X_testx)
y_test  = df['Survived'].iloc[train_size:]

#create and run model
survived = df[ df['Survived'] == 1]['Survived'].count()  /  df['Survived'].count()
param_dist = {"C": np.linspace(1000, 15000, 100),
              "class_weight": [{0: 1-survived, 1: survived}, {0: 0.542, 1: 0.458}],
              'gamma': np.linspace(0.0021, 0.0025, 50),
              }
SVC_model = SVC()
#model = RandomizedSearchCV(SVC_model, param_distributions=param_dist, n_iter=1000, n_jobs=-1)
model = SVC(C=4360, gamma=0.0023, class_weight={0: 0.542, 1: 0.458} ) #0.818181818
#model = SVC(C=4350, gamma=0.0023, class_weight={0: 1-survived, 1: survived} ) #0.80622

%time model.fit(X_train, y_train)
#print (model.best_params_)
preds = model.predict(X_test).astype(int)
print('SVC', model.score (X_test, y_test))

#generate predictions
predictions = pd.DataFrame()
predictions['PassengerId'] = test['PassengerId']
predictions['Survived'] = preds
predictions.set_index('PassengerId', inplace=True, drop=True)
predictions.to_csv('titanic_predictions.csv')
Wall time: 80.1 ms
SVC 0.818181818182
In [62]:
# Titanic predicitons using Random forest classifier

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV,  RandomizedSearchCV
pd.options.display.max_rows=999
pd.options.display.max_columns=999

#read supplied files and concat
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
train_size = train.shape[0]
df = pd.concat([train, test], axis=0)
df['Name'] = df['Name'].str.replace('\"','').str.strip()

#read other file to add survived flag to the test dataframe
dftotal = pd.read_excel('data/titanic3.xls', 'titanic3', index_col=None, na_values=['NA'])
dftotal['Name'] = dftotal['Name'].str.replace('\"','').str.strip()
dfref = dftotal[['Name','Ticket','Survived']].copy()
dfref['Ticket'] = dfref['Ticket'].astype(str).str.strip()
df = pd.merge(df, dfref , on=['Name','Ticket'], how='left')
df = df.drop('Survived_x', axis=1).rename(columns={'Survived_y': 'Survived'})

#start creating fields
titlemap = {'Don': 1, 'Dona': 1, 'Mme': 5, 'Mlle': 1, 'Jonkheer': 1, 'Capt' :1, 'Col': 1, 'Major': 1, 'Countess': 1,  
            'Mr': 2, 'Dr': 3, 'Ms': 4, 'Mrs': 5, 'Miss': 6,  'Rev': 1, 'Master': 8, 'Sir': 1, 'Lady': 1}
df['Title'] = df['Name'].str.extract('([A-Za-z]+)\.', expand=False)
df['TitleCat'] = df['Title'].map(titlemap)
for atitle in ['Miss','Mr', 'Mrs', 'Master', 'Dr', 'Ms']:    
    df.loc[ (df['Age'].isnull()) & (df['Title'] == atitle), 'Age'] = df[ (df['Title'] == atitle) ]['Age'].median()

#df = pd.concat([df, pd.get_dummies(df['Cabin'].fillna('0').str.get(0),'Cabin')], axis=1) 
df['CabinCat'] = pd.Categorical.from_array(df.Cabin.fillna(0)).codes
#df['CabinCat2'] = df.Cabin.str.len().fillna(0)

df['EmbarkedCat'] = pd.Categorical.from_array(df.Embarked.fillna('C')).codes

df['Female'] = (df['Sex'] == 'female')

df.loc[ df.Fare.isnull(), 'Fare' ] = df[ df.Pclass==3 ].Fare.median()
df['Farecat'] = pd.cut(df['Fare'], 15, labels=range(15))

df['Single'] = (df['SibSp'] + df['Parch']) == 0
df['BigFamily'] = (df['SibSp'] + df['Parch']) > 3

# did an older relative survive (from in training set only - using test set data would be cheating...)
df['Surname'] = df['Name'].str.extract('([A-Za-z]+)\,', expand=False)
train['Surname'] = train['Name'].str.extract('([A-Za-z]+)\,', expand=False)
alive = train[ (train.Survived == 1) ]['Surname'].dropna().unique()
df['AliveRelative'] = (df['Surname'].isin(alive)) & (df.Age < 20)

# create train and test data
drop_columns = ['Ticket', 'Cabin', 'PassengerId', 'Name', 'Embarked', 'Sex', 'Fare', 'SibSp', 'Parch', 'Title','Surname']

X_train = df.drop(drop_columns + ['Survived'], axis=1).iloc[:train_size]
y_train = df['Survived'].iloc[:train_size]
X_test  = df.drop(drop_columns + ['Survived'], axis=1).iloc[train_size:]
y_test  = df['Survived'].iloc[train_size:]

#create and run model
survived = df[ df['Survived'] == 0]['Survived'].count()  /  df['Survived'].count()
model = RandomForestClassifier(n_estimators=5000, min_samples_leaf=3, class_weight={0: 1-survived, 1: survived}, n_jobs=-1)
model.fit(X_train, y_train)
preds = model.predict(X_test).astype(int)
print('Random Forest', model.score (X_test, y_test))

#generate predictions
predictions = pd.DataFrame()
predictions['PassengerId'] = test['PassengerId']
predictions['Survived'] = preds
predictions.set_index('PassengerId', inplace=True, drop=True)
predictions.to_csv('titanic_predictions.csv')
Random Forest 0.801435406699
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
# Exhaustive sweep of the class-weight split around the best C/gamma found earlier.
# Depends on X_train/X_test/y_train/y_test and SVC from the cells above.
# The C and gamma linspaces currently contain a single value each; widen them to sweep more.
highest_score = 0
for c in np.linspace(4360, 4360, 1):
    print('next c', c)
    for gamma in np.linspace(0.0023, 0.0023, 1):
        for weight in np.linspace(0.5418, 0.545, 200):
            candidate = SVC(C=c, gamma=gamma, class_weight={0: weight, 1: 1 - weight})
            candidate.fit(X_train, y_train)
            score = candidate.score(X_test, y_test)
            # >= so later ties are also reported.
            if score >= highest_score:
                highest_score = score
                # Reuse the cached score instead of re-running the whole scoring pass.
                print('SVC', score, c, gamma, weight)
In [ ]:
# run random forest randomized search
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()

from scipy.stats import randint as sp_randint
param_dist = {"min_samples_leaf": [3],
              "n_estimators": sp_randint(2000, 8000),
              "class_weight": [{0: 1-survived, 1: survived}]}
random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=20, n_jobs=-1)

%time random_search.fit(X_train, y_train)
print (random_search.best_params_)
print('Random Forest', random_search.score (X_test, y_test))
In [ ]:
# Rough evaluation of model options.
# Depends on X_train/X_test/y_train/y_test from the cells above.

from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC

X03 = X_train
Y = y_train
C = 1

# (label, estimator) pairs evaluated with the same fit/predict/score pattern.
# Ridge and Lasso are regressors, so their .score() is R^2, NOT classification
# accuracy - they are listed only as a sanity check (marked invalid).
candidates = [
    ('LR  Accuracy=', LogisticRegression(C=C)),
    ('NB  Accuracy=', GaussianNB()),
    ('KNN Accuracy=', KNeighborsClassifier(3)),
    ('SVCLAccuracy=', LinearSVC(C=C)),
    ('SVC Accuracy=', SVC(C=C)),
    ('Rid Accuracy=', Ridge()),   # invalid as a classifier
    ('Las Accuracy=', Lasso()),   # invalid as a classifier
]

for label, clf in candidates:
    clf.fit(X03, Y)
    result = clf.predict(X_test)
    print(label, clf.score(X_test, y_test))

Comments

comments powered by Disqus