Training a neural network on the Titanic dataset.
The dataset can be downloaded from https://www.kaggle.com/c/titanic. Since nnlib
does not support feature engineering, the dataset needs to be prepared using the following Python script:
4 from sklearn.preprocessing import MinMaxScaler
8 scaler = MinMaxScaler()
9 transformed = scaler.fit_transform(X[:, column].reshape(-1, 1))
10 X[:, column] = transformed[:, 0]
15 print('Filepath to the train file needs to be specified')
19 print(f'Processing {filepath}')
21 dataset = pd.read_csv(filepath)
22 dataset['Age'].fillna(dataset['Age'].mean(), inplace=True)
23 dataset = pd.get_dummies(dataset, prefix=['Sex'], columns=['Sex'], drop_first=True)
24 dataset = dataset.drop(['PassengerId', 'Pclass', 'Name', 'Ticket', 'Cabin', 'Embarked'], axis=1)
27 X = dataset.iloc[:, 1:].to_numpy()
28 y = dataset.iloc[:, 0].to_numpy()
40 dfX.to_csv('./out/dataset.csv', header=None, index=False)
43 if __name__ == '__main__':
The Python script expects one parameter, which is the full path to the Titanic dataset. The dataset prepared for nnlib
will then be generated in the ./out directory.
The main.cpp file expects one argument, which is the absolute path to the file prepared by the Python script.
#include <iostream>
#include <nnlib/network.h>
#include <nnlib/read.h>
#include <nnlib/verify.cuh>
#include <nnlib/onehot_encode.h>
#include <chrono>
// Entry point of the Titanic example.
//
// Reads the CSV file whose path is given as the first command-line argument,
// splits it into a feature tensor X and a target tensor y, and then configures
// a network layer.
//
// Returns 1 (after printing a message) when no path argument was supplied,
// 0 otherwise.
//
// NOTE(review): this listing appears to be an excerpt - `network` is used below
// but is not declared anywhere in the visible code, so statements seem to have
// been elided. Confirm against the full main.cpp before building.
int main(int argc, char** argv) {
// argv[1] must hold the dataset path; bail out early when it is missing.
if (argc < 2) {
std::cout << "Dataset file path was not specified." << std::endl;
return 1;
}
// Load the CSV using "," as the delimiter; the third argument (4) is the
// numThreads parameter of readCSV.
sTensor dataset =
readCSV(argv[1],
",", 4);
// X receives every column except the last one; y receives a single column.
sTensor X = std::make_shared<Tensor>(dataset->shape[0], dataset->shape[1] - 1);
sTensor y = std::make_shared<Tensor>(dataset->shape[0], 1);
// The data buffers are indexed as flat row-major arrays (row * columns + column).
for (int i = 0; i < dataset->shape[0]; i++) {
// The last value of each row is copied into y.
// NOTE(review): this assumes the target is the last column of the prepared
// CSV - confirm against the column order written by the Python script.
y->data[i] = dataset->data[i * dataset->shape[1] + dataset->shape[1] - 1];
// The remaining values of the row are copied into X.
for (int j = 0; j < dataset->shape[1] - 1; j++) {
X->data[i * X->shape[1] + j] = dataset->data[i * dataset->shape[1] + j];
}
}
// NOTE(review): y is an sTensor created via std::make_shared; whether this
// prints the tensor contents or a pointer value depends on an operator<<
// overload that is not visible here - confirm.
std::cout << y << std::endl;
// Add a layer with y->shape[1] neurons (1, given how y is constructed above)
// and the "sigmoid" activation.
network.
add(y->shape[1],
"sigmoid");
return 0;
}
The implementation of binary accuracy.
Definition: metric.h:97
Class representing the Binary Cross Entropy.
Definition: loss.h:60
Class representing the Mean Squared Error.
Definition: loss.h:48
Represents a neural network.
Definition: network.h:23
void train(sTensor &X, sTensor &y, int epochs, size_t batchSize, float learningRate, Loss *loss, std::vector< Metric * > &metrics)
Train the network.
Definition: network.cpp:198
void add(size_t numNeurons, const std::string &activation="linear")
Add a new layer to the network.
Definition: network.cpp:178
sTensor readCSV(const std::string &filepath, const std::string &delim, int numThreads)
Read a csv file from a path.
Definition: read.cpp:107
void showCudaInfo()
Show information about CUDA and the available GPU(s).
Definition: verify.cu:17
The project can be built with the following CMake script. This script requires CMAKE_PREFIX_PATH
to be set to the install directory of nnlib.
# Build script for the Titanic example executable.
# CMAKE_PREFIX_PATH must be set to the install directory of nnlib so that
# find_package() can locate its package configuration.
cmake_minimum_required(VERSION 3.20)
include(CheckLanguage)

project(titanic_nnlib LANGUAGES CXX)
set(CMAKE_CXX_STANDARD 14)

find_package(nnlib CONFIG REQUIRED)

# CUDA is optional: the CUDA-specific settings are applied only when
# check_language() found a CUDA compiler.
check_language(CUDA)
if (CMAKE_CUDA_COMPILER)
    enable_language(CUDA)
    set(CMAKE_CUDA_STANDARD 14)
    include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})

    # Add flag to specify the architecture of the GPU (compute capability)
    if (NOT DEFINED CUDA_ARCHITECTURES)
        set_target_properties(nnlib PROPERTIES CUDA_ARCHITECTURES "50")
    endif()

    set_target_properties(nnlib PROPERTIES
        CUDA_SEPARABLE_COMPILATION ON
        CUDA_RESOLVE_DEVICE_SYMBOLS ON)
endif()

add_executable(${PROJECT_NAME} main.cpp)
target_link_libraries(${PROJECT_NAME} PRIVATE nnlib)
Definition: functions.h:68