INT104 coursework 2 (student id: 1931391)¶

In [1]:
# Install all related packages which will be Used

import time
import pandas as pd
import seaborn as sns
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from pandas.plotting import scatter_matrix
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate, cross_val_score

Step 1: dimensionality reduction (PCA algorithm)¶

In [9]:
def pca_and_visualize(input_file, output_file, unit_index, label):
    """
    :param input_file: str, input data file path
    :param output_file: str, output file path
    :param unit_index: str, column name of unit index
    :param label: str, column name of label
    """

    # Read the raw data.csv file
    df = pd.read_csv(input_file)

    # Delete the Label=2 value in dataframe
    df = df[df['Label'] != 2]

    # Separate the data frame of Patient index and multiple feature columns
    df_features = df.drop([label], axis=1)

    # Separate the data frame with Patient index and Label
    df_labels = df[[unit_index, label]]

    # Perform PCA dimensionality reduction on the feature columns
    pca = PCA(n_components=10)
    pca_result = pca.fit_transform(df_features.iloc[:, 1:].values)

    # Build a new data frame
    df_pca = pd.DataFrame(pca_result, columns=['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10'])
    df_pca.insert(0, unit_index, df_features[unit_index])

    # Align and merge the second data frame with the new data frame obtained in step three using a common Patient index
    df_new = pd.merge(df_pca, df_labels, on=unit_index)

    # Save PCA result to CSV file
    df_new.to_csv(output_file, index=False)

    # Draw scatter plot (PC1, PC2, PC3)
    fig = plt.figure(figsize=(10, 10))  # Set the size of the Figure
    ax = fig.add_subplot(projection='3d')
    groups = df_new.groupby(label)
    for name, group in groups:
        ax.scatter(group['PC1'], group['PC2'], group['PC3'], label=name)
    ax.legend()
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')
    ax.set_zlabel('PC3')
    plt.title('PCA')
    plt.subplots_adjust(left=0.1)  # Adjust left margin whitespace
    plt.show()
    fig.savefig('PCA10_scatter_plot.png')

    # Draw scatter matrix
    scatter_matrix(df_new.iloc[:, :-1], c=df_new[label], figsize=(20, 20), marker='o')
    plt.show()
    fig.savefig('PCA10_scatter_matrix.png')

    # Draw heatmap
    plt.figure(figsize=(15, 15))
    ax = plt.subplot()
    sns.heatmap(df_new.corr(), annot=True, ax=ax)
    plt.show()
    fig.savefig('PCA10_heatmap.png')

    # Test model performance
    print('Variance contribution rates (10):',
          sum(pca.explained_variance_ratio_))

    # Additionally, calculate variance contribution rates of the first 2, 3, 6 principal components
    pca_2 = PCA(n_components=2)
    pca_3 = PCA(n_components=3)
    pca_6 = PCA(n_components=6)
    pca_2.fit(df_features.iloc[:, 1:].values)
    pca_3.fit(df_features.iloc[:, 1:].values)
    pca_6.fit(df_features.iloc[:, 1:].values)
    print('Variance contribution rates (2):',
          sum(pca_2.explained_variance_ratio_))
    print('Variance contribution rates (3):',
          sum(pca_3.explained_variance_ratio_))
    print('Variance contribution rates (6):',
          sum(pca_6.explained_variance_ratio_))

    # Fit PCA model with cross-validation
    print('Cross validate scores:',
          cross_validate(pca, df, cv=10))


if __name__ == '__main__':
    start_time = time.time()
    pca_and_visualize('Data.csv', 'PCA10_data.csv', 'Patient index', 'Label')
    end_time = time.time()
    print('Execution time: {:.2f}s'.format(end_time - start_time))
Variance contribution rates (10): 0.8294108531299192
Variance contribution rates (2): 0.32479715122708375
Variance contribution rates (3): 0.41037516493120607
Variance contribution rates (6): 0.611955410437872
Cross validate scores: {'fit_time': array([0.00897574, 0.00897503, 0.00797915, 0.0079782 , 0.00798011,
       0.00897431, 0.00897479, 0.00797939, 0.0079782 , 0.00797892]), 'score_time': array([0.00199556, 0.00099754, 0.0009973 , 0.00099778, 0.00099683,
       0.00099897, 0.0009973 , 0.00099707, 0.00199604, 0.00099707]), 'test_score': array([-19.9893636 , -18.25827548, -17.671087  , -17.41411827,
       -17.77187038, -18.07353759, -18.5401708 , -18.59057024,
       -18.80471495, -19.59787247])}
Execution time: 9.64s

Step 2 [1]: training classifiers in a supervised way (K-Nearest Neighbors algorithm)¶

In [10]:
def knn_classification(input_file, output_file, unit_index, label, k_neighbors):
    """
    :param input_file: str, input data file path
    :param output_file: str, output file path
    :param unit_index: str, column name of unit index
    :param label: str, column name of label
    :param k_neighbors: int, k neighbors nearby
    """

    # Read the input file
    df = pd.read_csv(input_file)

    # Separate the data frame of Patient index, PCs columns
    df_features = df.drop([unit_index, label], axis=1)

    # Separate the data frame with Patient index and Label
    df_labels = df[[unit_index, label]]

    # Fit KNN model
    knn = KNeighborsClassifier(n_neighbors=k_neighbors)
    knn.fit(df_features, df_labels[label])

    # Predict labels
    predicted_labels = knn.predict(df_features)
    df_labels['Predicted Label'] = predicted_labels

    # Export the new data frame with predicted labels as output file
    df_labels.to_csv(output_file, index=False)

    # Compute the confusion matrix
    tp = ((df_labels['Predicted Label'] == 1) & (df_labels[label] == 1)).sum()
    tn = ((df_labels['Predicted Label'] == 0) & (df_labels[label] == 0)).sum()
    fp = ((df_labels['Predicted Label'] == 1) & (df_labels[label] == 0)).sum()
    fn = ((df_labels['Predicted Label'] == 0) & (df_labels[label] == 1)).sum()

    # Compute the accuracy, precision, and recall
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    F1_score = 2 * (precision * recall / (precision + recall))

    # Draw KNN scatter plot with predicted labels
    plt.figure(figsize=(10, 8))
    fig, ax = plt.subplots()
    groups = df_labels.groupby('Predicted Label')
    for name, group in groups:
        ax.scatter(df.loc[group.index, 'PC1'], df.loc[group.index, 'PC2'], label=name)
    ax.legend()
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title('KNN Classification')
    plt.show()

    # Compute confusion matrix
    cm = confusion_matrix(df_labels[label], df_labels['Predicted Label'])

    # Plot confusion matrix using seaborn heatmap
    plt.figure(figsize=(5, 5))
    sns.heatmap(cm, annot=True, cmap='Blues')
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.title('Confusion Matrix')
    plt.show()

    # print out the accuracy, precision, recall, F1 score of KNN classifier model
    print('Accuracy:', accuracy)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1 score:', F1_score)

    # Fit KNN model with cross-validation
    scores = cross_validate(knn, df_features, df_labels[label], cv=10)
    print("Cross validate scores:", scores)


if __name__ == '__main__':
    start_time = time.time()
    knn_classification('PCA10_data.csv', 'new_knn_data.csv', 'Patient index', 'Label', 47)
    end_time = time.time()
    print('Execution time: {:.2f}s'.format(end_time - start_time))
C:\Users\Scort\AppData\Local\Temp\ipykernel_30896\1958426232.py:25: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_labels['Predicted Label'] = predicted_labels
<Figure size 1000x800 with 0 Axes>
Accuracy: 0.6243416102332581
Precision: 0.6045510455104551
Recall: 0.4206247325631151
F1 score: 0.49608882159979817
Cross validate scores: {'fit_time': array([0.00997424, 0.01196909, 0.00897527, 0.00897622, 0.00897574,
       0.00897527, 0.00897551, 0.00897598, 0.01097059, 0.00997329]), 'score_time': array([0.06482625, 0.05385661, 0.05526328, 0.05485344, 0.05186129,
       0.05385709, 0.05585003, 0.0548532 , 0.05086446, 0.05485249]), 'test_score': array([0.54887218, 0.57330827, 0.54135338, 0.58458647, 0.54699248,
       0.67481203, 0.68926554, 0.63653484, 0.48964218, 0.47269303])}
Execution time: 1.62s

Step 2 [2]: training classifiers in a supervised way (Support Vector Machine algorithm)¶

In [11]:
def svm_classification(input_file, output_file, unit_index, label, C=1.0, kernel='linear', gamma='scale'):
    """
    :param input_file: str, input data file path
    :param output_file: str, output file path
    :param unit_index: str, column name of unit index
    :param label: str, column name of label
    :param C: float, degree of punishment for controlling classification errors
    :param kernel: str, kernel function which can improve the accuracy and generalization ability of the classifier
    :param gamma: {'scale', 'auto'} or float, affects the Gaussian kernel function
    """

    # Read input data file
    df = pd.read_csv(input_file)

    # Separate the data frame of Patient index, PCs columns
    df_features = df.drop([unit_index, label], axis=1)

    # Separate the data frame with Patient index and Label
    df_labels = df[[unit_index, label]]

    # Fit SVM model
    svm = SVC(C=C, kernel=kernel, gamma=gamma)
    svm.fit(df_features, df_labels[label])

    # Predict labels
    df_labels['Predicted Label'] = svm.predict(df_features)

    # Export the new data frame with predicted labels as output file
    df_labels.to_csv(output_file, index=False)

    # Compute the confusion matrix
    tp = ((df_labels['Predicted Label'] == 1) & (df_labels[label] == 1)).sum()
    tn = ((df_labels['Predicted Label'] == 0) & (df_labels[label] == 0)).sum()
    fp = ((df_labels['Predicted Label'] == 1) & (df_labels[label] == 0)).sum()
    fn = ((df_labels['Predicted Label'] == 0) & (df_labels[label] == 1)).sum()

    # Compute the accuracy, precision, and recall
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    F1_score = 2 * (precision * recall / (precision + recall))

    # Draw SVM scatter plot with predicted labels
    plt.figure(figsize=(10, 8))
    fig, ax = plt.subplots()
    groups = df_labels.groupby('Predicted Label')
    for name, group in groups:
        ax.scatter(df.loc[group.index, 'PC1'], df.loc[group.index, 'PC2'], label=name)
    ax.legend()
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title('SVM Classification')
    plt.show()

    # Compute confusion matrix
    cm = confusion_matrix(df_labels[label], df_labels['Predicted Label'])

    # Plot confusion matrix using seaborn heatmap
    plt.figure(figsize=(5, 5))
    sns.heatmap(cm, annot=True, cmap='Blues')
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.title('Confusion Matrix')
    plt.show()

    # print out the accuracy, precision, recall, F1 score of SVM classifier model
    print('Accuracy:', accuracy)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1 score:', F1_score)

    # Fit SVM model with cross-validation
    scores = cross_validate(svm, df_features, df_labels[label], cv=10)
    print("Cross validate scores:", scores)


if __name__ == "__main__":
    start_time = time.time()
    svm_classification('PCA10_data.csv', 'new_svm_data.csv', 'Patient index', 'Label', C=1.0, kernel='linear',
                       gamma='scale')
    end_time = time.time()
    print('Execution time: {:.2f}s'.format(end_time - start_time))
C:\Users\Scort\AppData\Local\Temp\ipykernel_30896\1008353979.py:26: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_labels['Predicted Label'] = svm.predict(df_features)
<Figure size 1000x800 with 0 Axes>
Accuracy: 0.6059066967644846
Precision: 0.5735115431348724
Recall: 0.40393667094565683
F1 score: 0.4740145618880241
Cross validate scores: {'fit_time': array([0.71421552, 0.67376304, 0.75099015, 0.68760085, 0.68915629,
       0.68284726, 0.55757976, 0.66826916, 0.70177937, 0.69170809]), 'score_time': array([0.0339098 , 0.03590274, 0.03191471, 0.03291202, 0.03291225,
       0.03391051, 0.03586912, 0.03391004, 0.03287673, 0.03490663]), 'test_score': array([0.54699248, 0.55451128, 0.53007519, 0.58458647, 0.54135338,
       0.72932331, 0.56120527, 0.72693032, 0.47457627, 0.46892655])}
Execution time: 8.76s

Step 2 [3]: training classifiers in a supervised way (Decision Tree algorithm)¶

In [12]:
def dt_classification(input_file, output_file, unit_index, label, max_depth, min_samples_leaf,
                      min_samples_split,
                      max_features):
    """
        :param input_file: str, input data file path
        :param output_file: str, output file path
        :param unit_index: str, column name of unit index
        :param label: str, column name of label
        :param max_depth, int, maximum depth of the tree
        :param min_samples_leaf, int, minimum number of samples for a leaf node
        :param min_samples_split, int, minimum number of samples with internal nodes
        :param max_features, int, maximum number of features considered at each node split
    """

    # Read input data file
    df = pd.read_csv(input_file)

    # Separate the data frame of Patient index, PCs columns
    df_features = df.drop([unit_index, label], axis=1)

    # Separate the data frame with Patient index and Label
    df_labels = df[[unit_index, label]]

    # Fit DT model
    dt = DecisionTreeClassifier(max_depth=max_depth, min_samples_leaf=min_samples_leaf,
                                min_samples_split=min_samples_split, max_features=max_features)
    dt.fit(df_features, df_labels[label])

    # Predict labels
    df_labels['Predicted Label'] = dt.predict(df_features)

    # Export the new data frame with predicted labels
    df_labels.to_csv(output_file, index=False)

    # Compute the confusion matrix
    tp = ((df_labels['Predicted Label'] == 1) & (df_labels[label] == 1)).sum()
    tn = ((df_labels['Predicted Label'] == 0) & (df_labels[label] == 0)).sum()
    fp = ((df_labels['Predicted Label'] == 1) & (df_labels[label] == 0)).sum()
    fn = ((df_labels['Predicted Label'] == 0) & (df_labels[label] == 1)).sum()

    # Compute the accuracy, precision, and recall
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    F1_score = 2 * (precision * recall / (precision + recall))

    # Draw DT scatter plot with predicted labels
    plt.figure(figsize=(10, 8))
    fig, ax = plt.subplots()
    groups = df_labels.groupby('Predicted Label')
    for name, group in groups:
        ax.scatter(df.loc[group.index, 'PC1'], df.loc[group.index, 'PC2'], label=name)
    ax.legend()
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title('DT Classification')
    plt.show()

    # Compute confusion matrix
    cm = confusion_matrix(df_labels[label], df_labels['Predicted Label'])

    # Plot confusion matrix using seaborn heatmap
    plt.figure(figsize=(5, 5))
    sns.heatmap(cm, annot=True, cmap='Blues')
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.title('Confusion Matrix')
    plt.show()

    # print out the accuracy, precision, recall, F1 score of DT classifier model
    print('Accuracy:', accuracy)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1 score:', F1_score)

    # Fit Decision Tree model with cross-validation
    scores = cross_validate(dt, df_features, df_labels[label], cv=10)
    print("Cross validate scores:", scores)


if __name__ == "__main__":
    start_time = time.time()
    dt_classification('PCA10_data.csv', 'new_dt_data.csv', 'Patient index', 'Label', 5, 5, 10, 'sqrt')
    end_time = time.time()
    print('Execution time: {:.2f}s'.format(end_time - start_time))
C:\Users\Scort\AppData\Local\Temp\ipykernel_30896\3175021458.py:30: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_labels['Predicted Label'] = dt.predict(df_features)
<Figure size 1000x800 with 0 Axes>
Accuracy: 0.6202031602708804
Precision: 0.5754985754985755
Recall: 0.5186136071887034
F1 score: 0.5455773126266036
Cross validate scores: {'fit_time': array([0.00698066, 0.00698066, 0.00598407, 0.00706553, 0.00797963,
       0.00897741, 0.0089767 , 0.00797939, 0.0079782 , 0.00897646]), 'score_time': array([0.00199509, 0.00299191, 0.00199485, 0.00199413, 0.00199413,
       0.00299191, 0.00099564, 0.00199461, 0.00099802, 0.00199461]), 'test_score': array([0.53383459, 0.54511278, 0.54511278, 0.58646617, 0.53947368,
       0.69924812, 0.61581921, 0.66101695, 0.48399247, 0.47457627])}
Execution time: 0.50s

Test the performance between each classifier¶

In [13]:
# Read the input file
df = pd.read_csv('PCA10_data.csv')

# Define the range of k values to test
n_range = range(1, 101)

# Separate the data frame of Patient index, PCs columns
df_features = df.drop(['Patient index', 'Label'], axis=1)

# Separate the data frame with Patient index and Label
df_labels = df[['Patient index', 'Label']]

# Initialize the list to store the mean accuracy scores for each n value
knn_mean_scores = []
dt_mean_scores = []
svm_mean_scores = []

# Calculate the mean accuracy scores for each n value
for n in n_range:
    knn = KNeighborsClassifier(n_neighbors=n)
    svm = SVC(C=n)
    dt = DecisionTreeClassifier(max_depth=n)
    knn_scores = cross_val_score(knn, df_features, df_labels['Label'], cv=10, scoring='accuracy')
    svm_scores = cross_val_score(svm, df_features, df_labels['Label'], cv=10, scoring='accuracy')
    dt_scores = cross_val_score(dt, df_features, df_labels['Label'], cv=10, scoring='accuracy')
    knn_mean_scores.append(knn_scores.mean())
    svm_mean_scores.append(svm_scores.mean())
    dt_mean_scores.append(dt_scores.mean())

# Plot the mean accuracy scores against each k value
plt.figure(figsize=(16, 8))
plt.plot(n_range, knn_mean_scores, label='KNN')
plt.plot(n_range, svm_mean_scores, label='SVM')
plt.plot(n_range, dt_mean_scores, label='Decision Tree')
plt.xlabel('Value of parameters (k, C, max_depth)')
plt.ylabel('Cross-validated accuracy')
plt.legend()
plt.show()

Step 3: unsupervised classification¶

In [2]:
def knn_clustering(input_file, output_file, unit_index, k_clusters):
    # Read the input file
    df = pd.read_csv(input_file)

    # Separate the data frame of Patient index, PCs columns
    df_features = df.drop([unit_index], axis=1)

    # Fit KMeans model
    kmeans = KMeans(n_clusters=k_clusters, random_state=0).fit(df_features)

    # Predict labels
    predicted_labels = kmeans.labels_

    # Add classifications to the original dataframe
    df['Classification'] = predicted_labels

    # Export the new data frame with classifications as output file
    df.to_csv(output_file, index=False,
              columns=['Patient index', 'Classification', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8',
                       'PC9', 'PC10'])

    # Draw K-Means scatter plot with classifications
    plt.figure(figsize=(12, 9))
    colors = ['r', 'g', 'b', 'c', 'm']
    for i in range(k_clusters):
        cluster = df[df['Classification'] == i]
        plt.scatter(cluster['PC1'], cluster['PC2'], color=colors[i], label='Type {}'.format(i + 1))
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title('K-Means Clustering')
    plt.legend()
    plt.savefig('kmeans_plot.png')
    plt.show()


if __name__ == '__main__':
    start_time = time.time()
    knn_clustering('PCA10_data.csv', 'new_kmeans_data.csv', 'Patient index', 5)
    end_time = time.time()
    print('Execution time: {:.2f}s'.format(end_time - start_time))
C:\Users\Scort\PycharmProjects\pythonProject\venv\lib\site-packages\sklearn\cluster\_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
Execution time: 0.89s

The end of this analytics¶