Project author: Sudev18

Project description:
Credit Card Fraud Detection algorithm using SMOTE, confusion matrix, correlation matrix, density plots and a ROC-AUC curve. Models: Logistic Regression, KNN, Isolation Forest.
Language:
Project URL: git://github.com/Sudev18/credit-card.git
Created: 2020-12-11T00:16:31Z
Project community: https://github.com/Sudev18/credit-card

License:



credit-card

Credit Card Fraud Detection algorithm using SMOTE, confusion matrix, correlation matrix, density plots and a ROC-AUC curve.
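As a quick orientation, below is a minimal, self-contained sketch of the core idea (oversample the minority class with SMOTE, fit a logistic regression, evaluate with ROC-AUC). It uses synthetic data so it runs without the Kaggle CSV; the full script that follows does the same thing on creditcard.csv.

# Minimal sketch: SMOTE + logistic regression on synthetic imbalanced data
# (illustrative only; the real script below works on the Kaggle creditcard.csv).
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=5000, n_features=20,
                           weights=[0.98, 0.02], random_state=0)  # ~2% "fraud"
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)

X_res, y_res = SMOTE(random_state=0).fit_resample(X_tr, y_tr)  # balance the classes
clf = LogisticRegression(max_iter=1000).fit(X_res, y_res)
print("ROC-AUC:", roc_auc_score(y_te, clf.predict_proba(X_te)[:, 1]))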

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Created on Thu Dec 10 17:02:55 2020

@author: sudevpradhan
CREDIT CARD FRAUD DETECTION
Data sets from https://www.kaggle.com/mlg-ulb/creditcardfraud
"""
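# Note: the script expects creditcard.csv (downloaded from the Kaggle link above)
# in the working directory. Its columns are Time, V1-V28 (anonymised PCA
# components), Amount, and the target Class (1 = fraud, 0 = valid).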

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, auc, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import plot_confusion_matrix  # requires scikit-learn < 1.2 (later replaced by ConfusionMatrixDisplay)
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, roc_curve, accuracy_score, precision_recall_curve, classification_report, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import IsolationForest

from xgboost import XGBClassifier

# Other libraries

from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV, KFold, StratifiedKFold, train_test_split
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.model_selection import StratifiedShuffleSplit as sss
import warnings
warnings.filterwarnings("ignore")

# Load the dataset

data = pd.read_csv('creditcard.csv')

df = pd.read_csv('creditcard.csv')
print(df.shape)
df.head()

df.info()

df.describe()

class_names = {0: 'Not Fraud', 1: 'Fraud'}
print(df.Class.value_counts().rename(index = class_names))

fig = plt.figure(figsize = (15, 12))

def imbalance():
    # Print the value counts of frauds and non-frauds in the data
    print(data['Class'].value_counts())
    # Calculate the percentage of fraud and non-fraud transactions
    print('Valid Transactions: ', round(data['Class'].value_counts()[0] / len(data) * 100, 2), '% of the dataset')
    print('Fraudulent Transactions: ', round(data['Class'].value_counts()[1] / len(data) * 100, 2), '% of the dataset')
    # Visualise the class imbalance
    colors = ['blue', 'red']
    sns.countplot(x='Class', data=data, palette=colors)

def data_visualisation():
    # Distribution of the amount and time in the data set
    fig, ax = plt.subplots(1, 2, figsize=(16, 4))
    fig.suptitle('Distribution', fontsize=16)
    amount_val = df['Amount'].values
    time_val = df['Time'].values
    colors = ["#0101DF", "#DF0101"]
    sns.distplot(amount_val, ax=ax[0], color=colors[0])
    ax[0].set_title('Distribution of Transaction Amount')
    ax[0].set_xlim([min(amount_val), max(amount_val)])
    sns.distplot(time_val, ax=ax[1], color=colors[1])
    ax[1].set_title('Distribution of Transaction Time')
    ax[1].set_xlim([min(time_val), max(time_val)])
    plt.show()

    # Fraud amount vs non-fraud amount
    fig, ax = plt.subplots(1, 2, figsize=(16, 4), sharex=True)
    fig.suptitle('Amount/transaction', fontsize=16)
    colors = ["#0101DF", "#DF0101"]
    sns.distplot(df[df['Class'] == 1].Amount, ax=ax[0], color=colors[0])
    plt.xlabel('Amount')
    plt.ylabel('Number of Transactions')
    ax[0].set_title('Distribution of Transaction Amount (Fraud)')
    sns.distplot(df[df['Class'] == 0].Amount, ax=ax[1], color=colors[1])
    ax[1].set_title('Distribution of Transaction Amount (Valid)')
    plt.xlabel('Amount')
    plt.ylabel('Number of Transactions')
    plt.xlim((0, 20000))
    plt.yscale('log')
    plt.show()

    # Scatter plot of fraudulent and non-fraudulent transactions against time
    fig, ax = plt.subplots(1, 2, figsize=(16, 4), sharex=True)
    fig.suptitle('Time of transaction vs Amount', fontsize=16)
    colors = ["#0101DF", "#DF0101"]
    ax[0].scatter(df[df['Class'] == 1].Time, df[df['Class'] == 1].Amount)
    ax[0].set_title('Fraud')
    plt.xlabel('Time')
    plt.ylabel('Amount')
    ax[1].scatter(df[df['Class'] == 0].Time, df[df['Class'] == 0].Amount)
    ax[1].set_title('Valid')
    plt.xlabel('Time')
    plt.ylabel('Amount')
    plt.show()

def plotCorrelationMatrix():
    features = df.columns.values
    correlation_matrix = df.corr()
    fig = plt.figure(figsize=(12, 8))
    fig.suptitle('Correlation Plot', fontsize=16)
    sns.heatmap(correlation_matrix, vmax=0.8, square=True)
    plt.show()
    # Pairs are sorted by ascending absolute correlation, so the head holds the
    # least correlated pairs and the tail the most correlated ones.
    correlations = df[features].corr().abs().unstack().sort_values(kind="quicksort").reset_index()
    correlations = correlations[correlations['level_0'] != correlations['level_1']]
    print("Least correlated feature pairs")
    print(correlations.head(5))
    print("\nMost correlated feature pairs")
    print(correlations.tail(5))

def preprocessing(X, y):
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    print("Transactions in X_train dataset: ", X_train.shape)
    print("Transaction classes in y_train dataset: ", y_train.shape)
    print("Transactions in X_test dataset: ", X_test.shape)
    print("Transaction classes in y_test dataset: ", y_test.shape)
    # Feature scaling: fit the scalers on the training set only, then apply them to the test set
    scaler_amount = StandardScaler()
    scaler_time = StandardScaler()
    X_train['normAmount'] = scaler_amount.fit_transform(X_train['Amount'].values.reshape(-1, 1))
    X_test['normAmount'] = scaler_amount.transform(X_test['Amount'].values.reshape(-1, 1))
    X_train['normTime'] = scaler_time.fit_transform(X_train['Time'].values.reshape(-1, 1))
    X_test['normTime'] = scaler_time.transform(X_test['Time'].values.reshape(-1, 1))
    X_train = X_train.drop(['Time', 'Amount'], axis=1)
    X_test = X_test.drop(['Time', 'Amount'], axis=1)
    X_train.head()
    return X_train, X_test, y_train, y_test

def distribution():
    raw_df = data
    cleaned_df = raw_df.copy()
    # The `Time` column is not needed here.
    cleaned_df.pop('Time')
    # The `Amount` column covers a huge range; convert it to log-space.
    eps = 0.001  # 0 => 0.1 cent
    cleaned_df['Log Amount'] = np.log(cleaned_df.pop('Amount') + eps)
    # Split and shuffle the dataset.
    train_df, test_df = train_test_split(cleaned_df, test_size=0.2)
    train_df, val_df = train_test_split(train_df, test_size=0.2)
    # Form numpy arrays of labels and features.
    train_labels = np.array(train_df.pop('Class'))
    bool_train_labels = train_labels != 0
    val_labels = np.array(val_df.pop('Class'))
    test_labels = np.array(test_df.pop('Class'))
    train_features = np.array(train_df)
    val_features = np.array(val_df)
    test_features = np.array(test_df)
    scaler = StandardScaler()
    train_features = scaler.fit_transform(train_features)
    val_features = scaler.transform(val_features)
    test_features = scaler.transform(test_features)
    train_features = np.clip(train_features, -5, 5)
    val_features = np.clip(val_features, -5, 5)
    test_features = np.clip(test_features, -5, 5)
    print('Training labels shape:', train_labels.shape)
    print('Validation labels shape:', val_labels.shape)
    print('Test labels shape:', test_labels.shape)
    print('Training features shape:', train_features.shape)
    print('Validation features shape:', val_features.shape)
    print('Test features shape:', test_features.shape)
    # Density plots of two features for the fraud (positive) and valid (negative) classes
    pos_df = pd.DataFrame(train_features[bool_train_labels], columns=train_df.columns)
    neg_df = pd.DataFrame(train_features[~bool_train_labels], columns=train_df.columns)
    sns.jointplot(x=pos_df['V5'], y=pos_df['V6'], kind='hex', xlim=(-5, 5), ylim=(-5, 5))
    plt.suptitle("Positive distribution")
    sns.jointplot(x=neg_df['V5'], y=neg_df['V6'], kind='hex', xlim=(-5, 5), ylim=(-5, 5))
    _ = plt.suptitle("Negative distribution")

def using_smote(X_train, X_test, y_train, y_test):
    print("Before over-sampling:\n", y_train['Class'].value_counts())
    sm = SMOTE()
    X_train_res, y_train_res = sm.fit_resample(X_train, y_train['Class'])
    print("After over-sampling:\n", y_train_res.value_counts())
    # Build the model: grid-search a logistic regression over penalty and C
    # (the liblinear solver supports both the l1 and l2 penalties)
    parameters = {"penalty": ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
    lr = LogisticRegression(solver='liblinear')
    clf = GridSearchCV(lr, parameters, cv=5, verbose=5, n_jobs=3)
    k = clf.fit(X_train_res, y_train_res)
    print(k.best_params_)
    # Evaluate the model
    lr_gridcv_best = clf.best_estimator_
    y_test_pre = lr_gridcv_best.predict(X_test)
    cnf_matrix_test = confusion_matrix(y_test, y_test_pre)
    print("Recall metric in the test dataset:", cnf_matrix_test[1, 1] / (cnf_matrix_test[1, 0] + cnf_matrix_test[1, 1]))
    y_train_pre = lr_gridcv_best.predict(X_train_res)
    cnf_matrix_train = confusion_matrix(y_train_res, y_train_pre)
    print("Recall metric in the train dataset:", cnf_matrix_train[1, 1] / (cnf_matrix_train[1, 0] + cnf_matrix_train[1, 1]))
    print(classification_report(y_test, y_test_pre))
    return k, X_test, y_test, X_train_res, y_train_res
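# For reference, sklearn's confusion_matrix lays out rows as the true class and
# columns as the predicted class, i.e. [[TN, FP], [FN, TP]], so the manual recall
# above is TP / (TP + FN). With a hypothetical matrix [[85280, 15], [30, 118]]:
#   recall = 118 / (30 + 118) ≈ 0.80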

def confusion(k, X_test, y_test, X_train_res, y_train_res):
    # Visualise the confusion matrices
    plt.style.use('seaborn')
    class_names = ['Not Fraud', 'Fraud']
    plot_confusion_matrix(k, X_test, y_test, values_format='.5g', display_labels=class_names)
    plt.title("Test data Confusion Matrix")
    plt.show()
    plot_confusion_matrix(k, X_train_res, y_train_res, values_format='.5g', display_labels=class_names)
    plt.title("Oversampled Train data Confusion Matrix")
    plt.show()

def ROC(X_test, y_test):
    # `k` is the fitted GridSearchCV returned by using_smote(), taken from module scope
    y_k = k.decision_function(X_test)
    fpr, tpr, thresholds = roc_curve(y_test, y_k)
    roc_auc = auc(fpr, tpr)
    print("ROC-AUC:", roc_auc)
    # Visualise the ROC curve
    plt.style.use('seaborn')
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label='AUC = %0.3f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([-0.1, 1.0])
    plt.ylim([-0.1, 1.01])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

def plotScatterMatrix(df, plotSize, textSize):
    df = df.select_dtypes(include=[np.number])  # keep only numerical columns
    # Remove columns that would lead to df being singular
    df = df.dropna(axis='columns')
    df = df[[col for col in df if df[col].nunique() > 1]]  # keep columns with more than 1 unique value
    columnNames = list(df)
    if len(columnNames) > 10:  # reduce the number of columns for the kernel density plots
        columnNames = columnNames[:10]
    df = df[columnNames]
    ax = pd.plotting.scatter_matrix(df, alpha=0.75, figsize=[plotSize, plotSize], diagonal='kde')
    corrs = df.corr().values
    for i, j in zip(*np.triu_indices_from(ax, k=1)):
        ax[i, j].annotate('Corr. coef = %.3f' % corrs[i, j], (0.8, 0.2), xycoords='axes fraction', ha='center', va='center', size=textSize)
    plt.suptitle('Scatter and Density Plot')
    plt.show()

def regression():
    # Grid-search a logistic regression (liblinear supports both l1 and l2 penalties)
    log_reg_params = {"penalty": ['l1', 'l2'], 'C': [0.01, 0.1, 1]}
    grid_log_reg = GridSearchCV(LogisticRegression(solver='liblinear'), log_reg_params)
    grid_log_reg.fit(X_train, y_train)
    log_reg = grid_log_reg.best_estimator_
    log_reg_score = cross_val_score(log_reg, X_train, y_train, cv=5)
    print('Logistic Regression Cross Validation Score: ', str(round(log_reg_score.mean() * 100, 2)) + '%')
    from sklearn.model_selection import ShuffleSplit
    from sklearn.model_selection import learning_curve
    '''def plot_learning_curve(estimator1, X, y, ylim=None, cv=None,
                               n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
        f, ((ax1)) = plt.subplots(1, 1, figsize=(16, 8), sharey=True)
        if ylim is not None:
            plt.ylim(*ylim)
        train_sizes, train_scores, test_scores = learning_curve(
            estimator1, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)
        ax1.fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1,
                         color="#ff9124")
        ax1.fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1, color="#2492ff")
        ax1.plot(train_sizes, train_scores_mean, 'o-', color="#ff9124",
                 label="Training score")
        ax1.plot(train_sizes, test_scores_mean, 'o-', color="#2492ff",
                 label="Cross-validation score")
        ax1.set_title("Logistic Regression Learning Curve", fontsize=14)
        ax1.set_xlabel('Training size (m)')
        ax1.set_ylabel('Score')
        ax1.grid(True)
        ax1.legend(loc="best")
        return plt'''
    cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=21)
    # plot_learning_curve(log_reg, X_train, y_train, (0.87, 1.01), cv=cv, n_jobs=4)
    log_reg_pred = cross_val_predict(log_reg, X_train, y_train, cv=5,
                                     method="decision_function")
    print('Logistic Regression: ', roc_auc_score(y_train, log_reg_pred))

def Isolation_forest_algorithm():
    classifiers = {
        "Isolation Forest": IsolationForest(n_estimators=100, max_samples=len(X), verbose=0)}
    for i, (clf_name, clf) in enumerate(classifiers.items()):
        clf.fit(X_train)
        scores_prediction = clf.decision_function(X_train)
        y_pred = clf.predict(X_train)
        # IsolationForest labels inliers as 1 and outliers as -1; remap to 0 = valid, 1 = fraud
        y_pred[y_pred == 1] = 0
        y_pred[y_pred == -1] = 1
        n_errors = (y_pred != y_train).sum()
        print("{}: {}".format(clf_name, n_errors))
        print("Accuracy Score :")
        print(accuracy_score(y_train, y_pred))
        print("Classification Report :")
        print(classification_report(y_train, y_pred))
        print('\n')
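# The anomaly scores from decision_function above are computed but not used further.
# A minimal sketch (not called anywhere; it assumes the module-level X_train / y_train
# built in the main section below): the scores can also be summarised with ROC-AUC.
# Lower decision_function values mean "more anomalous", so the sign is flipped to
# rank fraudulent transactions highest.
def isolation_forest_auc_sketch():
    iso = IsolationForest(n_estimators=100, random_state=0).fit(X_train)
    anomaly_score = -iso.decision_function(X_train)
    print("Isolation Forest ROC-AUC:", roc_auc_score(np.ravel(y_train), anomaly_score))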

def knn():
    knn = KNeighborsClassifier(n_neighbors=5, n_jobs=16)
    k1 = knn.fit(X_train, y_train)
    print("")
    print("knn classifier created")
    score = knn.score(X_test, y_test)
    print("knn model score-")
    print(score)
    pred = knn.predict(X_test)
    print(classification_report(y_test, pred))
    matrix = confusion_matrix(y_test, pred)
    print(matrix)
    plt.figure(figsize=(10, 7))
    sns.heatmap(matrix, annot=True)
    prob = knn.predict_proba(X_test)
    fpr, tpr, thresholds = roc_curve(y_test, prob[:, 1])
    plt.figure(figsize=(10, 6))
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.plot(fpr, tpr, marker='.')
    plt.annotate('Minimum ROC Score of 50% \n (This is the minimum score to get)', xy=(0.5, 0.5), xytext=(0.6, 0.3),
                 arrowprops=dict(facecolor='#6E726D', shrink=0.05))
    plt.title("ROC curve")
    plt.xlabel('false positive rate')
    plt.ylabel('true positive rate')
    plt.show()

# MAIN FUNCTION

# Exploring the Class column
X = data.loc[:, data.columns != 'Class']
y = data.loc[:, data.columns == 'Class']

# Read the first 1000 rows for the scatter and density matrix
nRowsRead = 1000
df1 = pd.read_csv('creditcard.csv', delimiter=',', nrows=nRowsRead)
df1.dataframeName = 'creditcard.csv'
nRow, nCol = df1.shape
print(f'There are {nRow} rows and {nCol} columns')

# FUNCTION CALLS

imbalance()
X_train, X_test, y_train, y_test = preprocessing(X, y)
k, X_test, y_test, X_train_res, y_train_res = using_smote(X_train, X_test, y_train, y_test)
data_visualisation()
distribution()
plotCorrelationMatrix()
plotScatterMatrix(df1, 20, 10)

confusion(k, X_test, y_test, X_train_res, y_train_res)
ROC(X_test, y_test)
knn()

# Rebuild the splits as clean numpy arrays for the Isolation Forest
X_train, X_test, y_train, y_test = preprocessing(X, y)
X_train = X_train.replace([np.inf, -np.inf], np.nan).fillna(0).values
X_test = X_test.replace([np.inf, -np.inf], np.nan).fillna(0).values
y_train = y_train.replace([np.inf, -np.inf], np.nan).fillna(0).values.ravel()
y_test = y_test.replace([np.inf, -np.inf], np.nan).fillna(0).values.ravel()
Isolation_forest_algorithm()

# Uncomment and run this separately for the regression (learning-curve / cross-validation) plot
# regression()