项目作者: venkateshavula

项目描述:
A UDF to evaluate Spark-MLlib classification model using PySpark
高级语言: Python
项目地址: git://github.com/venkateshavula/Evaluate-Spark-MLlib-using-PySpark.git


Evaluate-Spark-MLlib-using-PySpark

A UDF to evaluate Spark-MLlib classification model using PySpark

Compute Cohen’s kappa coefficient
  1. def kappa(tp, tn, fp, fn):
  2. N = tp+tn+fp+fn
  3. # Probability observed
  4. Po = float(tp+tn)/N
  5. # Probability expected
  6. Pe = float(((tn+fp)*(tn+fn))+((fn+tp)*(fp+tp)))/(N*N)
  7. # Cohen's kappa Coefficient
  8. kappa = float(Po-Pe)/(1-Pe)
  9. return(kappa)
Evaluate the classifier
  1. def evaluate(predictions):
  2. from pyspark.ml.evaluation import BinaryClassificationEvaluator
  3. import math
  4. print "Spam=",predictions[(predictions.label == 1)].count(),";No-Spam",predictions[(predictions.label == 0)].count()
  5. eval = BinaryClassificationEvaluator()
  6. # Area under the ROC curve
  7. auroc = eval.evaluate(predictions,{eval.metricName:"areaUnderROC"})
  8. # Area under the PR curve
  9. aupr = eval.evaluate(predictions,{eval.metricName:"areaUnderPR"})
  10. print "\n The AUROC is %s and the AUPR is %s" %(round(auroc,3), round(aupr,3))
  11. # True Positives
  12. tp= predictions[(predictions.label == 1) & (predictions.prediction == 1)].count()
  13. # True Negatives
  14. tn= predictions[(predictions.label == 0) & (predictions.prediction == 0)].count()
  15. # False Positives
  16. fp= predictions[(predictions.label == 0) & (predictions.prediction == 1)].count()
  17. # False Negatives
  18. fn= predictions[(predictions.label == 1) & (predictions.prediction == 0)].count()
  19. print "\n True Positives= %s; True Negatives= %s; False Positives= %s; False Negatives= %s" %(tp, tn, fp, fn)
  20. # Model Accuracy
  21. accuracy = float(tp+tn)/float(tp+tn+fp+fn)
  22. # Sensitivity/Recall
  23. recall = float(tp)/(tp+fn)
  24. # Specificity
  25. spec = float(tn)/(tn+fp)
  26. # Precision/PPV
  27. precision = float(tp)/float(tp+fp)
  28. # F-measure
  29. fscore = (2*recall*precision)/(recall+precision)
  30. # Matthews correlation coefficient
  31. MCC = (tp * tn - fp * fn) / math.sqrt((tp + fp) * (tp + fn) * (fp + tn) * (tn + fn))
  32. # Cohen's kappa coefficient
  33. cohen_kappa = kappa(tp, tn, fp, fn)
  34. print "\n Accuracy= %s; Sensitivity= %s; Specificity= %s; Precision= %s \n F-measure= %s;
  35. Matthews correlation coefficient= %s; Cohen's Kappa coefficient= %s" %
  36. (round(accuracy*100,2),round(recall*100,2),round(spec*100,2),round(precision*100,2),round(fscore,4),
  37. round(MCC,4),round(cohen_kappa,4))