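Dataset: Pima Indians Diabetes data from https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data
Each row is comma-separated; the session below reads the first 8 columns as numeric features and the last column as the class label.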

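Load it into the medical_data folder in HDFS before starting the Spark shell, e.g.:
  hdfs dfs -mkdir -p medical_data
  hdfs dfs -put pima-indians-diabetes.data medical_data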

scala> import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
scala> import org.apache.spark.mllib.linalg.Vectors
scala> import org.apache.spark.mllib.regression.LabeledPoint
 
Load the raw CSV lines from the medical_data folder and turn each row into a
LabeledPoint: the last column is the label, the first 8 columns are the features.
Blank lines are skipped so a stray empty row cannot fail the toDouble conversion.
The lines starting with "     | " are continuation lines of the single multi-line
map statement; they must not be entered as separate scala> commands, because
`line` only exists inside the closure.

scala> val data = sc.textFile("medical_data")
scala> val parsedData = data.filter(_.trim.nonEmpty).map { line =>
     |   val parts = line.split(",")
     |   LabeledPoint(parts.last.toDouble, Vectors.dense(parts.take(8).map(_.toDouble)))
     | }

Look at the first parsed record as a sanity check (explicit method call instead of
postfix syntax, which needs scala.language.postfixOps).

scala> parsedData.first()

Split 70% / 30% into training and test sets; the fixed seed (11L) keeps the split
repeatable. The training set is cached because the optimizer makes repeated passes
over it.

scala> val splits = parsedData.randomSplit(Array(0.7, 0.3), seed = 11L)
scala> val training = splits(0).cache()
scala> val test = splits(1)
 
Fit a logistic regression model (intercept term enabled) on the training set
using the LBFGS-based trainer.

scala> val lr = new LogisticRegressionWithLBFGS().setIntercept(true)
scala> val model = lr.run(training)

Pair the model's prediction for every test point with that point's true label,
then compute accuracy as (number of pairs where both agree) / (size of the test set).

scala> val predictionAndLabel = test.map { point =>
     |   (model.predict(point.features), point.label)
     | }
scala> val accuracy = predictionAndLabel.filter { case (predicted, actual) =>
     |   predicted == actual
     | }.count().toDouble / test.count()
Top