1. 创建一个 Spark SQLContext
# Create a SQLContext from the existing SparkContext `sc` (provided by the
# PySpark shell / notebook), load a JSON file into a DataFrame, and display it.
# Fix: the original fused the import and the assignment onto one line,
# which is a syntax error.
from pyspark.sql import SQLContext

sqlContext = SQLContext(sc)
df = sqlContext.read.json('./filename.json')
df.show()
# Column-wise summary statistics over an RDD of numeric vectors.
# Fixes: `Stastics`/`Statics` -> `Statistics`, stray `f` in the assignment,
# and five statements fused onto a single line.
from pyspark.mllib.stat import Statistics

dataMatrix = sc.parallelize([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])
summary = Statistics.colStats(dataMatrix)
print(summary.mean())         # per-column mean
print(summary.variance())     # per-column variance
print(summary.numNonzeros())  # per-column count of non-zero entries
# Train a binary decision-tree classifier and persist it.
# Fixes: `DdecisionTreeModel` -> `DecisionTreeModel`, `toDebugStrinh` ->
# `toDebugString`, and `parsedData` was used without being defined (only the
# raw `data` RDD was loaded). `trainClassifier` also requires the
# `categoricalFeaturesInfo` argument ({} = all features continuous).
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils

# trainClassifier expects an RDD of LabeledPoint; load the data in LibSVM
# format via MLUtils (which the original imported but never used).
# NOTE(review): assumes data.txt is in LibSVM format — confirm; otherwise
# parse the raw lines into LabeledPoint objects with a map step.
parsedData = MLUtils.loadLibSVMFile(sc, "data.txt")
model = DecisionTree.trainClassifier(parsedData, numClasses=2,
                                     categoricalFeaturesInfo={})
print(model.toDebugString())
model.save(sc, 'decisionTreeModel')
# K-means clustering over points parsed from whitespace-separated text lines.
# Fix: `line.splie()` -> `line.split()`.
from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array

data = sc.textFile("data.txt")
# Parse each line "x y z ..." into a numpy vector of floats.
parsedData = data.map(lambda line: array([float(x) for x in line.split()]))
clusters = KMeans.train(parsedData, k=3)
print(clusters.centers)  # the k learned cluster centers