The following shows the WordCount program implemented with three different Spark data types: RDD, DataFrame, and Dataset (the Dataset version is shown in two variants).
Input file data.txt (one comma-separated pair of words per line):

a,b
c,a
d,b
a,d
Entry point: SparkContext
Read API: sc.textFile("./data/dataset/data.txt")
Data type: RDD
import org.apache.spark.{SparkConf, SparkContext}

object WordCount {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("wc").setMaster("local[*]"))
    sc.textFile("./data/dataset/data.txt")  // RDD[String], one element per line
      .flatMap(_.split(","))                // split each line into words
      .map((_, 1))                          // pair each word with a count of 1
      .reduceByKey(_ + _)                   // sum the counts per word
      .foreach(println)
    sc.stop()
  }
}

Output:
(d,2)
(a,3)
(b,2)
(c,1)
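A note on the final action: with master local[*] the foreach(println) above prints in the driver's console, but on a real cluster foreach runs on the executors and the output ends up in the executor logs. A minimal sketch of the common alternative, reusing the sc from the example above, is to collect the (small) result back to the driver first:

sc.textFile("./data/dataset/data.txt")
  .flatMap(_.split(","))
  .map((_, 1))
  .reduceByKey(_ + _)
  .collect()          // bring the small result set back to the driver
  .foreach(println)   // now prints on the driver, regardless of the master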
Entry point: SparkSession
Read API: spark.read.text("./src/main/data/data.txt")
Data type: DataFrame
import org.apache.spark.sql.SparkSession

object WordCount {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .master("local[*]")
      .appName("Read_Write")
      .getOrCreate()
    import spark.implicits._

    // Each line holds two comma-separated words: read the file twice,
    // keep the first column in one pass and the second column in the other,
    // then union the two single-column results before counting.
    val words = spark.read.text("./src/main/data/data.txt")
      .map(line => line.getString(0).split(",")(0))
      .union(
        spark.read.text("./src/main/data/data.txt")
          .map(line => line.getString(0).split(",")(1))
      )

    words.groupBy("value")
      .count()
      .show()

    spark.stop()
  }
}

Output:
+-----+-----+
|value|count|
+-----+-----+
|    d|    2|
|    c|    1|
|    b|    2|
|    a|    3|
+-----+-----+
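For comparison only (not one of the original three variants), the same DataFrame word count can be written without the double read and union by using the built-in split and explode column functions. A minimal sketch, assuming the spark session and input path from the example above:

import org.apache.spark.sql.functions.{col, explode, split}

spark.read.text("./src/main/data/data.txt")
  .select(explode(split(col("value"), ",")).as("word"))  // one row per word
  .groupBy("word")
  .count()
  .show()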
Entry point: SparkSession
Read API: spark.read.textFile("./src/main/data/data.txt")
Data type: Dataset
import org.apache.spark.sql.SparkSession

object WordCount {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .master("local[*]")
      .appName("Read_Write")
      .getOrCreate()
    import spark.implicits._

    // spark.read.textFile returns a Dataset[String] directly
    spark.read.textFile("./src/main/data/data.txt")
      .flatMap(value => value.split(","))  // split each line into words
      .groupByKey(_.toLowerCase)           // group by the (lower-cased) word
      .count()                             // Dataset[(String, Long)]
      .show()

    spark.stop()
  }
}

Output:
+-----+--------+
|value|count(1)|
+-----+--------+
|    d|       2|
|    c|       1|
|    b|       2|
|    a|       3|
+-----+--------+
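The count column in the result above comes back named count(1). If friendlier column names or sorted output are wanted, one option (a sketch reusing the same session and spark.implicits._ as above) is to convert the typed result back into a DataFrame:

import org.apache.spark.sql.functions.desc

spark.read.textFile("./src/main/data/data.txt")
  .flatMap(_.split(","))
  .groupByKey(_.toLowerCase)
  .count()                 // Dataset[(String, Long)]
  .toDF("word", "count")   // rename value / count(1)
  .orderBy(desc("count"))
  .show()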
Entry point: SparkSession
Read API: spark.read.text("./src/main/data/data.txt").as[String] (converts the DataFrame into a Dataset[String])
Data type: Dataset
// Dataset, version 2: read as a DataFrame, then convert it with .as[String]
import org.apache.spark.sql.SparkSession

object WordCount {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .master("local[*]")
      .appName("Read_Write")
      .getOrCreate()
    import spark.implicits._

    // .as[String] turns the single-column DataFrame into a Dataset[String]
    val data_3 = spark.read.text("./src/main/data/data.txt").as[String]
    val words = data_3.flatMap(value => value.split(","))
    val groupedWords = words.groupByKey(_.toLowerCase)
    val counts = groupedWords.count()
    counts.show()

    spark.stop()
  }
}
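To compile and run any of the variants above, the project needs spark-core and spark-sql on the classpath. A minimal build.sbt sketch; the Scala and Spark versions below are assumptions and should be adjusted to your environment:

// build.sbt (versions are illustrative, not prescribed by the examples above)
scalaVersion := "2.12.18"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % "3.3.0",
  "org.apache.spark" %% "spark-sql"  % "3.3.0"
)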