Using SparkSession:
package com.jdjr.city.demo

import org.apache.spark.sql.SparkSession

/**
  * @Author: hongwei
  * @Date: 2018/11/9 16:31
  * @Description: Using SparkSession
  */
object Test4 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("ActionOperation")
      .master("local")
      .getOrCreate()
    spark.sparkContext.setLogLevel("WARN")
    import spark.implicits._

    val employee = spark.read.json("D:\\jdWork\\code\\MySpark\\src\\main\\resources\\1.txt")

    // collect: pull all records of a distributed dataset (e.g. a Dataset) stored
    // across the cluster back to the driver
    employee.collect().foreach { println(_) }

    // count: count the number of records in the Dataset
    println(employee.count())

    // first: get the first record of the Dataset
    println(employee.first())

    // foreach: operate on every record of the Dataset. Unlike collect, which pulls the
    // data to the driver before operating on it, foreach pushes the computation out to
    // the cluster and runs it distributed. A call like foreach(println(_)) is therefore
    // useless when it actually runs on a cluster: the output lands on the distributed
    // executors, where we cannot see it.
    employee.foreach { println(_) }

    // reduce: aggregate all records of the Dataset, combining many records into one.
    // For example, counting the records with reduce:
    // println(employee.map(employee => 1).reduce(_ + _))

    // show: print the Dataset, the first 20 rows by default
    employee.show()

    // take: get the specified number of records from the Dataset
    employee.take(3).foreach { println(_) }
  }
}
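Note that spark.read.json expects JSON Lines input, i.e. one JSON object per line, and that the commented-out reduce line relies on import spark.implicits._ to supply the Encoder for the mapped values. As a minimal sketch of the same idea on a typed Dataset, assuming a hypothetical input file whose records look like {"name": "Tom", "age": 25} (the Employee case class and its fields are assumptions, not part of the original):

package com.jdjr.city.demo

import org.apache.spark.sql.SparkSession

// Hypothetical schema; spark.read.json infers JSON numbers as Long
case class Employee(name: String, age: Long)

object Test4Typed {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("ActionOperationTyped")
      .master("local")
      .getOrCreate()
    import spark.implicits._

    // as[Employee] turns the untyped DataFrame into a typed Dataset[Employee]
    val employee = spark.read
      .json("D:\\jdWork\\code\\MySpark\\src\\main\\resources\\1.txt")
      .as[Employee]

    // reduce-based count: map every record to 1, then sum
    println(employee.map(_ => 1).reduce(_ + _))

    // reduce can also aggregate a field, e.g. the total age
    println(employee.map(_.age).reduce(_ + _))

    spark.stop()
  }
}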
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.jdjr.city</groupId>
    <artifactId>MySpark</artifactId>
    <version>1.0-SNAPSHOT</version>

    <repositories>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
        </repository>
    </repositories>

    <properties>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <encoding>UTF-8</encoding>
        <scala.version>2.11.8</scala.version>
        <spark.version>2.2.0</spark.version>
        <hadoop.version>2.6.4</hadoop.version>
    </properties>

    <dependencies>
        <!--<dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>-->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.8</version>
        </dependency>
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.5</version>
        </dependency>
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.8.3</version>
        </dependency>
        <dependency>
            <groupId>org.ansj</groupId>
            <artifactId>ansj_seg</artifactId>
            <version>5.0.4</version>
        </dependency>
        <dependency>
            <groupId>com.geccocrawler</groupId>
            <artifactId>gecco</artifactId>
            <version>1.0.8</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
    </dependencies>

    <build>
        <sourceDirectory>src/main/scala</sourceDirectory>
        <testSourceDirectory>src/test/scala</testSourceDirectory>
        <plugins>
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                        <configuration>
                            <args>
                                <!-- not supported by Scala 2.11 -->
                                <!-- <arg>-make:transitive</arg> -->
                                <arg>-dependencyfile</arg>
                                <arg>${project.build.directory}/.scala_dependencies</arg>
                            </args>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.4.3</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                            <transformers>
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <mainClass>com.jdjr.city.demo.SearchPoi2</mainClass>
                                </transformer>
                            </transformers>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
Source: https://blog.csdn.net/qq_22253209/article/details/83900319
http://dblab.xmu.edu.cn/blog/1086-2/ (Xiamen University, Spark Tutorial: Connecting to Hive to Read and Write Data)
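The linked tutorial covers reading and writing Hive tables from Spark. As a minimal sketch of the idea, assuming hive-site.xml is on the classpath and the spark-hive_2.11 dependency has been added (it is not in the pom above); the database and table names are hypothetical:

package com.jdjr.city.demo

import org.apache.spark.sql.SparkSession

object HiveTest {
  def main(args: Array[String]): Unit = {
    // enableHiveSupport() wires Spark SQL to the Hive metastore
    val spark = SparkSession
      .builder()
      .appName("HiveTest")
      .enableHiveSupport()
      .getOrCreate()

    // Read an existing Hive table (hypothetical database and table names)
    spark.sql("SELECT * FROM sparktest.employee").show()

    // Write a query result back to Hive as a new table
    spark.sql("SELECT name, age FROM sparktest.employee WHERE age > 25")
      .write
      .saveAsTable("sparktest.employee_backup")

    spark.stop()
  }
}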