1. DataFrame 基本操作
/**
 * Walks through basic untyped DataFrame operations on `people.json`:
 * show, printSchema, select, cast, column arithmetic, filter and groupBy.
 *
 * The tables in the comments are the outputs recorded in the original
 * listing for its sample file; they are illustrative, not verified here.
 *
 * @param args command-line arguments (unused)
 */
def main(args: Array[String]): Unit = {
  val spark = SparkSession.builder()
    .appName("test")
    .master("local[*]")
    .getOrCreate()

  // The session is released in `finally` so that a failure in any step
  // (e.g. the JSON file cannot be loaded) does not leave it running.
  try {
    // Brings the $"columnName" syntax used below into scope.
    import spark.implicits._

    val people = spark.read.format("json").load("people.json")

    people.show()
    // +----+-------+
    // | age|   name|
    // +----+-------+
    // |null|Michael|
    // |  30|   Andy|
    // |  19| Justin|
    // +----+-------+

    people.printSchema()
    // root
    //  |-- age: long (nullable = true)
    //  |-- name: string (nullable = true)

    // Project a single column.
    people.select($"name").show()
    // +-------+
    // |   name|
    // +-------+
    // |Michael|
    // |   Andy|
    // | Justin|
    // +-------+

    // Cast `age` to string while keeping the column name.
    people.select($"name", $"age".cast("string").as("age")).printSchema()
    // root
    //  |-- name: string (nullable = true)
    //  |-- age: string (nullable = true)

    // Column arithmetic; the recorded output shows null staying null.
    people.select($"name", ($"age" + 1).as("age")).show()
    // +-------+----+
    // |   name| age|
    // +-------+----+
    // |Michael|null|
    // |   Andy|  31|
    // | Justin|  20|
    // +-------+----+

    // Row filter; the recorded output omits the row whose age is null.
    people.filter($"age" > 21).show()
    // +---+----+
    // |age|name|
    // +---+----+
    // | 30|Andy|
    // +---+----+

    // Count rows per distinct age (null forms its own group in the recorded output).
    people.groupBy("age").count().show()
    // +----+-----+
    // | age|count|
    // +----+-----+
    // |  19|    1|
    // |null|    1|
    // |  30|    1|
    // +----+-----+
  } finally {
    spark.stop()
  }
}
原文地址:https://www.cnblogs.com/jason-dong/p/9864977.html
时间: 2024-11-08 22:35:30