package com.bjsxt.spark.transformations; import java.io.Serializable; import java.util.Arrays; import java.util.Comparator; import java.util.Iterator; import java.util.List; import org.apache.cassandra.cli.CliParser.newColumnFamily_return; import org.apache.spark.Partitioner; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.Function2; import scala.Tuple2; class MySort implements Serializable,Comparator<Integer>{ /** * */ private static final long serialVersionUID = 1L; @Override public int compare(Integer o1, Integer o2) { return o2-o1; } } public class Operator_repartitionAndSortWithinPartition { public static void main(String[] args) { SparkConf sparkConf = new SparkConf().setAppName("RepartitionAndSortWithinPartitionsOperator").setMaster("local"); JavaSparkContext sc = new JavaSparkContext(sparkConf); List<Tuple2<Integer,Integer>> list = Arrays.asList( new Tuple2<Integer,Integer>(2, 3), new Tuple2<Integer,Integer>(1, 2), new Tuple2<Integer,Integer>(6, 7), new Tuple2<Integer,Integer>(3, 4), new Tuple2<Integer,Integer>(5, 6), new Tuple2<Integer,Integer>(4, 5) ); JavaPairRDD<Integer,Integer> rdd = sc.parallelizePairs(list,1); JavaPairRDD<Integer, Integer> rdd1 = rdd.repartitionAndSortWithinPartitions(new Partitioner() { /** * */ private static final long serialVersionUID = 1L; /** * 设置分区数据 * 0 1 2 * @return */ @Override public int numPartitions() { return 3; } @Override public int getPartition(Object key) { return Integer.valueOf(key+"") % numPartitions(); } },new MySort()); System.out.println("rdd1.partitions().size():" + rdd1.partitions().size()); rdd1.mapPartitionsWithIndex(new Function2<Integer, Iterator<Tuple2<Integer,Integer>>, Iterator<Tuple2<Integer,Integer>>>() { /** * */ private static final long serialVersionUID = 1L; @Override public Iterator<Tuple2<Integer, Integer>> call(Integer v1, Iterator<Tuple2<Integer, Integer>> v2) throws Exception { while(v2.hasNext()){ System.out.println("partitionId:" + v1 + " value:" + v2.next()); } return v2; } }, true).count(); } }
原文地址:https://www.cnblogs.com/huiandong/p/9194809.html
时间: 2024-10-13 21:08:44