1.蓄水池抽样算法(Reservoir Sampling)
https://www.jianshu.com/p/7a9ea6ece2af
2. Spark 抽样之蓄水池抽样
https://blog.csdn.net/snaillup/article/details/69524931?utm_source=blogxgwz3
代码:
/**
 * Single-pass reservoir sampling that also reports how many elements were read.
 *
 * The first `k` elements fill the reservoir directly. Every later element (the l-th one
 * read, counting from 1) replaces a randomly chosen reservoir slot with probability k/l.
 * The iterator is always drained completely.
 *
 * @param input iterator over the elements to sample from
 * @param k reservoir size, i.e. the maximum number of samples returned
 * @param seed random seed used for the replacement decisions
 * @return (samples, input size); if the input holds fewer than `k` elements, the samples
 *         array is trimmed to exactly the elements that were read, in encounter order
 */
def reservoirSampleAndCount[T: ClassTag](
    input: Iterator[T],
    k: Int,
    seed: Long = Random.nextLong()): (Array[T], Long) = {
  val samples = new Array[T](k)

  // Phase 1: copy elements straight into the reservoir until it is full or input runs out.
  var filled = 0
  while (filled < k && input.hasNext) {
    samples(filled) = input.next()
    filled += 1
  }

  if (filled >= k) {
    // Phase 2: the reservoir is full, so keep reading and replace entries at random.
    // The generator is only created on this path; short inputs never need it.
    val rng = new XORShiftRandom(seed)
    var seen = filled.toLong
    while (input.hasNext) {
      val candidate = input.next()
      seen += 1
      // `slot` is uniform over [0, seen); it lands inside the reservoir with
      // probability k/seen, which is exactly the chance the candidate must be kept.
      val slot = (rng.nextDouble() * seen).toLong
      if (slot < k) {
        samples(slot.toInt) = candidate
      }
    }
    (samples, seen)
  } else {
    // The input ended before the reservoir filled up: return only the occupied prefix.
    val trimmed = new Array[T](filled)
    Array.copy(samples, 0, trimmed, 0, filled)
    (trimmed, filled.toLong)
  }
}
原文地址:https://www.cnblogs.com/moonlightml/p/10165585.html
时间: 2024-10-05 23:25:23