Kafka Source Code Analysis, Part 12: KafkaController (Part 3)

12.6 Listeners inside KafkaController

The KafkaController (the controller leader) manages and maintains topics by registering various listeners on different ZooKeeper paths; the relevant zk paths and their corresponding listeners are shown below.
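
The registration code itself is omitted here; as a rough sketch (zk paths per the 0.8.x layout, the listener variable names are illustrative), the controller subscribes its listeners like this:

// Sketch: which listener watches which zk path (I0Itec ZkClient subscriptions; variable names illustrative)
zkClient.subscribeChildChanges("/brokers/ids", brokerChangeListener)                                  // 12.6.1
zkClient.subscribeChildChanges("/brokers/topics", topicChangeListener)                                // 12.6.2
zkClient.subscribeChildChanges("/admin/delete_topics", deleteTopicsListener)                          // 12.6.3
zkClient.subscribeDataChanges("/admin/preferred_replica_election", preferredReplicaElectionListener)  // 12.6.4
zkClient.subscribeDataChanges("/admin/reassign_partitions", partitionsReassignedListener)             // 12.6.5
zkClient.subscribeDataChanges("/brokers/topics/" + topic, new AddPartitionsListener(topic))           // 12.6.6, one per topic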

12.6.1 brokerChangeListener

/**
 * This is the zookeeper listener that triggers all the state transitions for a replica
 */
class BrokerChangeListener() extends IZkChildListener with Logging {
  this.logIdent = "[BrokerChangeListener on Controller " + controller.config.brokerId + "]: "
  def handleChildChange(parentPath : String, currentBrokerList : java.util.List[String]) {
    info("Broker change listener fired for path %s with children %s".format(parentPath, currentBrokerList.mkString(",")))
    inLock(controllerContext.controllerLock) {
      if (hasStarted.get) {
        ControllerStats.leaderElectionTimer.time {
          try {
            val curBrokerIds = currentBrokerList.map(_.toInt).toSet
            val newBrokerIds = curBrokerIds -- controllerContext.liveOrShuttingDownBrokerIds
            val newBrokerInfo = newBrokerIds.map(ZkUtils.getBrokerInfo(zkClient, _))
            // filter out the newly added brokers
            val newBrokers = newBrokerInfo.filter(_.isDefined).map(_.get)
            // filter out the dead broker ids
            val deadBrokerIds = controllerContext.liveOrShuttingDownBrokerIds -- curBrokerIds
            controllerContext.liveBrokers = curBrokerIds.map(ZkUtils.getBrokerInfo(zkClient, _)).filter(_.isDefined).map(_.get)
            info("Newly added brokers: %s, deleted brokers: %s, all live brokers: %s"
              .format(newBrokerIds.mkString(","), deadBrokerIds.mkString(","), controllerContext.liveBrokerIds.mkString(",")))
            // open communication channels to the new brokers
            newBrokers.foreach(controllerContext.controllerChannelManager.addBroker(_))
            // close communication channels to the dead brokers
            deadBrokerIds.foreach(controllerContext.controllerChannelManager.removeBroker(_))
            if(newBrokerIds.size > 0)
              // move the replicas on the new brokers to the online state and resume any pending topic deletion
              controller.onBrokerStartup(newBrokerIds.toSeq)
            if(deadBrokerIds.size > 0)
              // move the replicas on the dead brokers to the offline state and mark their deletion as failed
              controller.onBrokerFailure(deadBrokerIds.toSeq)
          } catch {
            case e: Throwable => error("Error while handling broker changes", e)
          }
        }
      }
    }
  }
}
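
For reference, ZkUtils.getBrokerInfo above reads each broker's ephemeral registration node under /brokers/ids/[brokerId]; in the 0.8.x format that data looks roughly like the following (all field values here are illustrative):

// Illustrative content of /brokers/ids/1, as parsed by ZkUtils.getBrokerInfo (values are examples only)
val brokerRegistration =
  """{"jmx_port":-1,"timestamp":"1428512949990","host":"broker1.example.com","version":1,"port":9092}"""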

12.6.2 topicChangeListener

class TopicChangeListener extends IZkChildListener with Logging {
  this.logIdent = "[TopicChangeListener on Controller " + controller.config.brokerId + "]: "

  @throws(classOf[Exception])
  def handleChildChange(parentPath : String, children : java.util.List[String]) {
    inLock(controllerContext.controllerLock) {
      if (hasStarted.get) {
        try {
          val currentChildren = {
            import JavaConversions._
            debug("Topic change listener fired for path %s with children %s".format(parentPath, children.mkString(",")))
            (children: Buffer[String]).toSet
          }
          // filter out newly created topics
          val newTopics = currentChildren -- controllerContext.allTopics
          // filter out deleted topics
          val deletedTopics = controllerContext.allTopics -- currentChildren
          controllerContext.allTopics = currentChildren
          // fetch the replica assignment of the new topics
          val addedPartitionReplicaAssignment = ZkUtils.getReplicaAssignmentForTopics(zkClient, newTopics.toSeq)
          // drop the replica assignment of the deleted topics
          controllerContext.partitionReplicaAssignment = controllerContext.partitionReplicaAssignment.filter(p =>
            !deletedTopics.contains(p._1.topic))
          // add the replica assignment of the new topics
          controllerContext.partitionReplicaAssignment.++=(addedPartitionReplicaAssignment)
          info("New topics: [%s], deleted topics: [%s], new partition replica assignment [%s]".format(newTopics,
            deletedTopics, addedPartitionReplicaAssignment))
          if(newTopics.size > 0) // kick off creation of the new topics
            controller.onNewTopicCreation(newTopics, addedPartitionReplicaAssignment.keySet.toSet)
        } catch {
          case e: Throwable => error("Error while handling new topic", e )
        }
      }
    }
  }
}
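
ZkUtils.getReplicaAssignmentForTopics reads each new topic's node at /brokers/topics/[topic]; the data stored there is the partition-to-replicas assignment, roughly of this shape (topic name and broker ids illustrative):

// Illustrative content of /brokers/topics/my-topic: partition id -> assigned replica (broker) ids
val topicAssignment = """{"version":1,"partitions":{"0":[1,2],"1":[2,3],"2":[3,1]}}"""
// ZkUtils.getReplicaAssignmentForTopics turns this into entries such as TopicAndPartition("my-topic", 0) -> Seq(1, 2)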

12.6.3 deleteTopicsListener

class DeleteTopicsListener() extends IZkChildListener with Logging {
  this.logIdent = "[DeleteTopicsListener on " + controller.config.brokerId + "]: "
  val zkClient = controllerContext.zkClient
  /**
   * Invoked when a topic is being deleted
   * @throws Exception On any error.
   */
  @throws(classOf[Exception])
  def handleChildChange(parentPath : String, children : java.util.List[String]) {
    inLock(controllerContext.controllerLock) {
      var topicsToBeDeleted = {
        import JavaConversions._
        (children: Buffer[String]).toSet
      }
      debug("Delete topics listener fired for topics %s to be deleted".format(topicsToBeDeleted.mkString(",")))
      // filter out topics that do not actually exist
      val nonExistentTopics = topicsToBeDeleted.filter(t => !controllerContext.allTopics.contains(t))
      if(nonExistentTopics.size > 0) {
        warn("Ignoring request to delete non-existing topics " + nonExistentTopics.mkString(","))
        nonExistentTopics.foreach(topic => ZkUtils.deletePathRecursive(zkClient, ZkUtils.getDeleteTopicPath(topic)))
      }
      // drop the non-existent topics from the set
      topicsToBeDeleted --= nonExistentTopics
      if(topicsToBeDeleted.size > 0) {
        info("Starting topic deletion for topics " + topicsToBeDeleted.mkString(","))
        // mark topic ineligible for deletion if other state changes are in progress
        topicsToBeDeleted.foreach { topic =>
          val preferredReplicaElectionInProgress =
            controllerContext.partitionsUndergoingPreferredReplicaElection.map(_.topic).contains(topic)
          val partitionReassignmentInProgress =
            controllerContext.partitionsBeingReassigned.keySet.map(_.topic).contains(topic)
          // if any partition of this topic is being reassigned or undergoing preferred replica election, mark the topic ineligible for deletion
          if(preferredReplicaElectionInProgress || partitionReassignmentInProgress)
            controller.deleteTopicManager.markTopicIneligibleForDeletion(Set(topic))
        }
        // hand the topics over to the deleteTopicManager
        controller.deleteTopicManager.enqueueTopicsForDeletion(topicsToBeDeleted)
      }
    }
  }
  @throws(classOf[Exception])
  def handleDataDeleted(dataPath: String) {
  }
}
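
For context, a deletion request is nothing more than a child node created under /admin/delete_topics, which is what fires this listener. A minimal sketch using the ZkUtils helpers already seen above (topic name illustrative):

// Creating /admin/delete_topics/my-topic asks the controller to delete the topic;
// DeleteTopicsListener.handleChildChange then picks it up.
ZkUtils.createPersistentPath(zkClient, ZkUtils.getDeleteTopicPath("my-topic"))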

12.6.4 preferredReplicaElectionListener

class PreferredReplicaElectionListener(controller: KafkaController) extends IZkDataListener with Logging {
  this.logIdent = "[PreferredReplicaElectionListener on " + controller.config.brokerId + "]: "
  val zkClient = controller.controllerContext.zkClient
  val controllerContext = controller.controllerContext
  @throws(classOf[Exception])
  def handleDataChange(dataPath: String, data: Object) {
    debug("Preferred replica election listener fired for path %s. Record partitions to undergo preferred replica election %s"
            .format(dataPath, data.toString))
    inLock(controllerContext.controllerLock) {
      val partitionsForPreferredReplicaElection = PreferredReplicaLeaderElectionCommand.parsePreferredReplicaElectionData(data.toString)
      if(controllerContext.partitionsUndergoingPreferredReplicaElection.size > 0)
        info("These partitions are already undergoing preferred replica election: %s"
          .format(controllerContext.partitionsUndergoingPreferredReplicaElection.mkString(",")))
      // exclude partitions that are already undergoing preferred replica election
      val partitions = partitionsForPreferredReplicaElection -- controllerContext.partitionsUndergoingPreferredReplicaElection
      // filter out partitions whose topics are queued up for deletion
      val partitionsForTopicsToBeDeleted = partitions.filter(p => controller.deleteTopicManager.isTopicQueuedUpForDeletion(p.topic))
      if(partitionsForTopicsToBeDeleted.size > 0) {
        error("Skipping preferred replica election for partitions %s since the respective topics are being deleted"
          .format(partitionsForTopicsToBeDeleted))
      }
      // only the remaining partitions actually need a preferred replica election
      controller.onPreferredReplicaElection(partitions -- partitionsForTopicsToBeDeleted)
    }
  }
  @throws(classOf[Exception])
  def handleDataDeleted(dataPath: String) {
  }
}
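
The data written to /admin/preferred_replica_election by the kafka-preferred-replica-election tool, and parsed above by PreferredReplicaLeaderElectionCommand.parsePreferredReplicaElectionData, has roughly this shape (topic and partition values illustrative):

// Illustrative content of /admin/preferred_replica_election
val electionData = """{"version":1,"partitions":[{"topic":"my-topic","partition":0},{"topic":"my-topic","partition":3}]}"""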

12.6.5 partitionReassignedListener

class PartitionsReassignedListener(controller: KafkaController) extends IZkDataListener with Logging {
  this.logIdent = "[PartitionsReassignedListener on " + controller.config.brokerId + "]: "
  val zkClient = controller.controllerContext.zkClient
  val controllerContext = controller.controllerContext
  @throws(classOf[Exception])
  def handleDataChange(dataPath: String, data: Object) {
    debug("Partitions reassigned listener fired for path %s. Record partitions to be reassigned %s"
      .format(dataPath, data))
    val partitionsReassignmentData = ZkUtils.parsePartitionReassignmentData(data.toString)
    // exclude partitions that are already being reassigned
    val partitionsToBeReassigned = inLock(controllerContext.controllerLock) {
      partitionsReassignmentData.filterNot(p => controllerContext.partitionsBeingReassigned.contains(p._1))
    }
    // skip partitions whose topics are currently being deleted
    partitionsToBeReassigned.foreach { partitionToBeReassigned =>
      inLock(controllerContext.controllerLock) {
        if(controller.deleteTopicManager.isTopicQueuedUpForDeletion(partitionToBeReassigned._1.topic)) {
          error("Skipping reassignment of partition %s for topic %s since it is currently being deleted"
            .format(partitionToBeReassigned._1, partitionToBeReassigned._1.topic))
          controller.removePartitionFromReassignedPartitions(partitionToBeReassigned._1)
        } else { // kick off the actual partition reassignment
          val context = new ReassignedPartitionsContext(partitionToBeReassigned._2)
          controller.initiateReassignReplicasForTopicPartition(partitionToBeReassigned._1, context)
        }
      }
    }
  }
  @throws(classOf[Exception])
  def handleDataDeleted(dataPath: String) {
  }
}
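
The data at /admin/reassign_partitions, written by the kafka-reassign-partitions tool and parsed above by ZkUtils.parsePartitionReassignmentData, has roughly this shape (topic, partition and broker ids illustrative):

// Illustrative content of /admin/reassign_partitions: move my-topic partition 0 onto brokers 4, 5 and 6
val reassignmentData = """{"version":1,"partitions":[{"topic":"my-topic","partition":0,"replicas":[4,5,6]}]}"""
// parsePartitionReassignmentData maps this to TopicAndPartition("my-topic", 0) -> Seq(4, 5, 6)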

Partition reassignment is fairly involved, so it is worth walking through in detail. Read on:

def initiateReassignReplicasForTopicPartition(topicAndPartition: TopicAndPartition,
                                      reassignedPartitionContext: ReassignedPartitionsContext) {
  val newReplicas = reassignedPartitionContext.newReplicas
  val topic = topicAndPartition.topic
  val partition = topicAndPartition.partition
  // keep only the new replicas that are hosted on live brokers
  val aliveNewReplicas = newReplicas.filter(r => controllerContext.liveBrokerIds.contains(r))
  try {
    val assignedReplicasOpt = controllerContext.partitionReplicaAssignment.get(topicAndPartition)
    assignedReplicasOpt match {
      case Some(assignedReplicas) =>
        if(assignedReplicas == newReplicas) { // identical to the current assignment, so no reassignment is needed
          throw new KafkaException("Partition %s to be reassigned is already assigned to replicas".format(topicAndPartition) +
            " %s. Ignoring request for partition reassignment".format(newReplicas.mkString(",")))
        } else {
          if(aliveNewReplicas == newReplicas) { // all of the new replicas are on live brokers, so perform the reassignment
            info("Handling reassignment of partition %s to new replicas %s".format(topicAndPartition, newReplicas.mkString(",")))
            // register a watch on this partition's isr path to detect changes; note the listener is ReassignedPartitionsIsrChangeListener
            watchIsrChangesForReassignedPartition(topic, partition, reassignedPartitionContext)
            // mark this topic-partition as being reassigned
            controllerContext.partitionsBeingReassigned.put(topicAndPartition, reassignedPartitionContext)
            // mark the topic ineligible for deletion so it cannot be removed mid-reassignment
            deleteTopicManager.markTopicIneligibleForDeletion(Set(topic))
            // perform the actual reassignment
            onPartitionReassignment(topicAndPartition, reassignedPartitionContext)
          } else { // some of the new replicas are offline, so the reassignment fails
            // some replica in RAR is not alive. Fail partition reassignment
            throw new KafkaException("Only %s replicas out of the new set of replicas".format(aliveNewReplicas.mkString(",")) +
              " %s for partition %s to be reassigned are alive. ".format(newReplicas.mkString(","), topicAndPartition) +
              "Failing partition reassignment")
          }
        }
      // the topic-partition does not exist
      case None => throw new KafkaException("Attempt to reassign partition %s that doesn't exist"
        .format(topicAndPartition))
    }
  } catch { // on any exception, remove the partition from the reassigned partitions path
    case e: Throwable => error("Error completing reassignment of partition %s".format(topicAndPartition), e)
    // remove the partition from the admin path to unblock the admin client
    removePartitionFromReassignedPartitions(topicAndPartition)
  }
}

The core of the flow is the logic inside onPartitionReassignment:

/*
 * Terminology used below:
 * RAR = Reassigned replicas (the target replica set)
 * OAR = Original list of replicas for the partition (the replica set before reassignment)
 * AR  = current assigned replicas
 */
def onPartitionReassignment(topicAndPartition: TopicAndPartition, reassignedPartitionContext: ReassignedPartitionsContext) {
  val reassignedReplicas = reassignedPartitionContext.newReplicas
  areReplicasInIsr(topicAndPartition.topic, topicAndPartition.partition, reassignedReplicas) match {
    case false => // some new replicas are not yet in the partition's isr, i.e. they have not caught up with the leader, so first let them replicate the partition's data
      info("New replicas %s for partition %s being ".format(reassignedReplicas.mkString(","), topicAndPartition) +
        "reassigned not yet caught up with the leader")
      val newReplicasNotInOldReplicaList = reassignedReplicas.toSet -- controllerContext.partitionReplicaAssignment(topicAndPartition).toSet
      val newAndOldReplicas = (reassignedPartitionContext.newReplicas ++ controllerContext.partitionReplicaAssignment(topicAndPartition)).toSet
      // first set the partition's assigned replicas to newAndOldReplicas
      updateAssignedReplicasForPartition(topicAndPartition, newAndOldReplicas.toSeq)
      // send a LeaderAndIsrRequest to the brokers hosting these replicas
      updateLeaderEpochAndSendRequest(topicAndPartition, controllerContext.partitionReplicaAssignment(topicAndPartition),
        newAndOldReplicas.toSeq)
      // move newReplicasNotInOldReplicaList to the NewReplica state. How does the flow continue from here?
      // Recall that the KafkaController has registered a ReassignedPartitionsIsrChangeListener on
      // /brokers/topics/[topic]/partitions/[partitionId]/state; once the new replicas catch up with the partition
      // leader and the isr is updated, that listener fires. See its implementation further below.
      startNewReplicasForReassignedPartition(topicAndPartition, reassignedPartitionContext, newReplicasNotInOldReplicaList)
      info("Waiting for new replicas %s for partition %s being ".format(reassignedReplicas.mkString(","), topicAndPartition) +
        "reassigned to catch up with the leader")
    case true => // all of the new replicas have caught up with the leader
      // compute the old replicas that are no longer part of the assignment
      val oldReplicas = controllerContext.partitionReplicaAssignment(topicAndPartition).toSet -- reassignedReplicas.toSet
      // move the reassigned replicas to the OnlineReplica state
      reassignedReplicas.foreach { replica =>
        replicaStateMachine.handleStateChanges(Set(new PartitionAndReplica(topicAndPartition.topic, topicAndPartition.partition,
          replica)), OnlineReplica)
      }
      // decide whether a new leader is needed: if the current leader is in the new replica set it stays, otherwise elect a new one
      moveReassignedPartitionLeaderIfRequired(topicAndPartition, reassignedPartitionContext)
      // stop and delete the old replicas
      stopOldReplicasOfReassignedPartition(topicAndPartition, reassignedPartitionContext, oldReplicas)
      // update the partition's replica assignment in the KafkaController cache and in zk
      updateAssignedReplicasForPartition(topicAndPartition, reassignedReplicas)
      // update /admin/reassign_partitions in zk, removing this topicAndPartition from it
      removePartitionFromReassignedPartitions(topicAndPartition)
      info("Removed partition %s from the list of reassigned partitions in zookeeper".format(topicAndPartition))
      controllerContext.partitionsBeingReassigned.remove(topicAndPartition)
      // send an UpdateMetadataRequest to the brokers
      sendUpdateMetadataRequest(controllerContext.liveOrShuttingDownBrokerIds.toSeq, Set(topicAndPartition))
      // resume topic deletion, since this topic may be waiting to be deleted once the reassignment completes
      deleteTopicManager.resumeDeletionForTopics(Set(topicAndPartition.topic))
  }
}
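
A small worked example of the set arithmetic performed above (broker ids are illustrative):

// Worked example of the replica-set transitions during a reassignment (broker ids illustrative)
val OAR = Set(1, 2, 3)            // original assigned replicas
val RAR = Set(4, 5, 6)            // reassigned (target) replicas
val intermediateAR  = OAR ++ RAR  // Set(1,2,3,4,5,6): written first so the new replicas can catch up
val replicasToStart = RAR -- OAR  // Set(4,5,6): moved to the NewReplica state
val replicasToStop  = OAR -- RAR  // Set(1,2,3): stopped and deleted once all of RAR is in the isr
// Final state: AR == RAR == Set(4,5,6), and the partition is removed from /admin/reassign_partitions.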

Once the new replicas have caught up with the partition leader, the partition state at /brokers/topics/[topic]/partitions/[partitionId]/state is updated, which triggers the ReassignedPartitionsIsrChangeListener callback:

class ReassignedPartitionsIsrChangeListener(controller: KafkaController, topic: String, partition: Int,
                                            reassignedReplicas: Set[Int])
  extends IZkDataListener with Logging {
  this.logIdent = "[ReassignedPartitionsIsrChangeListener on controller " + controller.config.brokerId + "]: "
  val zkClient = controller.controllerContext.zkClient
  val controllerContext = controller.controllerContext
  @throws(classOf[Exception])
  def handleDataChange(dataPath: String, data: Object) {
    inLock(controllerContext.controllerLock) {
      debug("Reassigned partitions isr change listener fired for path %s with children %s".format(dataPath, data))
      val topicAndPartition = TopicAndPartition(topic, partition)
      try {
        controllerContext.partitionsBeingReassigned.get(topicAndPartition) match {
          case Some(reassignedPartitionContext) =>
            val newLeaderAndIsrOpt = ZkUtils.getLeaderAndIsrForPartition(zkClient, topic, partition)
            newLeaderAndIsrOpt match {
              case Some(leaderAndIsr) =>
                val caughtUpReplicas = reassignedReplicas & leaderAndIsr.isr.toSet
                if(caughtUpReplicas == reassignedReplicas) { // all reassigned replicas are in the isr, i.e. the new replicas have caught up with the partition leader
                  info("%d/%d replicas have caught up with the leader for partition %s being reassigned."
                    .format(caughtUpReplicas.size, reassignedReplicas.size, topicAndPartition) +
                    "Resuming partition reassignment")
                  // re-enter the onPartitionReassignment flow; this time
                  // areReplicasInIsr(topicAndPartition.topic, topicAndPartition.partition, reassignedReplicas) returns true
                  controller.onPartitionReassignment(topicAndPartition, reassignedPartitionContext)
                }
                else {
                  info("%d/%d replicas have caught up with the leader for partition %s being reassigned."
                    .format(caughtUpReplicas.size, reassignedReplicas.size, topicAndPartition) +
                    "Replica(s) %s still need to catch up".format((reassignedReplicas -- leaderAndIsr.isr.toSet).mkString(",")))
                }
              case None => error("Error handling reassignment of partition %s to replicas %s as it was never created"
                .format(topicAndPartition, reassignedReplicas.mkString(",")))
            }
          case None =>
        }
      } catch {
        case e: Throwable => error("Error while handling partition reassignment", e)
      }
    }
  }
  @throws(classOf[Exception])
  def handleDataDeleted(dataPath: String) {
  }
}
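
The partition state node this listener watches, /brokers/topics/[topic]/partitions/[partitionId]/state, stores the leader and isr roughly in the following form (values illustrative); ZkUtils.getLeaderAndIsrForPartition parses it:

// Illustrative .../partitions/0/state content while a reassignment to brokers 4, 5 and 6 is catching up
val partitionState = """{"controller_epoch":3,"leader":1,"version":1,"leader_epoch":7,"isr":[1,2,3,4,5,6]}"""
// Once every reassigned replica appears in "isr", the listener re-invokes onPartitionReassignment.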

12.6.6 AddPartitionsListener

class AddPartitionsListener(topic: String) extends IZkDataListener with Logging {
  this.logIdent = "[AddPartitionsListener on " + controller.config.brokerId + "]: "
  @throws(classOf[Exception])
  def handleDataChange(dataPath : String, data: Object) {
    inLock(controllerContext.controllerLock) {
      try {
        info("Add Partition triggered " + data.toString + " for path " + dataPath)
        val partitionReplicaAssignment = ZkUtils.getReplicaAssignmentForTopics(zkClient, List(topic))
        // filter out the newly added partitions
        val partitionsToBeAdded = partitionReplicaAssignment.filter(p =>
          !controllerContext.partitionReplicaAssignment.contains(p._1))
        // if the topic of the new partitions is being deleted, ignore them; otherwise create the new partitions
        if(controller.deleteTopicManager.isTopicQueuedUpForDeletion(topic))
          error("Skipping adding partitions %s for topic %s since it is currently being deleted"
                .format(partitionsToBeAdded.map(_._1.partition).mkString(","), topic))
        else {
          if (partitionsToBeAdded.size > 0) {
            info("New partitions to be added %s".format(partitionsToBeAdded))
            controller.onNewPartitionCreation(partitionsToBeAdded.keySet.toSet)
          }
        }
      } catch {
        case e: Throwable => error("Error while handling add partitions for data path " + dataPath, e )
      }
    }
  }
  @throws(classOf[Exception])
  def handleDataDeleted(parentPath : String) {
    // this is not implemented for partition change
  }
}

12.7 The KafkaController rebalance flow

So what is a rebalance? When partition leaders drift away from their preferred replicas, leadership becomes unevenly distributed across the cluster. A rebalance moves the partition leaders back to their original (preferred) placement so the load is balanced again. The process looks like this:


Topic And Partition    Leader    ISR
[topic] partition 0    1         1,2
[topic] partition 1    2         2,3
[topic] partition 2    3         3,4
[topic] partition 3    4         4,1

Each broker carries exactly one leader. When broker 4 goes offline for a while and then comes back online, the topic partitions change as follows:


Topic And Partition    Leader    ISR
[topic] partition 0    1         1,2
[topic] partition 1    2         2,3
[topic] partition 2    3         3,4
[topic] partition 3    1         4,1

Broker 1 now carries two leaders: both partition 0 and partition 3 are led by broker 1. If broker 2 then goes offline for a while and comes back, the topic partitions change again:


Topic And Partition    Leader    ISR
[topic] partition 0    1         1,2
[topic] partition 1    3         2,3
[topic] partition 2    3         3,4
[topic] partition 3    1         4,1

Now all the leaders are concentrated on broker 1 and broker 3 while the other brokers lead nothing, so producers send all their traffic to broker 1 and broker 3 and those two nodes become heavily loaded. If auto.leader.rebalance.enable=true is configured, i.e. automatic leader rebalancing is enabled, the partition leaders are migrated back towards the initial layout as far as possible:

Topic And Partition    Leader    ISR
[topic] partition 0    1         1,2
[topic] partition 1    2         2,3
[topic] partition 2    3         3,4
[topic] partition 3    4         4,1

 
This is implemented by the periodic task checkAndTriggerPartitionRebalance:
private def checkAndTriggerPartitionRebalance(): Unit = {
  if (isActive()) {
    trace("checking need to trigger partition rebalance")
    // group the partitions by their preferred leader broker (the head of the assigned replica list), excluding topics queued for deletion
    var preferredReplicasForTopicsByBrokers: Map[Int, Map[TopicAndPartition, Seq[Int]]] = null
    inLock(controllerContext.controllerLock) {
      preferredReplicasForTopicsByBrokers =
        controllerContext.partitionReplicaAssignment.filterNot(p => deleteTopicManager.isTopicQueuedUpForDeletion(p._1.topic)).groupBy {
          case(topicAndPartition, assignedReplicas) => assignedReplicas.head
        }
    }
    debug("preferred replicas by broker " + preferredReplicasForTopicsByBrokers)
    // for each broker, check if a preferred replica election needs to be triggered
    preferredReplicasForTopicsByBrokers.foreach {
      case(leaderBroker, topicAndPartitionsForBroker) => {
        var imbalanceRatio: Double = 0
        var topicsNotInPreferredReplica: Map[TopicAndPartition, Seq[Int]] = null
        inLock(controllerContext.controllerLock) {
          // keep the partitions whose current leader is not the preferred replica (the head of the assigned replica list)
          topicsNotInPreferredReplica =
            topicAndPartitionsForBroker.filter {
              case(topicPartition, replicas) => {
                controllerContext.partitionLeadershipInfo.contains(topicPartition) &&
                // leaderAndIsr.leader != leaderBroker: the current leader differs from the first broker in the original assignedReplicas
                controllerContext.partitionLeadershipInfo(topicPartition).leaderAndIsr.leader != leaderBroker
              }
            }
          debug("topics not in preferred replica " + topicsNotInPreferredReplica)
          val totalTopicPartitionsForBroker = topicAndPartitionsForBroker.size
          val totalTopicPartitionsNotLedByBroker = topicsNotInPreferredReplica.size
          // compute the imbalance ratio
          imbalanceRatio = totalTopicPartitionsNotLedByBroker.toDouble / totalTopicPartitionsForBroker
          trace("leader imbalance ratio for broker %d is %f".format(leaderBroker, imbalanceRatio))
        }
        // if the imbalance ratio exceeds the configured threshold, trigger a rebalance
        if (imbalanceRatio > (config.leaderImbalancePerBrokerPercentage.toDouble / 100)) {
          topicsNotInPreferredReplica.foreach {
            case(topicPartition, replicas) => {
              inLock(controllerContext.controllerLock) {
                if (controllerContext.liveBrokerIds.contains(leaderBroker) && // the preferred leader broker must be alive
                    controllerContext.partitionsBeingReassigned.size == 0 && // no partition reassignment is in progress, to avoid extra load
                    controllerContext.partitionsUndergoingPreferredReplicaElection.size == 0 && // no preferred replica election is already in progress
                    !deleteTopicManager.isTopicQueuedUpForDeletion(topicPartition.topic) && // the topic is not queued up for deletion
                    controllerContext.allTopics.contains(topicPartition.topic)) { // the topic still exists
                  onPreferredReplicaElection(Set(topicPartition), true) // trigger a preferred replica election for this partition
                }
              }
            }
          }
        }
      }
    }
  }
}
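
A quick worked example of the threshold check (partition counts are illustrative; leader.imbalance.per.broker.percentage is a broker config that defaults to 10):

// Worked example of the imbalance check for one broker (numbers illustrative)
val totalTopicPartitionsForBroker = 8         // partitions whose preferred (first) replica is this broker
val totalTopicPartitionsNotLedByBroker = 2    // of those, partitions currently led by some other broker
val imbalanceRatio = totalTopicPartitionsNotLedByBroker.toDouble / totalTopicPartitionsForBroker // 0.25
val leaderImbalancePerBrokerPercentage = 10   // leader.imbalance.per.broker.percentage
// 0.25 > 10 / 100.0, so a preferred replica election is triggered for the two misplaced partitions.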

12.8 The KafkaController topic deletion flow: TopicDeletionManager

In essence this just starts a DeleteTopicsThread and waits for the KafkaController to trigger a deletion:
class DeleteTopicsThread() extends ShutdownableThread(name = "delete-topics-thread-" + controller.config.brokerId, isInterruptible = false) {
  val zkClient = controllerContext.zkClient
  override def doWork() {
    awaitTopicDeletionNotification() // wait for the KafkaController to trigger a deletion
    if (!isRunning.get)
      return
    inLock(controllerContext.controllerLock) {
      val topicsQueuedForDeletion = Set.empty[String] ++ topicsToBeDeleted
      if(!topicsQueuedForDeletion.isEmpty)
        info("Handling deletion for topics " + topicsQueuedForDeletion.mkString(","))
      topicsQueuedForDeletion.foreach { topic =>
        // deletion is asynchronous; once every replica of every partition of this topic has been deleted
        if(controller.replicaStateMachine.areAllReplicasForTopicDeleted(topic)) {
          // clear all state about this topic from the KafkaController
          completeDeleteTopic(topic)
          info("Deletion of topic %s successfully completed".format(topic))
        } else { // deletion of this topic has not completed yet
          if(controller.replicaStateMachine.isAtLeastOneReplicaInDeletionStartedState(topic)) {
            // ignore since topic deletion is in progress
            val replicasInDeletionStartedState = controller.replicaStateMachine.replicasInState(topic, ReplicaDeletionStarted)
            val replicaIds = replicasInDeletionStartedState.map(_.replica)
            val partitions = replicasInDeletionStartedState.map(r => TopicAndPartition(r.topic, r.partition))
            info("Deletion for replicas %s for partition %s of topic %s in progress".format(replicaIds.mkString(","),
              partitions.mkString(","), topic))
          } else {
            // some replica deletion failed unexpectedly, so the topic deletion should be retried
            if(controller.replicaStateMachine.isAnyReplicaInState(topic, ReplicaDeletionIneligible)) {
              markTopicForDeletionRetry(topic)
            }
          }
        }
        // if the topic is eligible for deletion, start deleting it; the key step is sending a StopReplicaRequest
        // to every broker hosting the topic, telling each broker to stop fetching and delete the corresponding replica
        if(isTopicEligibleForDeletion(topic)) {
          info("Deletion of topic %s (re)started".format(topic))
          // topic deletion will be kicked off
          onTopicDeletion(Set(topic))
        } else if(isTopicIneligibleForDeletion(topic)) {
          info("Not retrying deletion of topic %s at this time since it is marked ineligible for deletion".format(topic))
        }
      }
    }
  }
}

12.9 Communication between the KafkaController (leader) and the other brokers: ControllerChannelManager

ControllerChannelManager holds a communication channel to every broker:
class ControllerChannelManager (private val controllerContext: ControllerContext, config: KafkaConfig) extends Logging {
  private val brokerStateInfo = new HashMap[Int, ControllerBrokerStateInfo]
}

Now look at the ControllerBrokerStateInfo class:
case class ControllerBrokerStateInfo(channel: BlockingChannel,
                                     broker: Broker,
                                     messageQueue: BlockingQueue[(RequestOrResponse, (RequestOrResponse) => Unit)],
                                     requestSendThread: RequestSendThread)

Its messageQueue holds the messages destined for that particular broker, each paired with a callback function; channel is the connection to the broker, and RequestSendThread is the thread that sends over it. Let's look at the RequestSendThread:
class RequestSendThread(val controllerId: Int,
                        val controllerContext: ControllerContext,
                        val toBroker: Broker,
                        val queue: BlockingQueue[(RequestOrResponse, (RequestOrResponse) => Unit)],
                        val channel: BlockingChannel)
  extends ShutdownableThread("Controller-%d-to-broker-%d-send-thread".format(controllerId, toBroker.id)) {
  private val lock = new Object()
  private val stateChangeLogger = KafkaController.stateChangeLogger
  connectToBroker(toBroker, channel)

  override def doWork(): Unit = {
    val queueItem = queue.take() // take the next request from the queue
    val request = queueItem._1
    val callback = queueItem._2
    var receive: Receive = null
    try {
      lock synchronized {
        var isSendSuccessful = false
        while(isRunning.get() && !isSendSuccessful) {
          // if a broker goes down for a long time, then at some point the controller's zookeeper listener will trigger a
          // removeBroker which will invoke shutdown() on this thread. At that point, we will stop retrying.
          try {
            channel.send(request) // send the request
            receive = channel.receive() // wait for the response
            isSendSuccessful = true
          } catch {
            case e: Throwable => // if the send was not successful, reconnect to broker and resend the message
              warn(("Controller %d epoch %d fails to send request %s to broker %s. " +
                "Reconnecting to broker.").format(controllerId, controllerContext.epoch,
                request.toString, toBroker.toString()), e)
              channel.disconnect()
              connectToBroker(toBroker, channel)
              isSendSuccessful = false
              // backoff before retrying the connection and send
              Utils.swallow(Thread.sleep(300))
          }
        }
        var response: RequestOrResponse = null
        request.requestId.get match { // deserialize the response according to the request type
          case RequestKeys.LeaderAndIsrKey =>
            response = LeaderAndIsrResponse.readFrom(receive.buffer)
          case RequestKeys.StopReplicaKey =>
            response = StopReplicaResponse.readFrom(receive.buffer)
          case RequestKeys.UpdateMetadataKey =>
            response = UpdateMetadataResponse.readFrom(receive.buffer)
        }
        stateChangeLogger.trace("Controller %d epoch %d received response %s for a request sent to broker %s"
                                  .format(controllerId, controllerContext.epoch, response.toString, toBroker.toString))
        // invoke the callback if one was provided
        if(callback != null) {
          callback(response)
        }
      }
    } catch {
      case e: Throwable =>
        error("Controller %d fails to send a request to broker %s".format(controllerId, toBroker.toString()), e)
        // If there is any socket error (eg, socket timeout), the channel is no longer usable and needs to be recreated.
        channel.disconnect()
    }
  }
}
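
How do requests end up in that per-broker queue? As a sketch (the sendRequest signature here is assumed from the 0.8.x ControllerChannelManager), the controller's send path simply offers the (request, callback) pair onto the target broker's messageQueue, which the RequestSendThread above then drains:

// Sketch of the enqueue side (signature assumed from the 0.8.x ControllerChannelManager)
def sendRequest(brokerId: Int, request: RequestOrResponse, callback: (RequestOrResponse) => Unit = null) {
  brokerStateInfo.get(brokerId) match {
    case Some(stateInfo) => stateInfo.messageQueue.put((request, callback)) // consumed by RequestSendThread.doWork
    case None => warn("Not sending request %s to broker %d, since it is offline".format(request, brokerId))
  }
}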

前面两篇文章,我们分析了Activity的布局文件加载.绘制流程,算是对整个Android系统中界面的显示流程有了一个大概的了解,其实Android系统中所有的显示控件(注意这里是控件,而不是组件)的加载绘制流程都是类似的,包括:Dialog的加载绘制流程,PopupWindow的加载绘制流程,Toast的显示原理等,上一篇文章中,我说在介绍了Activity界面的加载绘制流程之后,就会分析一下剩余几个控件的显示控制流程,这里我打算先分析一下Dialog的加载绘制流程. 可能有的同学问这里为什么