Kafka - 网络通信模型

Kafka的网络通信模型是基于Java NIO的Reactor多线程模型实现的。

从Kafka的SocketServer.scala中可以看到一段关于Kafka网络模型的说明。

/**
 * An NIO socket server. The threading model is
 *   1 Acceptor thread that handles new connections
 *   Acceptor has N Processor threads that each have their own selector and read requests from sockets
 *   M Handler threads that handle requests and produce responses back to the processor threads for writing.
 */

Kafka包含了1个Acceptor线程用来接收新连接、N个Processor线程用来处理Socket请求、M个Handler线程用来处理业务逻辑。

首先，对比一下几种NIO模型。

普通NIO
高并发NIO
Kafka NIO

接着，从源码层面来分析。

Broker启动的时候，会创建Acceptor以及Processor，并初始化KafkaApis及请求处理池。

// Create and start the socket server acceptor threads so that the bound port is known.
// Delay starting processors until the end of the initialization sequence to ensure
// that credentials have been loaded before processing authentications.
// startupProcessors = false: the acceptors are started and the port is bound here,
// but the processors are not started by this call.
socketServer = new SocketServer(config, metrics, time, credentialProvider)
socketServer.startup(startupProcessors = false)

/* start processing requests */
// KafkaApis receives the socket server's request channel together with the broker components.
apis = new KafkaApis(socketServer.requestChannel, replicaManager, adminManager, groupCoordinator, transactionCoordinator,
  kafkaController, zkClient, config.brokerId, config, metadataCache, metrics, authorizer, quotaManagers,
  fetchManager, brokerTopicStats, clusterId, time, tokenManager)

// Request handler pool sized by config.numIoThreads, sharing the same request channel.
requestHandlerPool = new KafkaRequestHandlerPool(config.brokerId, socketServer.requestChannel, apis, time, config.numIoThreads)

startup方法创建了连接数管理器，并创建Acceptor与Processor、启动Acceptor线程；只有startupProcessors为true时才会在此处启动Processor线程（上面Broker启动时传入的是false，Processor会延后启动）。

/**
 * Starts the socket server: builds the per-IP connection quota tracker, then creates the
 * acceptors and processors for every configured listener.
 *
 * @param startupProcessors when true (the default) the processors are started as part of
 *                          this call; when false they are not started here
 */
def startup(startupProcessors: Boolean = true): Unit = this.synchronized {
  // Tracks the number of connections per client IP to guard against resource overload.
  connectionQuotas = new ConnectionQuotas(maxConnectionsPerIp, maxConnectionsPerIpOverrides)
  createAcceptorAndProcessors(config.numNetworkThreads, config.listeners)
  if (startupProcessors) startProcessors()
}

/**
 * Creates one Acceptor per endpoint, attaches its processors and starts the acceptor thread.
 *
 * @param processorsPerListener number of processors passed to addProcessors for each endpoint
 * @param endpoints             the listener endpoints to open
 */
private def createAcceptorAndProcessors(processorsPerListener: Int,
                                        endpoints: Seq[EndPoint]): Unit = synchronized {

  // Read the socket settings once; every acceptor created below receives the same values.
  val sendBytes = config.socketSendBufferBytes
  val recvBytes = config.socketReceiveBufferBytes
  val broker = config.brokerId

  // One pass per configured listener, so a single broker can serve several protocols and ports.
  for (endpoint <- endpoints) {
    val listener = endpoint.listenerName
    val protocol = endpoint.securityProtocol

    // The acceptor is built with the socket buffer sizes and the shared connection quotas.
    val acceptor = new Acceptor(endpoint, sendBytes, recvBytes, broker, connectionQuotas)
    // Attach the connection processors before the acceptor thread starts.
    addProcessors(acceptor, endpoint, processorsPerListener)

    // Run the acceptor on its own non-daemon thread and wait for it to report startup
    // before registering it under its endpoint.
    val threadName = s"kafka-socket-acceptor-$listener-$protocol-${endpoint.port}"
    KafkaThread.nonDaemon(threadName, acceptor).start()
    acceptor.awaitStartup()
    acceptors.put(endpoint, acceptor)
  }
}

在Acceptor线程内部，不断循环，监听OP_ACCEPT事件，再以轮询（round-robin）的方式将新连接交给Processor去处理I/O。

/**
 * Accept loop that checks for new connection attempts.
 *
 * Registers the server channel for OP_ACCEPT, signals startup, then polls the selector
 * for as long as isRunning holds. Every acceptable key is handed to a processor picked
 * round-robin. When the loop ends the server channel and the selector are closed and
 * shutdown completion is signalled.
 */
def run() {
  serverChannel.register(nioSelector, SelectionKey.OP_ACCEPT)
  startupComplete()
  try {
    // Index of the processor that receives the next accepted connection.
    var currentProcessor = 0
    while (isRunning) {
      try {
        // Bounded wait (500 ms) so that isRunning is re-evaluated even when nothing connects.
        val ready = nioSelector.select(500)
        if (ready > 0) {
          val keys = nioSelector.selectedKeys()
          val iter = keys.iterator()
          while (iter.hasNext && isRunning) {
            try {
              val key = iter.next
              // Remove the key from the selected set so it is not seen again on the next select.
              iter.remove()
              if (key.isAcceptable) {
                // Wrap the index to the current number of processors before using it.
                val processor = synchronized {
                  currentProcessor = currentProcessor % processors.size
                  processors(currentProcessor)
                }
                  // Accept the connection and hand it to the chosen processor.
                  accept(key, processor)
              } else
                throw new IllegalStateException("Unrecognized key state for acceptor thread.")

                // Round-robin: advance to the next processor. NOTE: despite its indentation this
                // statement is not part of the else branch (the brace-less else covers only the
                // throw); it runs after a successful accept.
                currentProcessor = currentProcessor + 1
            } catch {
              // A failure on one key is logged and the remaining keys are still processed.
              case e: Throwable => error("Error while accepting connection", e)
            }
          }
        }
      }
      catch {
        // We catch all the throwables to prevent the acceptor thread from exiting on exceptions due
        // to a select operation on a specific channel or a bad request. We don't want
        // the broker to stop responding to requests from other clients in these scenarios.
        case e: ControlThrowable => throw e
        case e: Throwable => error("Error occurred", e)
      }
    }
  } finally {
    // Best-effort cleanup: each close is wrapped in swallow, then shutdown is signalled.
    debug("Closing server socket and selector.")
    CoreUtils.swallow(serverChannel.close(), this, Level.ERROR)
    CoreUtils.swallow(nioSelector.close(), this, Level.ERROR)
    shutdownComplete()
  }
}

/*
 * Accept a new connection on the server channel behind `key` and pass it to `processor`.
 *
 * The connection is first counted against the quota of its remote address; if that raises
 * TooManyConnectionsException the socket is closed instead of being handed over.
 */
def accept(key: SelectionKey, processor: Processor): Unit = {
  val listeningChannel = key.channel().asInstanceOf[ServerSocketChannel]
  // Take the pending client connection off the listening channel.
  val socketChannel = listeningChannel.accept()
  try {
    val socket = socketChannel.socket()
    // Increase the connection count for the remote address.
    connectionQuotas.inc(socket.getInetAddress)
    socketChannel.configureBlocking(false)
    socket.setTcpNoDelay(true)
    socket.setKeepAlive(true)
    // Only override the send buffer when a size was explicitly requested.
    if (sendBufferSize != Selectable.USE_DEFAULT_BUFFER_SIZE)
      socket.setSendBufferSize(sendBufferSize)

    debug("Accepted connection from %s on %s and assigned it to processor %d, sendBufferSize [actual|requested]: [%d|%d] recvBufferSize [actual|requested]: [%d|%d]"
          .format(socket.getRemoteSocketAddress, socket.getLocalSocketAddress, processor.id,
                socket.getSendBufferSize, sendBufferSize,
                socket.getReceiveBufferSize, recvBufferSize))

    // From here on the processor handles the I/O events of this channel.
    processor.accept(socketChannel)
  } catch {
    case e: TooManyConnectionsException =>
      info("Rejected connection from %s, address already has the configured maximum of %d connections.".format(e.ip, e.count))
      close(socketChannel)
  }
}

Processor的accept将socketChannel存放在ConcurrentLinkedQueue中。

/**
 * Queue up a new connection for reading: the channel is added to newConnections
 * and the processor is woken up afterwards.
 */
def accept(socketChannel: SocketChannel): Unit = {
  newConnections.add(socketChannel)
  wakeup()
}

然后由Processor线程从队列中获取连接并交给RequestChannel处理。

/**
 * Main loop of the processor thread.
 *
 * Signals startup, then repeats a fixed sequence of steps while isRunning holds. Any
 * throwable escaping a step is passed to processException so the thread keeps running.
 * On exit closeAll() is invoked (wrapped in swallow) and shutdown completion is signalled.
 *
 * The per-step notes below are translated from the original annotations; the helper
 * bodies are not visible here.
 */
override def run() {
  startupComplete() // original note: CountDownLatch
  try {
    while (isRunning) {
      try {
          // Take connections off the queue
          configureNewConnections()
          // Process the responseQueue
          processNewResponses()
          // selector.poll
          poll()
          // Process the requestQueue
          processCompletedReceives()
          // Remove from inflightResponses
          processCompletedSends()
          // Remove connections
          processDisconnected()
      } catch {
        // We catch all the throwables here to prevent the processor thread from exiting. We do this because
        // letting a processor exit might cause a bigger impact on the broker. This behavior might need to be
        // reviewed if we see an exception that needs the entire broker to stop. Usually the exceptions thrown would
        // be either associated with a specific socket channel or a bad request. These exceptions are caught and
        // processed by the individual methods above which close the failing channel and continue processing other
        // channels. So this catch block should only ever see ControlThrowables.
        case e: Throwable => processException("Processor got uncaught exception.", e)
      }
    }
  } finally {
    debug("Closing selector - processor " + id)
    CoreUtils.swallow(closeAll(), this, Level.ERROR)
    shutdownComplete()
  }
}

最后，由KafkaRequestHandlerPool实现的简单线程池启动的KafkaRequestHandler线程，不断从RequestChannel中的requestQueue获取请求，然后调用KafkaApis处理业务逻辑，再返回给RequestChannel的responseQueue。

/**
 * A simple pool of KafkaRequestHandler threads.
 *
 * Creates `numThreads` handlers at construction time, each started on its own daemon
 * thread, and supports resizing and shutdown afterwards.
 */
class KafkaRequestHandlerPool(val brokerId: Int,
                              val requestChannel: RequestChannel,
                              val apis: KafkaApis,
                              time: Time,
                              numThreads: Int) extends Logging with KafkaMetricsGroup {
  // Current size of the pool; the same counter is passed to every handler.
  private val threadPoolSize: AtomicInteger = new AtomicInteger(numThreads)
  /* a meter to track the average free capacity of the request handlers */
  private val aggregateIdleMeter = newMeter("RequestHandlerAvgIdlePercent", "percent", TimeUnit.NANOSECONDS)

  this.logIdent = "[Kafka Request Handler on Broker " + brokerId + "], "
  val runnables = new mutable.ArrayBuffer[KafkaRequestHandler](numThreads)
  (0 until numThreads).foreach(handlerId => createHandler(handlerId))

  /**
   * Appends a new handler to `runnables` and starts the handler stored at index `id`
   * on a daemon thread named after that id.
   */
  def createHandler(id: Int): Unit = synchronized {
    runnables += new KafkaRequestHandler(id, brokerId, aggregateIdleMeter, threadPoolSize, requestChannel, apis, time)
    KafkaThread.daemon("kafka-request-handler-" + id, runnables(id)).start()
  }

  /**
   * Grows or shrinks the pool to `newSize` and records the new size.
   * Growing creates handlers for the missing ids; shrinking removes and stops handlers
   * from the highest index downwards.
   */
  def resizeThreadPool(newSize: Int): Unit = synchronized {
    val currentSize = threadPoolSize.get
    info(s"Resizing request handler thread pool size from $currentSize to $newSize")
    if (newSize > currentSize) {
      (currentSize until newSize).foreach(handlerId => createHandler(handlerId))
    } else if (newSize < currentSize) {
      // Walk the surplus indices from the end: currentSize - 1 down to newSize.
      (newSize until currentSize).reverse.foreach(index => runnables.remove(index).stop())
    }
    threadPoolSize.set(newSize)
  }

  /**
   * Shuts the pool down in two passes: first every handler is asked to shut down,
   * then each one is awaited.
   */
  def shutdown(): Unit = synchronized {
    info("shutting down")
    runnables.foreach(_.initiateShutdown())
    runnables.foreach(_.awaitShutdown())
    info("shut down completely")
  }
}

/**
 * A thread that answers kafka requests.
 *
 * It repeatedly polls the request channel and passes each request to KafkaApis until it
 * is stopped via stop() or receives the shutdown request.
 */
class KafkaRequestHandler(id: Int,
                          brokerId: Int,
                          val aggregateIdleMeter: Meter,
                          val totalHandlerThreads: AtomicInteger,
                          val requestChannel: RequestChannel,
                          apis: KafkaApis,
                          time: Time) extends Runnable with Logging {
  this.logIdent = "[Kafka Request Handler " + id + " on Broker " + brokerId + "], "
  // Released once the run loop has finished, whichever way it ends.
  private val shutdownComplete = new CountDownLatch(1)
  @volatile private var stopped = false

  def run(): Unit = {
    while (!stopped) {
      // We use a single meter for aggregate idle percentage for the thread pool.
      // Since meter is calculated as total_recorded_value / time_window and
      // time_window is independent of the number of threads, each recorded idle
      // time should be discounted by # threads.
      val pollStartNanos = time.nanoseconds
      // Fetch the next request, waiting at most 300 (the timeout passed to receiveRequest).
      val polled = requestChannel.receiveRequest(300)
      val pollEndNanos = time.nanoseconds
      aggregateIdleMeter.mark((pollEndNanos - pollStartNanos) / totalHandlerThreads.get)

      polled match {
        case null => // nothing was returned by this poll; go around again

        case RequestChannel.ShutdownRequest =>
          debug(s"Kafka request handler $id on broker $brokerId received shut down command")
          shutdownComplete.countDown()
          return

        case request: RequestChannel.Request =>
          try {
            request.requestDequeueTimeNanos = pollEndNanos
            trace(s"Kafka request handler $id on broker $brokerId handling request $request")
            // KafkaApis carries out the actual request handling.
            apis.handle(request)
          } catch {
            case e: FatalExitError =>
              shutdownComplete.countDown()
              Exit.exit(e.statusCode)
            case e: Throwable => error("Exception when handling request", e)
          } finally {
            // The request buffer is released whether handling succeeded or failed.
            request.releaseBuffer()
          }
      }
    }
    shutdownComplete.countDown()
  }

  /** Sets the stop flag; the run loop exits the next time it checks the flag. */
  def stop(): Unit = {
    stopped = true
  }

  /** Asks the request channel to send the shutdown request. */
  def initiateShutdown(): Unit = requestChannel.sendShutdownRequest()

  /** Blocks until the run loop has counted down the shutdown latch. */
  def awaitShutdown(): Unit = shutdownComplete.await()

}

KafkaApis的handle方法逻辑。

/**
  * Top-level method that handles all requests and multiplexes to the right api.
  *
  * Dispatches on request.header.apiKey to the matching handleXxx method. A FatalExitError
  * is rethrown to the caller; any other throwable is passed to handleError together with
  * the request. request.apiLocalCompleteTimeNanos is always set in the finally block,
  * whether handling succeeded or failed.
  */
 def handle(request: RequestChannel.Request) {
   try {
     trace(s"Handling request:${request.requestDesc(true)} from connection ${request.context.connectionId};" +
       s"securityProtocol:${request.context.securityProtocol},principal:${request.context.principal}")
     // There is no default case: an api key not listed below would raise a MatchError,
     // which the Throwable handler further down routes to handleError.
     request.header.apiKey match {
       case ApiKeys.PRODUCE => handleProduceRequest(request)
       case ApiKeys.FETCH => handleFetchRequest(request)
       case ApiKeys.LIST_OFFSETS => handleListOffsetRequest(request)
       case ApiKeys.METADATA => handleTopicMetadataRequest(request)
       case ApiKeys.LEADER_AND_ISR => handleLeaderAndIsrRequest(request)
       case ApiKeys.STOP_REPLICA => handleStopReplicaRequest(request)
       case ApiKeys.UPDATE_METADATA => handleUpdateMetadataRequest(request)
       case ApiKeys.CONTROLLED_SHUTDOWN => handleControlledShutdownRequest(request)
       case ApiKeys.OFFSET_COMMIT => handleOffsetCommitRequest(request)
       case ApiKeys.OFFSET_FETCH => handleOffsetFetchRequest(request)
       case ApiKeys.FIND_COORDINATOR => handleFindCoordinatorRequest(request)
       case ApiKeys.JOIN_GROUP => handleJoinGroupRequest(request)
       case ApiKeys.HEARTBEAT => handleHeartbeatRequest(request)
       case ApiKeys.LEAVE_GROUP => handleLeaveGroupRequest(request)
       case ApiKeys.SYNC_GROUP => handleSyncGroupRequest(request)
       case ApiKeys.DESCRIBE_GROUPS => handleDescribeGroupRequest(request)
       case ApiKeys.LIST_GROUPS => handleListGroupsRequest(request)
       case ApiKeys.SASL_HANDSHAKE => handleSaslHandshakeRequest(request)
       case ApiKeys.API_VERSIONS => handleApiVersionsRequest(request)
       case ApiKeys.CREATE_TOPICS => handleCreateTopicsRequest(request)
       case ApiKeys.DELETE_TOPICS => handleDeleteTopicsRequest(request)
       case ApiKeys.DELETE_RECORDS => handleDeleteRecordsRequest(request)
       case ApiKeys.INIT_PRODUCER_ID => handleInitProducerIdRequest(request)
       case ApiKeys.OFFSET_FOR_LEADER_EPOCH => handleOffsetForLeaderEpochRequest(request)
       case ApiKeys.ADD_PARTITIONS_TO_TXN => handleAddPartitionToTxnRequest(request)
       case ApiKeys.ADD_OFFSETS_TO_TXN => handleAddOffsetsToTxnRequest(request)
       case ApiKeys.END_TXN => handleEndTxnRequest(request)
       case ApiKeys.WRITE_TXN_MARKERS => handleWriteTxnMarkersRequest(request)
       case ApiKeys.TXN_OFFSET_COMMIT => handleTxnOffsetCommitRequest(request)
       case ApiKeys.DESCRIBE_ACLS => handleDescribeAcls(request)
       case ApiKeys.CREATE_ACLS => handleCreateAcls(request)
       case ApiKeys.DELETE_ACLS => handleDeleteAcls(request)
       case ApiKeys.ALTER_CONFIGS => handleAlterConfigsRequest(request)
       case ApiKeys.DESCRIBE_CONFIGS => handleDescribeConfigsRequest(request)
       case ApiKeys.ALTER_REPLICA_LOG_DIRS => handleAlterReplicaLogDirsRequest(request)
       case ApiKeys.DESCRIBE_LOG_DIRS => handleDescribeLogDirsRequest(request)
       case ApiKeys.SASL_AUTHENTICATE => handleSaslAuthenticateRequest(request)
       case ApiKeys.CREATE_PARTITIONS => handleCreatePartitionsRequest(request)
       case ApiKeys.CREATE_DELEGATION_TOKEN => handleCreateTokenRequest(request)
       case ApiKeys.RENEW_DELEGATION_TOKEN => handleRenewTokenRequest(request)
       case ApiKeys.EXPIRE_DELEGATION_TOKEN => handleExpireTokenRequest(request)
       case ApiKeys.DESCRIBE_DELEGATION_TOKEN => handleDescribeTokensRequest(request)
       case ApiKeys.DELETE_GROUPS => handleDeleteGroupsRequest(request)
     }
   } catch {
     // Fatal errors propagate to the caller; everything else is handled per request.
     case e: FatalExitError => throw e
     case e: Throwable => handleError(request, e)
   } finally {
     // Record when local handling of this request finished.
     request.apiLocalCompleteTimeNanos = time.nanoseconds
   }
 }

每个业务处理完成后，结果都会通过RequestChannel交给对应Processor的responseQueue，再由Processor写回客户端。

/**
 * Wraps the optional response for `request` in a RequestChannel.Response and passes it
 * to the request channel.
 *
 * With a response present the error metrics are updated, the response is built for
 * sending and a SendAction response is created; without one a NoOpAction response
 * with no payload is created instead.
 */
private def sendResponse(request: RequestChannel.Request, responseOpt: Option[AbstractResponse]): Unit = {
  val outgoing = responseOpt match {
    case Some(response) =>
      // Update error metrics for each error code in the response including Errors.NONE
      requestChannel.updateErrorMetrics(request.header.apiKey, response.errorCounts.asScala)
      val responseSend = request.context.buildResponse(response)
      // The textual form is only produced when request logging is enabled.
      val loggedForm =
        if (RequestChannel.isRequestLoggingEnabled) Some(response.toString(request.context.apiVersion))
        else None
      new RequestChannel.Response(request, Some(responseSend), SendAction, loggedForm)

    case None =>
      new RequestChannel.Response(request, None, NoOpAction, None)
  }
  requestChannel.sendResponse(outgoing)
}

/**
 * Send a response back to the socket server to be sent over the network.
 *
 * When trace logging is enabled a message describing the response action is logged first.
 * The response is then enqueued on the processor registered under response.processor.
 */
def sendResponse(response: RequestChannel.Response): Unit = {
  if (isTraceEnabled) {
    val header = response.request.header
    val description = response.responseAction match {
      case SendAction =>
        s"Sending ${header.apiKey} response to client ${header.clientId} of ${response.responseSend.get.size} bytes."
      case NoOpAction =>
        s"Not sending ${header.apiKey} response to client ${header.clientId} as it's not required."
      case CloseConnectionAction =>
        s"Closing connection for client ${header.clientId} due to error during ${header.apiKey}."
    }
    trace(description)
  }

  // The processor may be null if it was shutdown. In this case, the connections
  // are closed, so the response is dropped.
  Option(processors.get(response.processor)).foreach(_.enqueueResponse(response))
}

/**
 * Puts the response on this processor's responseQueue and then wakes the processor up.
 *
 * @param response the response to enqueue
 */
private[network] def enqueueResponse(response: RequestChannel.Response): Unit = {
  responseQueue.put(response)
  // Wake up only after the response is queued.
  wakeup()
}