Hadoop Source Code Analysis: the NameNode regular Startup Process, Part 2


Reference: http://www.cnblogs.com/ggjucheng/archive/2013/02/04/2889386.html

1. StringUtils.startupShutdownMessage(NameNode.class, argv, LOG);

Arguments:

public static final Log LOG = LogFactory.getLog(NameNode.class.getName());

public static void startupShutdownMessage(Class<?> clazz, String[] args,
                                          final org.apache.commons.logging.Log LOG) {

    final String hostname = getHostname(); // via InetAddress.getLocalHost(); note how liberally this code declares throws Exception
    final String classname = clazz.getSimpleName();
    LOG.info(
        toStartupShutdownString("STARTUP_MSG: ", new String[] {
            "Starting " + classname,
            "  host = " + hostname,
            "  args = " + Arrays.asList(args),
            "  version = " + VersionInfo.getVersion(), //qes:需要分析下(何时初始化)
            "  build = " + VersionInfo.getUrl() + " -r "
                         + VersionInfo.getRevision() 
                         + "; compiled by '" + VersionInfo.getUser()
                         + "' on " + VersionInfo.getDate()}
        )
      );

    Runtime.getRuntime().addShutdownHook(new Thread() { // key: JVM shutdown hook
      public void run() {
        LOG.info(toStartupShutdownString("SHUTDOWN_MSG: ", new String[]{
          "Shutting down " + classname + " at " + hostname}));
      }
    });

}

 

private static String toStartupShutdownString(String prefix, String [] msg) {
    StringBuffer b = new StringBuffer(prefix);
    b.append("\n/************************************************************");
    for(String s : msg)
      b.append("\n" + prefix + s);
    b.append("\n************************************************************/");
    return b.toString();
  }
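For the NameNode, this renders as the familiar startup banner. Tracing through toStartupShutdownString with the argument array above, the LOG.info output comes out as follows (angle-bracketed values are filled in from the host and VersionInfo at runtime):

STARTUP_MSG: 
/************************************************************
STARTUP_MSG: Starting NameNode
STARTUP_MSG:   host = <hostname>
STARTUP_MSG:   args = []
STARTUP_MSG:   version = <VersionInfo.getVersion()>
STARTUP_MSG:   build = <url> -r <revision>; compiled by '<user>' on <date>
************************************************************/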

2. NameNode.createNameNode

 public static NameNode createNameNode(String argv[],
                                 Configuration conf) throws IOException {
    if (conf == null)
      conf = new Configuration();
    StartupOption startOpt = parseArguments(argv);
    if (startOpt == null) {
      printUsage();
      return null;
    }
    setStartupOption(conf, startOpt);

    switch (startOpt) {
      case FORMAT: // formatting is required the first time the NameNode starts, or to re-initialize an existing NameNode
        boolean aborted = format(conf, true);
        System.exit(aborted ? 1 : 0);
      case FINALIZE: // finalize a Hadoop upgrade by deleting the backup
        aborted = finalize(conf, true);
        System.exit(aborted ? 1 : 0);
      default:
    }
    DefaultMetricsSystem.initialize("NameNode"); // qes: to be studied

    NameNode namenode = new NameNode(conf);
    return namenode;
  }

2.1 setStartupOption

private static void setStartupOption(Configuration conf, StartupOption opt) {
    conf.set("dfs.namenode.startup", opt.toString());
  }
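setStartupOption stashes the chosen option in the Configuration so later code can read it back. Its counterpart, called later in 2.3.1 as NameNode.getStartupOption(conf), looks essentially like this (quoted from memory, so treat it as a sketch rather than the verbatim source):

static StartupOption getStartupOption(Configuration conf) {
    // read back the value written by setStartupOption; default to REGULAR
    return StartupOption.valueOf(conf.get("dfs.namenode.startup",
                                          StartupOption.REGULAR.toString()));
}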

2.2 DefaultMetricsSystem.initialize

DefaultMetricsSystem.initialize("NameNode"); // qes: to be studied. Console output:

13/10/24 21:32:22 INFO impl.MetricsConfig: loaded properties from hadoop-metrics2.properties
13/10/24 21:32:22 INFO impl.MetricsSourceAdapter: MBean for source MetricsSystem,sub=Stats registered.
13/10/24 21:32:22 INFO impl.MetricsSystemImpl: Scheduled snapshot period at 10 second(s).
13/10/24 21:32:22 INFO impl.MetricsSystemImpl: NameNode metrics system started


2.3 new NameNode(conf)

 NameNode namenode = new NameNode(conf);

->

public NameNode(Configuration conf) throws IOException {
    try {
      initialize(conf);
    } catch (IOException e) {
      this.stop(); // key: to be studied
      throw e;
    }
  }

->

/**
   * Initialize name-node.
   *
   * @param conf the configuration
   */
  private void initialize(Configuration conf) throws IOException {
    InetSocketAddress socAddr = NameNode.getAddress(conf);
    UserGroupInformation.setConfiguration(conf);
    SecurityUtil.login(conf, DFSConfigKeys.DFS_NAMENODE_KEYTAB_FILE_KEY,
        DFSConfigKeys.DFS_NAMENODE_USER_NAME_KEY, socAddr.getHostName());
    int handlerCount = conf.getInt("dfs.namenode.handler.count", 10);
   
    // set service-level authorization security policy
    if (serviceAuthEnabled =
          conf.getBoolean(
            ServiceAuthorizationManager.SERVICE_AUTHORIZATION_CONFIG, false)) {
      ServiceAuthorizationManager.refresh(conf, new HDFSPolicyProvider());
    }
   
    myMetrics = NameNodeInstrumentation.create(conf);

    // load the metadata from the fsimage and the edits log
    this.namesystem = new FSNamesystem(this, conf);

    if (UserGroupInformation.isSecurityEnabled()) {
      namesystem.activateSecretManager();
    }

    // create rpc server
    InetSocketAddress dnSocketAddr = getServiceRpcServerAddress(conf);
    if (dnSocketAddr != null) {
      int serviceHandlerCount =
        conf.getInt(DFSConfigKeys.DFS_NAMENODE_SERVICE_HANDLER_COUNT_KEY,
                    DFSConfigKeys.DFS_NAMENODE_SERVICE_HANDLER_COUNT_DEFAULT);
      this.serviceRpcServer = RPC.getServer(this, dnSocketAddr.getHostName(),
          dnSocketAddr.getPort(), serviceHandlerCount,
          false, conf, namesystem.getDelegationTokenSecretManager());
      this.serviceRPCAddress = this.serviceRpcServer.getListenerAddress();
      setRpcServiceServerAddress(conf);
    }

    // create the RPC server; the default handler-thread count is 10, the default port 8020
    this.server = RPC.getServer(this, socAddr.getHostName(),
        socAddr.getPort(), handlerCount, false, conf, namesystem
        .getDelegationTokenSecretManager());

    // The rpc-server port can be ephemeral... ensure we have the correct info
    this.serverAddress = this.server.getListenerAddress();
    FileSystem.setDefaultUri(conf, getUri(serverAddress));
    LOG.info("Namenode up at: " + this.serverAddress);

   

    startHttpServer(conf); // start the HTTP server; once up, the HDFS admin page is reachable at http://namenode:50070
    this.server.start();  //start RPC server  
    if (serviceRpcServer != null) {
      serviceRpcServer.start();     
    }
    startTrashEmptier(conf); // start the trash-emptier thread, which permanently deletes expired trashed files
  }

 

The most complex part of the NameNode startup flow is the initialization of FSNamesystem; this class carries the core startup logic, while the rest of the startup code is straightforward enough to read on its own.

Since org.apache.hadoop.hdfs.server.namenode.FSNamesystem underpins essentially all of the services the NameNode provides, its implementation is predictably complex. Its key members:

public class FSNamesystem {

  // the file-system directory tree
  public FSDirectory dir;

  // BlocksMap maps each Block to its metadata: the inode the block belongs to
  // and the DataNodes that store it
  final BlocksMap blocksMap = new BlocksMap(DEFAULT_INITIAL_MAP_CAPACITY, DEFAULT_MAP_LOAD_FACTOR);

  // map of corrupt block replicas
  public CorruptReplicasMap corruptReplicas = new CorruptReplicasMap();

  // DataNode -> blocks mapping
  NavigableMap<String, DatanodeDescriptor> datanodeMap = new TreeMap<String, DatanodeDescriptor>();

  // subset of datanodeMap containing only the DatanodeDescriptors considered alive;
  // the HeartbeatMonitor periodically evicts expired entries
  ArrayList<DatanodeDescriptor> heartbeats = new ArrayList<DatanodeDescriptor>();

  // blocks whose replica count is below target; managed through a priority queue
  // so the most under-replicated blocks are handled first
  private UnderReplicatedBlocks neededReplications = new UnderReplicatedBlocks();

  // blocks whose replication has been scheduled but has not completed yet
  private PendingReplicationBlocks pendingReplications;

  // manages leases on files
  public LeaseManager leaseManager = new LeaseManager(this);

  Daemon hbthread = null;   // periodically calls FSNamesystem.heartbeatCheck to monitor DataNode heartbeats and act on them
  public Daemon lmthread = null;   // LeaseMonitor thread
  Daemon smmthread = null;  // periodically checks whether the conditions for leaving safe mode are met (i.e. the threshold has been reached), so it must be started only after safe mode has been entered
  public Daemon replthread = null;  // periodically does two things: computes replication work and schedules it onto DataNodes, and processes replicas whose pipelined replication has not completed

  private ReplicationMonitor replmon = null; // Replication metrics

  // maps a DataNode host to its array of DatanodeDescriptors
  private Host2NodesMap host2DataNodeMap = new Host2NodesMap();

  // models the cluster as a tree-shaped network topology, e.g. a cluster made up
  // of several data centers, each holding many racks of machines
  NetworkTopology clusterMap = new NetworkTopology();

  // pluggable resolver that maps a DNS name or IP address to a rack ID
  private DNSToSwitchMapping dnsToSwitchMapping;

  // chooses the target locations for placing block replicas
  ReplicationTargetChooser replicator;

  // tracks which DataNodes may and may not connect to the NameNode,
  // as recorded in the configured include/exclude lists
  private HostsFileReader hostsReader;
}

 

2.3.1 FSNamesystem

/**
   * FSNamesystem constructor.
   */
  FSNamesystem(NameNode nn, Configuration conf) throws IOException {
    try {
      initialize(nn, conf);
    } catch(IOException e) {
      LOG.error(getClass().getSimpleName() + " initialization failed.", e);
      close();
      throw e;
    }
  }

 /**
   * Initialize FSNamesystem.
   */
  private void initialize(NameNode nn, Configuration conf) throws IOException {
    this.systemStart = now();
    setConfigurationParameters(conf);
    dtSecretManager = createDelegationTokenSecretManager(conf);

    this.nameNodeAddress = nn.getNameNodeAddress();
    this.registerMBean(conf); // register the MBean for the FSNamesystemStatus // key
    this.dir = new FSDirectory(this, conf);
    StartupOption startOpt = NameNode.getStartupOption(conf);
    this.dir.loadFSImage(getNamespaceDirs(conf),
                         getNamespaceEditsDirs(conf), startOpt);
    long timeTakenToLoadFSImage = now() - systemStart;
    LOG.info("Finished loading FSImage in " + timeTakenToLoadFSImage + " msecs");
    NameNode.getNameNodeMetrics().setFsImageLoadTime(timeTakenToLoadFSImage);
    this.safeMode = new SafeModeInfo(conf);
    setBlockTotal();
    pendingReplications = new PendingReplicationBlocks(
                            conf.getInt("dfs.replication.pending.timeout.sec",
                                        -1) * 1000L);
    if (isAccessTokenEnabled) {
      accessTokenHandler = new BlockTokenSecretManager(true,
          accessKeyUpdateInterval, accessTokenLifetime);
    }
    this.hbthread = new Daemon(new HeartbeatMonitor()); // background thread monitoring DataNode heartbeats
    this.lmthread = new Daemon(leaseManager.new Monitor()); // background thread managing file leases
    this.replmon = new ReplicationMonitor();
    this.replthread = new Daemon(replmon); // handles replicas whose pipelined replication has not completed
    hbthread.start();
    lmthread.start();
    replthread.start();

    // read the DataNode include/exclude (whitelist/blacklist) files from the configuration
    this.hostsReader = new HostsFileReader(conf.get("dfs.hosts",""),
                                           conf.get("dfs.hosts.exclude",""));

    // handle decommissioning nodes; their blocks are generally migrated elsewhere
    this.dnthread = new Daemon(new DecommissionManager(this).new Monitor(
        conf.getInt("dfs.namenode.decommission.interval", 30),
        conf.getInt("dfs.namenode.decommission.nodes.per.interval", 5)));
    dnthread.start();

    this.dnsToSwitchMapping = ReflectionUtils.newInstance(
        conf.getClass("topology.node.switch.mapping.impl", ScriptBasedMapping.class,
            DNSToSwitchMapping.class), conf);
   
    /* If the dns to switch mapping supports cache, resolve network
     * locations of those hosts in the include list,
     * and store the mapping in the cache; so future calls to resolve
     * will be fast.
     */
    if (dnsToSwitchMapping instanceof CachedDNSToSwitchMapping) {
      dnsToSwitchMapping.resolve(new ArrayList<String>(hostsReader.getHosts()));
    }
   
    InetSocketAddress socAddr = NameNode.getAddress(conf);
    this.nameNodeHostName = socAddr.getHostName();
   
    registerWith(DefaultMetricsSystem.INSTANCE);
  }

2.3.1.2 setConfigurationParameters

/**
   * Initializes some of the members from configuration
   */
  private void setConfigurationParameters(Configuration conf)
                                          throws IOException {
    fsNamesystemObject = this;
    fsOwner = UserGroupInformation.getCurrentUser(); // key: to be studied
    LOG.info("fsOwner=" + fsOwner);

    this.supergroup = conf.get("dfs.permissions.supergroup", "supergroup");
    this.isPermissionEnabled = conf.getBoolean("dfs.permissions", true);
    LOG.info("supergroup=" + supergroup);
    LOG.info("isPermissionEnabled=" + isPermissionEnabled);
    short filePermission = (short)conf.getInt("dfs.upgrade.permission", 0777);
    this.defaultPermission = PermissionStatus.createImmutable(
        fsOwner.getShortUserName(), supergroup, new FsPermission(filePermission));


    this.replicator = new ReplicationTargetChooser(
                         conf.getBoolean("dfs.replication.considerLoad", true),
                         this,
                         clusterMap);
    this.defaultReplication = conf.getInt("dfs.replication", 3);
    this.maxReplication = conf.getInt("dfs.replication.max", 512);
    this.minReplication = conf.getInt("dfs.replication.min", 1);
    if (minReplication <= 0)
      throw new IOException(
                            "Unexpected configuration parameters: dfs.replication.min = "
                            + minReplication
                            + " must be greater than 0");
    if (maxReplication >= (int)Short.MAX_VALUE)
      throw new IOException(
                            "Unexpected configuration parameters: dfs.replication.max = "
                            + maxReplication + " must be less than " + (Short.MAX_VALUE));
    if (maxReplication < minReplication)
      throw new IOException(
                            "Unexpected configuration parameters: dfs.replication.min = "
                            + minReplication
                            + " must be less than dfs.replication.max = "
                            + maxReplication);
    this.maxReplicationStreams = conf.getInt("dfs.max-repl-streams", 2);
    long heartbeatInterval = conf.getLong("dfs.heartbeat.interval", 3) * 1000;
    this.heartbeatRecheckInterval = conf.getInt(
        "heartbeat.recheck.interval", 5 * 60 * 1000); // 5 minutes
    this.heartbeatExpireInterval = 2 * heartbeatRecheckInterval +
      10 * heartbeatInterval;
    this.replicationRecheckInterval =
      conf.getInt("dfs.replication.interval", 3) * 1000L;
    this.defaultBlockSize = conf.getLong("dfs.block.size", DEFAULT_BLOCK_SIZE);
    this.maxFsObjects = conf.getLong("dfs.max.objects", 0);

    //default limit
    this.blockInvalidateLimit = Math.max(this.blockInvalidateLimit,
                                         20*(int)(heartbeatInterval/1000));
    //use conf value if it is set.
    this.blockInvalidateLimit = conf.getInt(
        DFSConfigKeys.DFS_BLOCK_INVALIDATE_LIMIT_KEY, this.blockInvalidateLimit);
    LOG.info(DFSConfigKeys.DFS_BLOCK_INVALIDATE_LIMIT_KEY + "=" + this.blockInvalidateLimit);

    this.accessTimePrecision = conf.getLong("dfs.access.time.precision", 0);
    this.supportAppends = conf.getBoolean("dfs.support.append", false);
    this.isAccessTokenEnabled = conf.getBoolean(
        DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_ENABLE_KEY, false);
    if (isAccessTokenEnabled) {
      this.accessKeyUpdateInterval = conf.getLong(
          DFSConfigKeys.DFS_BLOCK_ACCESS_KEY_UPDATE_INTERVAL_KEY, 600) * 60 * 1000L; // 10 hrs
      this.accessTokenLifetime = conf.getLong(
          DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_LIFETIME_KEY, 600) * 60 * 1000L; // 10 hrs
    }
    LOG.info("isAccessTokenEnabled=" + isAccessTokenEnabled
        + " accessKeyUpdateInterval=" + accessKeyUpdateInterval / (60 * 1000)
        + " min(s), accessTokenLifetime=" + accessTokenLifetime / (60 * 1000)
        + " min(s)");
  }

Console output:

13/10/24 22:59:35 INFO namenode.FSNamesystem: fsOwner=user   (the Linux user)
13/10/24 22:59:35 INFO namenode.FSNamesystem: supergroup=supergroup   (?)
13/10/24 22:59:35 INFO namenode.FSNamesystem: isPermissionEnabled=true   (?)
13/10/24 22:59:35 INFO namenode.FSNamesystem: dfs.block.invalidate.limit=100   (?)
13/10/24 22:59:35 INFO namenode.FSNamesystem: isAccessTokenEnabled=false accessKeyUpdateInterval=0 min(s), accessTokenLifetime=0 min(s)
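One derived value in setConfigurationParameters deserves a quick sanity check: heartbeatExpireInterval, the length of silence after which a DataNode is declared dead. A standalone sketch using the default values above (not NameNode code):

public class HeartbeatExpiryDemo {
  public static void main(String[] args) {
    long heartbeatInterval = 3L * 1000;            // dfs.heartbeat.interval default: 3 s
    long heartbeatRecheckInterval = 5 * 60 * 1000; // heartbeat.recheck.interval default: 5 min
    // Same formula as in setConfigurationParameters:
    long expire = 2 * heartbeatRecheckInterval + 10 * heartbeatInterval;
    System.out.println(expire + " ms");            // 630000 ms, i.e. 10.5 minutes
  }
}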

2.3.1.3 createDelegationTokenSecretManager

  /*
   * Delegation Token (qes: to be studied)
   */
 
  private DelegationTokenSecretManager createDelegationTokenSecretManager(
      Configuration conf) {
    return new DelegationTokenSecretManager(conf.getLong(
        "dfs.namenode.delegation.key.update-interval", 24*60*60*1000),
        conf.getLong(
            "dfs.namenode.delegation.token.max-lifetime", 7*24*60*60*1000),
        conf.getLong(
            "dfs.namenode.delegation.token.renew-interval", 24*60*60*1000),
        DELEGATION_TOKEN_REMOVER_SCAN_INTERVAL, this);
  }
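With no overrides, then: the delegation master key is rolled every 24 hours, tokens must be renewed at least every 24 hours, and no token can outlive 7 days; all three numbers come straight from the defaults in the constructor call above.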

2.3.1.4

2.3.1.5 FSDirectory

The FSDirectory class stores the state of the file-system directory tree. It handles writing data to and loading it from disk, and journals every change to the directory contents. It maintains an up-to-date filename -> blockset mapping and persists it to disk. Most of its heavy lifting is done by its FSImage fsImage member.

this.dir = new FSDirectory(this, conf);

FSDirectory

 /** Access an existing dfs name directory. */
  FSDirectory(FSNamesystem ns, Configuration conf) {
    this(new FSImage(), ns, conf);
    fsImage.setCheckpointDirectories(FSImage.getCheckpointDirs(conf, null),
                                FSImage.getCheckpointEditsDirs(conf, null));
  }

-->

FSDirectory(FSImage fsImage, FSNamesystem ns, Configuration conf) {
    rootDir = new INodeDirectoryWithQuota(INodeDirectory.ROOT_NAME,
        ns.createFsOwnerPermissions(new FsPermission((short)0755)),   //    "":user:supergroup:rwxr-xr-x
        Integer.MAX_VALUE, -1);
    this.fsImage = fsImage;
    fsImage.setRestoreRemovedDirs(conf.getBoolean(
        DFSConfigKeys.DFS_NAMENODE_NAME_DIR_RESTORE_KEY,
        DFSConfigKeys.DFS_NAMENODE_NAME_DIR_RESTORE_DEFAULT));
    namesystem = ns;
    int configuredLimit = conf.getInt(
        DFSConfigKeys.DFS_LIST_LIMIT, DFSConfigKeys.DFS_LIST_LIMIT_DEFAULT);
    this.lsLimit = configuredLimit>0 ?
        configuredLimit : DFSConfigKeys.DFS_LIST_LIMIT_DEFAULT;
   
    int threshold = conf.getInt(
        DFSConfigKeys.DFS_NAMENODE_NAME_CACHE_THRESHOLD_KEY,
        DFSConfigKeys.DFS_NAMENODE_NAME_CACHE_THRESHOLD_DEFAULT);
    NameNode.LOG.info("Caching file names occuring more than " + threshold
        + " times ");
    nameCache = new NameCache<ByteArray>(threshold);

  }

Observed checkpoint directories in a debug session:

FSImage.getCheckpointDirs(conf, null)        ->  [/usr/local/hadoop/tmp/dfs/namesecondary]
FSImage.getCheckpointEditsDirs(conf, null)   ->  [/usr/local/hadoop/tmp/dfs/namesecondary]

2.3.1.6 Loading the FSImage

 

2.3.1.6.1 getNamespaceDirs and getNamespaceEditsDirs

// After FSNamesystem finishes initializing its FSDirectory dir member, it calls loadFSImage to load the metadata from fsimage and edits

FSNamesystem: this.dir.loadFSImage(getNamespaceDirs(conf), getNamespaceEditsDirs(conf), startOpt);

 

public static Collection<File> getNamespaceDirs(Configuration conf) {
    Collection<String> dirNames = conf.getStringCollection("dfs.name.dir");  //[/usr/local/hadoop/tmp/dfs/name]
    if (dirNames.isEmpty())
      dirNames.add("/tmp/hadoop/dfs/name");
    Collection<File> dirs = new ArrayList<File>(dirNames.size());
    for(String name : dirNames) {
      dirs.add(new File(name));
    }
    return dirs;
  }

->

/**
   * Get the comma delimited values of the <code>name</code> property as
   * a collection of <code>String</code>s. 
   * If no such property is specified then empty collection is returned.
   * <p>
   * This is an optimized version of {@link #getStrings(String)}
   *
   * @param name property name.
   * @return property value as a collection of <code>String</code>s.
   */
  public Collection<String> getStringCollection(String name) {
    String valueString = get(name);  // /usr/local/hadoop/tmp/dfs/name
    return StringUtils.getStringCollection(valueString);
  }

A closer look at StringUtils.java:

/**
   * Returns a collection of strings.
   * @param str comma separated string values
   * @return an <code>ArrayList</code> of string values
   */
  public static Collection<String> getStringCollection(String str){
    List<String> values = new ArrayList<String>();
    if (str == null)
      return values;
    StringTokenizer tokenizer = new StringTokenizer (str,",");
    values = new ArrayList<String>();
    while (tokenizer.hasMoreTokens()) {
      values.add(tokenizer.nextToken());
    }
    return values;
  }
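A quick standalone check of the splitting behaviour (the multi-directory value is hypothetical):

import java.util.Collection;
import org.apache.hadoop.util.StringUtils;

public class SplitDemo {
  public static void main(String[] args) {
    // A hypothetical dfs.name.dir value listing two redundant name directories:
    Collection<String> dirs =
        StringUtils.getStringCollection("/data/1/name,/data/2/name");
    System.out.println(dirs); // [/data/1/name, /data/2/name]
  }
}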

 

public static Collection<File> getNamespaceEditsDirs(Configuration conf) {
    Collection<String> editsDirNames =
            conf.getStringCollection("dfs.name.edits.dir");  //[/usr/local/hadoop/tmp/dfs/name]
    if (editsDirNames.isEmpty())
      editsDirNames.add("/tmp/hadoop/dfs/name");
    Collection<File> dirs = new ArrayList<File>(editsDirNames.size());
    for(String name : editsDirNames) {
      dirs.add(new File(name));
    }
    return dirs;
  }

2.3.1.6.2 FSDirectory.loadFSImage

void loadFSImage(Collection<File> dataDirs,
                   Collection<File> editsDirs,
                   StartupOption startOpt) throws IOException {
    // format before starting up if requested
    if (startOpt == StartupOption.FORMAT) { // if the startup option is FORMAT, format before starting up
      fsImage.setStorageDirectories(dataDirs, editsDirs); // set the FSImage storage directories to ${dfs.name.dir} (default /tmp/hadoop/dfs/name); this is an array of directories
      fsImage.format();
      startOpt = StartupOption.REGULAR;
    }
    try {
      if (fsImage.recoverTransitionRead(dataDirs, editsDirs, startOpt)) { // analyze the ${dfs.name.dir} storage directories for the given startup option and, if necessary, recover from a previous transition
        fsImage.saveNamespace(true);
      }
      FSEditLog editLog = fsImage.getEditLog();
      assert editLog != null : "editLog must be initialized";
      if (!editLog.isOpen())
        editLog.open();
      fsImage.setCheckpointDirectories(null, null);
    } catch(IOException e) {
      fsImage.close();
      throw e;
    }
    synchronized (this) {
      this.ready = true;
      this.nameCache.initialized();
      this.notifyAll();
    }
  }

loadFSImage shows the steps of loading an FSImage: when formatting is requested, the in-memory FSImage object is formatted first; then the EditLog files in the configured storage directories are replayed onto that in-memory image; finally a fresh, empty EditLog is opened to record subsequent namespace modifications, so that the checkpoint process can later fold EditLog contents back into the FSImage, keeping the FSImage current and in sync with the EditLog.

For further analysis of FSDirectory, see http://blog.csdn.net/shirdrn/article/details/4631518

Summary

The sections above walked through the members and the initialization flow of the NameNode's core classes; here is a summary of the NameNode call chain:

Creating and deleting HDFS directories and files, and reading, writing, and appending to files, all go through clients calling the NameNode's interface over RPC.

The NameNode delegates each file operation to its FSNamesystem namesystem member, which handles lease management, network-topology control, file-permission checks, and so on.

namesystem in turn calls its FSDirectory dir member, which manages the mapping from file names to file blocks.

dir then drives its FSImage fsImage member, which appends every change in HDFS to the EditLog for persistence.

The Secondary NameNode periodically (hourly by default) merges the NameNode's EditLog and fsimage into a new fsimage, keeping the EditLog small.

This article covers only the responsibilities and call logic of the NameNode's core classes; for the details, read the relevant Hadoop source.
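The layering reads even more clearly stripped down to stubs. A toy sketch of the chain, where every class is a simplified stand-in for the real Hadoop class rather than its actual signature:

// Toy model of the NameNode -> FSNamesystem -> FSDirectory -> FSImage/EditLog chain.
public class CallChainDemo {
  static class EditLog {
    void logMkDir(String src) { System.out.println("EDIT mkdir " + src); } // persist the change
  }
  static class FsImageStub { EditLog editLog = new EditLog(); }
  static class DirStub {
    FsImageStub fsImage = new FsImageStub();
    void mkdirs(String src) { /* update the filename -> blocks tree */ fsImage.editLog.logMkDir(src); }
  }
  static class NamesystemStub {
    DirStub dir = new DirStub();
    void mkdirs(String src) { /* lease, permission, topology checks */ dir.mkdirs(src); }
  }
  static class NameNodeStub { // the RPC entry point that clients reach
    NamesystemStub namesystem = new NamesystemStub();
    void mkdirs(String src) { namesystem.mkdirs(src); }
  }
  public static void main(String[] args) {
    new NameNodeStub().mkdirs("/user/demo"); // prints: EDIT mkdir /user/demo
  }
}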

 

2.3.1.6.2.1 fsImage.recoverTransitionRead(dataDirs, editsDirs, startOpt)

 

/**
   * Analyze storage directories.
   * Recover from previous transitions if required.
   * Perform fs state transition if necessary depending on the namespace info.
   * Read storage info.
   *
   * @param dataDirs
   * @param startOpt startup option
   * @throws IOException
   * @return true if the image needs to be saved or false otherwise
   */
  boolean recoverTransitionRead(Collection<File> dataDirs,
                             Collection<File> editsDirs,
                                StartupOption startOpt
                                ) throws IOException {
    assert startOpt != StartupOption.FORMAT :
      "NameNode formatting should be performed before reading the image";
   
    // none of the data dirs exist
    if (dataDirs.size() == 0 || editsDirs.size() == 0) 
      throw new IOException(
        "All specified directories are not accessible or do not exist.");
   
    if(startOpt == StartupOption.IMPORT
        && (checkpointDirs == null || checkpointDirs.isEmpty()))
      throw new IOException("Cannot import image from a checkpoint. "
                          + "\"fs.checkpoint.dir\" is not set." );

    if(startOpt == StartupOption.IMPORT
        && (checkpointEditsDirs == null || checkpointEditsDirs.isEmpty()))
      throw new IOException("Cannot import image from a checkpoint. "
                          + "\"fs.checkpoint.edits.dir\" is not set." );
   
    setStorageDirectories(dataDirs, editsDirs);
    // 1. For each data directory calculate its state and
    // check whether all is consistent before transitioning.
    Map<StorageDirectory, StorageState> dataDirStates =
             new HashMap<StorageDirectory, StorageState>();
    boolean isFormatted = false;
    for (Iterator<StorageDirectory> it =
                      dirIterator(); it.hasNext();) {
      StorageDirectory sd = it.next();
      StorageState curState;
      try {
        curState = sd.analyzeStorage(startOpt);
        // sd is locked but not opened
        switch(curState) {
        case NON_EXISTENT:
          // name-node fails if any of the configured storage dirs are missing
          throw new InconsistentFSStateException(sd.getRoot(),
                                                 "storage directory does not exist or is not accessible.");
        case NOT_FORMATTED:
          break;
        case NORMAL:
          break;
        default:  // recovery is possible
          sd.doRecover(curState);     
        }
        if (curState != StorageState.NOT_FORMATTED
            && startOpt != StartupOption.ROLLBACK) {
          sd.read(); // read and verify consistency with other directories
          isFormatted = true;
        }
        if (startOpt == StartupOption.IMPORT && isFormatted)
          // import of a checkpoint is allowed only into empty image directories
          throw new IOException("Cannot import image from a checkpoint. "
              + " NameNode already contains an image in " + sd.getRoot());
      } catch (IOException ioe) {
        sd.unlock();
        throw ioe;
      }
      dataDirStates.put(sd,curState);
    }
   
    if (!isFormatted && startOpt != StartupOption.ROLLBACK
                     && startOpt != StartupOption.IMPORT)
      throw new IOException("NameNode is not formatted.");
    if (layoutVersion < LAST_PRE_UPGRADE_LAYOUT_VERSION) {
      checkVersionUpgradable(layoutVersion);
    }
    if (startOpt != StartupOption.UPGRADE
          && layoutVersion < LAST_PRE_UPGRADE_LAYOUT_VERSION
          && layoutVersion != FSConstants.LAYOUT_VERSION)
        throw new IOException(
                          "\nFile system image contains an old layout version " + layoutVersion
                          + ".\nAn upgrade to version " + FSConstants.LAYOUT_VERSION
                          + " is required.\nPlease restart NameNode with -upgrade option.");
    // check whether distributed upgrade is required and/or should be continued
    verifyDistributedUpgradeProgress(startOpt);

    // 2. Format unformatted dirs.
    this.checkpointTime = 0L;
    for (Iterator<StorageDirectory> it =
                     dirIterator(); it.hasNext();) {
      StorageDirectory sd = it.next();
      StorageState curState = dataDirStates.get(sd);
      switch(curState) {
      case NON_EXISTENT:
        assert false : StorageState.NON_EXISTENT + " state cannot be here";
      case NOT_FORMATTED:
        LOG.info("Storage directory " + sd.getRoot() + " is not formatted.");
        LOG.info("Formatting ...");
        sd.clearDirectory(); // create empty current dir
        break;
      default:
        break;
      }
    }

    // 3. Do transitions
    switch(startOpt) {
    case UPGRADE:
      doUpgrade();
      return false; // upgrade saved image already
    case IMPORT:
      doImportCheckpoint();
      return true;
    case ROLLBACK:
      doRollback();
      break;
    case REGULAR:
      // just load the image
    }
    return loadFSImage();
  }

2.3.1.6.2.1.1 FSImage.setStorageDirectories

void setStorageDirectories(Collection<File> fsNameDirs,
                        Collection<File> fsEditsDirs
                             ) throws IOException {
    storageDirs = new ArrayList<StorageDirectory>();
    removedStorageDirs = new ArrayList<StorageDirectory>();
    // Add all name dirs with appropriate NameNodeDirType
    for (File dirName : fsNameDirs) {
      boolean isAlsoEdits = false;
      for (File editsDirName : fsEditsDirs) {
        if (editsDirName.compareTo(dirName) == 0) {
          isAlsoEdits = true;
          fsEditsDirs.remove(editsDirName);
          break;
        }
      }
      NameNodeDirType dirType = (isAlsoEdits) ?
                          NameNodeDirType.IMAGE_AND_EDITS :
                          NameNodeDirType.IMAGE;
      addStorageDir(new StorageDirectory(dirName, dirType));
    }

 // Add edits dirs if they are different from name dirs
    for (File dirName : fsEditsDirs) {
      addStorageDir(new StorageDirectory(dirName, NameNodeDirType.EDITS));
    }
  }

protected void addStorageDir(StorageDirectory sd) {
    storageDirs.add(sd);
  }
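Worked example (hypothetical paths): with dfs.name.dir = /n1,/n2 and dfs.name.edits.dir = /n2,/e1, the loop registers /n1 as IMAGE, /n2 as IMAGE_AND_EDITS, and /e1 as EDITS.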

2.3.1.6.2.1.2 StorageDirectory.analyzeStorage

Storage.java


/**
     * Check consistency of the storage directory
     *
     * @param startOpt a startup option.
     * 
     * @return state {@link StorageState} of the storage directory
     * @throws InconsistentFSStateException if directory state is not
     * consistent and cannot be recovered.
     * @throws IOException
     */
    public StorageState analyzeStorage(StartupOption startOpt) throws IOException {
      assert root != null : "root is null";
      String rootPath = root.getCanonicalPath();  // /usr/local/hadoop/tmp/dfs/name
      try { // check that storage exists
        if (!root.exists()) {
          // storage directory does not exist
          if (startOpt != StartupOption.FORMAT) {
            LOG.info("Storage directory " + rootPath + " does not exist.");
            return StorageState.NON_EXISTENT;
          }
          LOG.info(rootPath + " does not exist. Creating ...");
          if (!root.mkdirs())
            throw new IOException("Cannot create directory " + rootPath);
        }
        // or is inaccessible
        if (!root.isDirectory()) {
          LOG.info(rootPath + "is not a directory.");
          return StorageState.NON_EXISTENT;
        }
        if (!root.canWrite()) {
          LOG.info("Cannot access storage directory " + rootPath);
          return StorageState.NON_EXISTENT;
        }
      } catch(SecurityException ex) {
        LOG.info("Cannot access storage directory " + rootPath, ex);
        return StorageState.NON_EXISTENT;
      }

      this.lock(); // lock storage if it exists

      if (startOpt == HdfsConstants.StartupOption.FORMAT)
        return StorageState.NOT_FORMATTED;
      if (startOpt != HdfsConstants.StartupOption.IMPORT) {
        //make sure no conversion is required
        checkConversionNeeded(this); // ques: what is this for?
      }

      // check whether current directory is valid
      File versionFile = getVersionFile();  // /usr/local/hadoop/tmp/dfs/name/current/VERSION
      boolean hasCurrent = versionFile.exists();

      // check which directories exist (all of these are false when debugging a regular startup)

      boolean hasPrevious = getPreviousDir().exists();  // /usr/local/hadoop/tmp/dfs/name/previous
      boolean hasPreviousTmp = getPreviousTmp().exists(); // /usr/local/hadoop/tmp/dfs/name/previous.tmp
      boolean hasRemovedTmp = getRemovedTmp().exists(); // /usr/local/hadoop/tmp/dfs/name/removed.tmp
      boolean hasFinalizedTmp = getFinalizedTmp().exists();
      boolean hasCheckpointTmp = getLastCheckpointTmp().exists();

      if (!(hasPreviousTmp || hasRemovedTmp
          || hasFinalizedTmp || hasCheckpointTmp)) {
        // no temp dirs - no recovery
        if (hasCurrent)
          return StorageState.NORMAL;
        if (hasPrevious)
          throw new InconsistentFSStateException(root,
                              "version file in current directory is missing.");
        return StorageState.NOT_FORMATTED;
      }

      if ((hasPreviousTmp?1:0) + (hasRemovedTmp?1:0)
          + (hasFinalizedTmp?1:0) + (hasCheckpointTmp?1:0) > 1)
        // more than one temp dirs
        throw new InconsistentFSStateException(root,
                                               "too many temporary directories.");

      // # of temp dirs == 1 should either recover or complete a transition
      if (hasCheckpointTmp) {
        return hasCurrent ? StorageState.COMPLETE_CHECKPOINT
                          : StorageState.RECOVER_CHECKPOINT;
      }

      if (hasFinalizedTmp) {
        if (hasPrevious)
          throw new InconsistentFSStateException(root,
                                                 STORAGE_DIR_PREVIOUS + " and " + STORAGE_TMP_FINALIZED
                                                 + "cannot exist together.");
        return StorageState.COMPLETE_FINALIZE;
      }

      if (hasPreviousTmp) {
        if (hasPrevious)
          throw new InconsistentFSStateException(root,
                                                 STORAGE_DIR_PREVIOUS + " and " + STORAGE_TMP_PREVIOUS
                                                 + " cannot exist together.");
        if (hasCurrent)
          return StorageState.COMPLETE_UPGRADE;
        return StorageState.RECOVER_UPGRADE;
      }
     
      assert hasRemovedTmp : "hasRemovedTmp must be true";
      if (!(hasCurrent ^ hasPrevious))
        throw new InconsistentFSStateException(root,
                                               "one and only one directory " + STORAGE_DIR_CURRENT
                                               + " or " + STORAGE_DIR_PREVIOUS
                                               + " must be present when " + STORAGE_TMP_REMOVED
                                               + " exists.");
      if (hasCurrent)
        return StorageState.COMPLETE_ROLLBACK;
      return StorageState.RECOVER_ROLLBACK;
    }
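In the regular startup traced in this article, the storage directory is healthy: current/VERSION exists and none of the temporary directories do, so the first branch returns StorageState.NORMAL, matching the debug notes in the code above.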

 

/**
     * Lock storage to provide exclusive access.
     *
     * <p> Locking is not supported by all file systems.
     * E.g., NFS does not consistently support exclusive locks.
     *
     * <p> If locking is supported we guarantee exclusive access to the
     * storage directory. Otherwise, no guarantee is given.
     *
     * @throws IOException if locking fails
     */
    public void lock() throws IOException {
      this.lock = tryLock();
      if (lock == null) {
        String msg = "Cannot lock storage " + this.root
          + ". The directory is already locked.";
        LOG.info(msg);
        throw new IOException(msg);
      }
    }

 

/**
     * Attempts to acquire an exclusive lock on the storage.
     *
     * @return A lock object representing the newly-acquired lock or
     * <code>null</code> if storage is already locked.
     * @throws IOException if locking fails.
     */
    FileLock tryLock() throws IOException {
      File lockF = new File(root, STORAGE_FILE_LOCK);
      lockF.deleteOnExit();
      RandomAccessFile file = new RandomAccessFile(lockF, "rws");
      FileLock res = null;
      try {
        res = file.getChannel().tryLock();
      } catch(OverlappingFileLockException oe) {
        file.close();
        return null;
      } catch(IOException e) {
        LOG.error("Cannot create lock on " + lockF, e);
        file.close();
        throw e;
      }
      return res;
    }
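The lock taken here is on the in_use.lock file in the storage root (STORAGE_FILE_LOCK), so starting a second NameNode against the same ${dfs.name.dir} fails fast in lock() with "Cannot lock storage ... The directory is already locked."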

 

2.3.1.6.2.1.3 StorageDirectory.read

Back in FSImage.recoverTransitionRead, sd.read() reads the VERSION file and verifies consistency with the other directories. The files a name directory manages are enumerated in NameNodeFile:

enum NameNodeFile {
    IMAGE     ("fsimage"),
    TIME      ("fstime"),
    EDITS     ("edits"),
    IMAGE_NEW ("fsimage.ckpt"),
    EDITS_NEW ("edits.new");
   
    private String fileName = null;
    private NameNodeFile(String name) {this.fileName = name;}
    String getName() {return fileName;}
  }

Below is the Storage code it calls:

/**
     * Read version file.
     *
     * @throws IOException if file cannot be read or contains inconsistent data
     */
    public void read() throws IOException {
      read(getVersionFile());
    }
   
    public void read(File from) throws IOException {
      RandomAccessFile file = new RandomAccessFile(from, "rws");  // /usr/local/hadoop/tmp/dfs/name/current/VERSION
      FileInputStream in = null;
      try {
        in = new FileInputStream(file.getFD());
        file.seek(0);
        Properties props = new Properties();
        props.load(in);
        getFields(props, this);
      } finally {
        if (in != null) {
          in.close();
        }
        file.close();
      }
    }

 protected void getFields(Properties props,
                           StorageDirectory sd
                           ) throws IOException {
    super.getFields(props, sd);
    if (layoutVersion == 0)
      throw new IOException("NameNode directory "
                            + sd.getRoot() + " is not formatted.");
    String sDUS, sDUV;
    sDUS = props.getProperty("distributedUpgradeState");   // observed: null
    sDUV = props.getProperty("distributedUpgradeVersion"); // observed: null
    setDistributedUpgradeState(
        sDUS == null? false : Boolean.parseBoolean(sDUS),
        sDUV == null? getLayoutVersion() : Integer.parseInt(sDUV));
    this.checkpointTime = readCheckpointTime(sd);
  }

/**
   * Get common storage fields.
   * Should be overloaded if additional fields need to be get.
   *
   * @param props
   * @throws IOException
   */
  protected void getFields(Properties props,
                           StorageDirectory sd
                           ) throws IOException {
    String sv, st, sid, sct;
    sv = props.getProperty("layoutVersion");
    st = props.getProperty("storageType");
    sid = props.getProperty("namespaceID");
    sct = props.getProperty("cTime");
    if (sv == null || st == null || sid == null || sct == null)
      throw new InconsistentFSStateException(sd.root,
                                             "file " + STORAGE_FILE_VERSION + " is invalid.");
    int rv = Integer.parseInt(sv);
    NodeType rt = NodeType.valueOf(st);
    int rid = Integer.parseInt(sid);
    long rct = Long.parseLong(sct);
    if (!storageType.equals(rt) ||
        !((namespaceID == 0) || (rid == 0) || namespaceID == rid))
      throw new InconsistentFSStateException(sd.root,
                                             "is incompatible with others.");
    if (rv < FSConstants.LAYOUT_VERSION) // future version
      throw new IncorrectVersionException(rv, "storage directory "
                                          + sd.root.getCanonicalPath());
    layoutVersion = rv;
    storageType = rt;
    namespaceID = rid;
    cTime = rct;
  }
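Putting read() and the two getFields overloads together: the VERSION file is an ordinary java.util.Properties file. A representative /usr/local/hadoop/tmp/dfs/name/current/VERSION would look like the following (all values illustrative):

#Thu Oct 24 21:32:22 CST 2013
namespaceID=1004527092
cTime=0
storageType=NAME_NODE
layoutVersion=-32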

 

long readCheckpointTime(StorageDirectory sd) throws IOException {
    File timeFile = getImageFile(sd, NameNodeFile.TIME); // /usr/local/hadoop/tmp/dfs/name/current/fstime
    long timeStamp = 0L;
    if (timeFile.exists() && timeFile.canRead()) {
      DataInputStream in = new DataInputStream(new FileInputStream(timeFile));
      try {
        timeStamp = in.readLong();
      } catch (IOException e) {
        LOG.info("Could not read fstime file in storage directory " + sd, e);
      } finally {
        in.close();
      }
    }
    return timeStamp;
  }