The HDFS Java API


HDFS file read flow

  • The client calls FileSystem.open()
    1. FileSystem talks to the NameNode (NN) over RPC; the NN returns part or all of the file's block list, together with the addresses of the DataNodes (DNs) that hold each block's replicas
    2. open() returns an FSDataInputStream; for each block, the stream connects to the DN closest to the client and reads from it
  • The client calls read() on the input stream
    1. When the end of a block is reached, the FSDataInputStream closes the connection to the current DN and looks up the closest DN holding the next block
    2. Each block read is verified against its checksum; if an error occurs while reading from a DN, the client reports it to the NN and continues from the next DN that holds a replica of that block
    3. If the block list is exhausted but the file has not ended, FileSystem fetches the next batch of blocks from the NN
  • The client closes the FSDataInputStream (a minimal sketch of this path follows the list)
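
A minimal client-side sketch of this read path, assuming the hdfs://localhost.localdomain:9000 NameNode and the /in/dir/0.log file used later in this article (the method name readSketch is illustrative only; the imports are the same Configuration, FileSystem, Path, FSDataInputStream and IOUtils classes used in the full example below):

public static void readSketch() throws Exception {
    Configuration conf = new Configuration();
    conf.set("fs.defaultFS", "hdfs://localhost.localdomain:9000");
    FileSystem fs = FileSystem.get(conf);
    // open() asks the NN for the block list over RPC and returns an FSDataInputStream
    FSDataInputStream in = fs.open(new Path("/in/dir/0.log"));
    try {
        byte[] buf = new byte[4096];
        int n;
        // each read() pulls bytes from the closest DN holding the current block;
        // the stream switches DNs transparently at block boundaries
        while ((n = in.read(buf)) > 0) {
            System.out.write(buf, 0, n);
        }
    } finally {
        IOUtils.closeStream(in);
    }
}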

HDFS file write flow

  • The client calls FileSystem.create()
    1. FileSystem asks the NN to create a new file in its namespace; no blocks are associated with it yet
    2. The NN checks that the file does not already exist and that the client has permission; if the checks pass, it records the new file, and blocks with their target DNs are allocated as data is written
    3. create() returns an FSDataOutputStream through which the client performs the writes
  • The client calls write() on the output stream
    By default, the FSDataOutputStream sends each data packet to the first DN in the pipeline; the first DN forwards it to the second, and so on
  • The client calls close() on the stream
    The packets remaining in the buffer are flushed; once every block has reached its replication factor, the NN acknowledges success (a minimal sketch of this path follows the list)
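
A corresponding minimal write-path sketch under the same assumptions (the target path /in/dir/hello.log and the method name writeSketch are illustrative only):

public static void writeSketch() throws Exception {
    Configuration conf = new Configuration();
    conf.set("fs.defaultFS", "hdfs://localhost.localdomain:9000");
    FileSystem fs = FileSystem.get(conf);
    // create() asks the NN to add the file to its namespace and returns an FSDataOutputStream
    FSDataOutputStream out = fs.create(new Path("/in/dir/hello.log"));
    try {
        // write() hands data packets to the DN pipeline: first DN -> second DN -> ...
        out.writeBytes("hello hdfs\n");
    } finally {
        // close() flushes the remaining packets and waits for the blocks to be fully replicated
        out.close();
    }
}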

Reading file contents in HDFS via a URL

static {
    // Register an HDFS-aware URL stream handler so java.net.URL can open hdfs:// URLs
    // (this factory can be set at most once per JVM)
    URL.setURLStreamHandlerFactory(new FsUrlStreamHandlerFactory());
}

public void urlway(String fileurl) {
    InputStream in = null;
    try {
        in = new URL(fileurl).openStream();
        // 4096 is the copy buffer size
        IOUtils.copyBytes(in, System.out, 4096);
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        IOUtils.closeStream(in);
    }
}

// Example call
urlway("hdfs://localhost.localdomain:9000/in/dir/0.log");

Operating on HDFS with the FileSystem class

package hadoop.examples;

import java.io.InputStream;
import java.net.URL;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FsUrlStreamHandlerFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.io.IOUtils;

public class WordCount {

    static {
        // Register an HDFS-aware URL stream handler so java.net.URL can open hdfs:// URLs
        // (this factory can be set at most once per JVM)
        URL.setURLStreamHandlerFactory(new FsUrlStreamHandlerFactory());
    }

    public static void urlway(String fileurl) {
        InputStream in = null;
        try {
            in = new URL(fileurl).openStream();
            // 4096 is the copy buffer size
            IOUtils.copyBytes(in, System.out, 4096);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            IOUtils.closeStream(in);
        }
    }

    // Obtain a FileSystem handle
    public static FileSystem GetFileSystem() {
        FileSystem hdfs = null;
        try {
            Configuration conf = new Configuration();
            conf.set("fs.defaultFS", "hdfs://localhost.localdomain:9000");
            hdfs = FileSystem.get(conf);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return hdfs;
    }

    // Upload a local file
    public static void upload() throws Exception {
        FileSystem hdfs = GetFileSystem();
        Path srcpath = new Path("c:/123.log");
        Path path = new Path("/in/dir");
        hdfs.copyFromLocalFile(srcpath, path);
        // Equivalent overload with explicit flags (delSrc=false, overwrite=true):
        // hdfs.copyFromLocalFile(false, true, srcpath, path);
    }

    // List a directory
    public static void scandir() throws Exception {
        FileSystem hdfs = GetFileSystem();
        Path path = new Path("/in/dir");
        FileStatus[] fileStatus = hdfs.listStatus(path);
        for (FileStatus fs : fileStatus) {
            Path p = fs.getPath();
            String info = fs.isFile() ? "file" : "dir";
            System.out.println(p.toString()+":"+info);
        }
    }

    // Create a directory
    public static void makedir() throws Exception {
        FileSystem hdfs = GetFileSystem();
        Path path = new Path("/in/dirx");
        boolean isSuc = hdfs.mkdirs(path);
        if (isSuc) System.out.println("mkdir yes");
        else System.out.println("mkdir no");
    }

    // Create a file and write content to it
    public static void createandwrite() throws Exception {
        FileSystem hdfs = GetFileSystem();
        Path path = new Path("/in/dir/789.log");
        FSDataOutputStream s = hdfs.create(path);
        s.writeChars("hello");
        s.close();
    }

    // Rename one file and delete another
    public static void delorrename() throws Exception {
        FileSystem hdfs = GetFileSystem();
        Path src = new Path("/in/dir/789.log");
        Path dst = new Path("/in/dir/0.log");
        hdfs.rename(src, dst);
        Path del = new Path("/in/dir/456.log");
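        // Note: deleteOnExit() only marks the path for deletion when this FileSystem is closed;
        // for an immediate delete, hdfs.delete(del, false) could be used instead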
        hdfs.deleteOnExit(del);
    }

    // Read a file's contents
    public static void read() throws Exception {
        FileSystem hdfs = GetFileSystem();
        Path path = new Path("/record.txt");
        FSDataInputStream inStream = hdfs.open(path);
        // Copy the file to the console
        IOUtils.copyBytes(inStream, System.out, 4096);
        IOUtils.closeStream(inStream);
    }

    // Where a file's blocks are located in the cluster
    public static void getstatus() throws Exception {
        FileSystem hdfs = GetFileSystem();
        Path path = new Path("/in/dir/0.log");
        FileStatus fileStatus = hdfs.getFileStatus(path);
        BlockLocation[] blockLocations = hdfs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
        for(BlockLocation blockLocation:blockLocations) {
            String[] hosts = blockLocation.getHosts();
            for(String host:hosts) {
                System.out.print(host+" ");
            }
        }
    }

    // Information about every DataNode in the cluster
    public static void getcluster() throws Exception {
        FileSystem hdfs = GetFileSystem();
        DistributedFileSystem dfs = (DistributedFileSystem)hdfs;

        DatanodeInfo[] infos = dfs.getDataNodeStats();
        for (DatanodeInfo info : infos) {
            String hostname = info.getHostName();
            System.out.print(hostname+" ");
        }
    }

    /**
     * Prints the contents of /in/dir/0.log and the hostnames of the cluster's DataNodes.
     * @param args unused
     * @throws Exception on any HDFS error
     */
    public static void main(String[] args) throws Exception {
        urlway("hdfs://localhost.localdomain:9000/in/dir/0.log");
        getcluster();
    }
}

A few relevant APIs

Excerpts from the Hadoop source. copyFromLocalFile() and getFileBlockLocations() are defined on org.apache.hadoop.fs.FileSystem; copyBytes() is a static helper on org.apache.hadoop.io.IOUtils, not on FileSystem.

public abstract class FileSystem extends Configured implements Closeable {

  /**
   * The src file is on the local disk.  Add it to FS at
   * the given dst name.
   * delSrc indicates if the source should be removed
   * @param delSrc whether to delete the src
   * @param overwrite whether to overwrite an existing file
   * @param src path
   * @param dst path
   */
  public void copyFromLocalFile(boolean delSrc, boolean overwrite,
                                Path src, Path dst)
    throws IOException {
    Configuration conf = getConf();
    FileUtil.copy(getLocal(conf), src, this, dst, delSrc, overwrite, conf);
  }

  /**
   * Return an array containing hostnames, offset and size of 
   * portions of the given file.  For a nonexistent 
   * file or regions, null will be returned.
   *
   * This call is most helpful with DFS, where it returns 
   * hostnames of machines that contain the given file.
   *
   * The FileSystem will simply return an elt containing 'localhost'.
   *
   * @param p path is used to identify an FS since an FS could have
   *          another FS that it could be delegating the call to
   * @param start offset into the given file
   * @param len length for which to get locations for
   */
  public BlockLocation[] getFileBlockLocations(Path p, 
      long start, long len) throws IOException {
    if (p == null) {
      throw new NullPointerException();
    }
    FileStatus file = getFileStatus(p);
    return getFileBlockLocations(file, start, len);
  }
}

// From org.apache.hadoop.io.IOUtils
public class IOUtils {

  /**
   * Copies from one stream to another.
   *
   * @param in InputStream to read from
   * @param out OutputStream to write to
   * @param buffSize the size of the buffer
   */
  public static void copyBytes(InputStream in, OutputStream out, int buffSize)
    throws IOException {
    PrintStream ps = out instanceof PrintStream ? (PrintStream)out : null;
    byte buf[] = new byte[buffSize];
    int bytesRead = in.read(buf);
    while (bytesRead >= 0) {
      out.write(buf, 0, bytesRead);
      if ((ps != null) && ps.checkError()) {
        throw new IOException("Unable to write to output stream.");
      }
      bytesRead = in.read(buf);
    }
  }
}
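
A hedged sketch of how these calls fit together; the paths mirror the ones used earlier in this article and are assumptions, and the snippet is meant to live in a method that declares throws Exception, like those above:

Configuration conf = new Configuration();
conf.set("fs.defaultFS", "hdfs://localhost.localdomain:9000");
FileSystem hdfs = FileSystem.get(conf);

// delSrc=false keeps the local file, overwrite=true replaces any existing target
hdfs.copyFromLocalFile(false, true, new Path("c:/123.log"), new Path("/in/dir"));

// Path-based overload: the FileStatus lookup happens inside getFileBlockLocations
Path p = new Path("/in/dir/123.log");
long len = hdfs.getFileStatus(p).getLen();
for (BlockLocation loc : hdfs.getFileBlockLocations(p, 0, len)) {
    System.out.println("offset=" + loc.getOffset()
            + " length=" + loc.getLength()
            + " hosts=" + java.util.Arrays.toString(loc.getHosts()));
}

// IOUtils.copyBytes streams the file to the console with a 4096-byte buffer
IOUtils.copyBytes(hdfs.open(p), System.out, 4096);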
