/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.examples;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Random;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.ClusterStatus;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
* This program uses map/reduce to just run a distributed job where there is
* no interaction between the tasks and each task writes a large unsorted
* random sequence of words.
* For this program to generate data for terasort with 5-10 words per key
* and 20-100 words per value, use the following configuration:
*
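* <pre>{@code
* <?xml version="1.0"?>
* <configuration>
*   <property>
*     <name>mapreduce.randomtextwriter.minwordskey</name>
*     <value>5</value>
*   </property>
*   <property>
*     <name>mapreduce.randomtextwriter.maxwordskey</name>
*     <value>10</value>
*   </property>
*   <property>
*     <name>mapreduce.randomtextwriter.minwordsvalue</name>
*     <value>20</value>
*   </property>
*   <property>
*     <name>mapreduce.randomtextwriter.maxwordsvalue</name>
*     <value>100</value>
*   </property>
*   <property>
*     <name>mapreduce.randomtextwriter.totalbytes</name>
*     <value>1099511627776</value>
*   </property>
* </configuration>
* }</pre>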
*
* Equivalently, {@link RandomTextWriter} also supports all the above options
* and ones supported by {@link Tool} via the command-line.
*
* To run: bin/hadoop jar hadoop-${version}-examples.jar randomtextwriter
* [-outFormat output format class] output
*/
public class RandomTextWriter extends Configured implements Tool {
/**
* Configuration property name for the total number of bytes.
* NOTE(review): presumably the overall amount of data the job should
* produce; the code that reads this key is outside this view - confirm.
*/
public static final String TOTAL_BYTES =
"mapreduce.randomtextwriter.totalbytes";
/**
* Configuration property name for the number of bytes per map.
* NOTE(review): presumably the amount of data each map task writes -
* confirm against the code that reads this key.
*/
public static final String BYTES_PER_MAP =
"mapreduce.randomtextwriter.bytespermap";
/**
* Configuration property name for the number of maps per host.
* NOTE(review): presumably used to size the job from the cluster's host
* count - confirm against the code that reads this key.
*/
public static final String MAPS_PER_HOST =
"mapreduce.randomtextwriter.mapsperhost";
/** Configuration property name for the maximum number of words per value. */
public static final String MAX_VALUE = "mapreduce.randomtextwriter.maxwordsvalue";
/** Configuration property name for the minimum number of words per value. */
public static final String MIN_VALUE = "mapreduce.randomtextwriter.minwordsvalue";
/** Configuration property name for the minimum number of words per key. */
public static final String MIN_KEY = "mapreduce.randomtextwriter.minwordskey";
/** Configuration property name for the maximum number of words per key. */
public static final String MAX_KEY = "mapreduce.randomtextwriter.maxwordskey";
static int printUsage() {
System.out.println("randomtextwriter " +
"[-outFormat