| | |
| |
| Hadoop学习- Hadoop初试-统计产品销量 |
|
[ 2011/9/2 14:03:00 | By: 梦翔儿 ] |
参照官方 wordcount 示例,统计每个产品的销量。

数据(每行格式:产品编号<TAB>销量):
131B 64
3CB2 61
BC1A 41
CCC2 59
ACC2 92
131B 6
3CB2 32
3CB2 36
BC1A 48
ACC2 40

目标:将相同产品编号的销量累加起来。程序如下:
1,Mapper: package com.sun.hadoop;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Mapper;
/** * @author sunjun * @create 2010-7-1 下午10:26:17 */ public class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { String line = value.toString(); if (!(line == null || "".equals(line))) { String[] array = line.split("\\t"); if (array != null && array.length == 2) { context.write(new Text(array[0].trim()), new IntWritable( Integer.parseInt(array[1].trim()))); } } }
}
2,Reducer: package com.sun.hadoop;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Reducer;
/** * @author sunjun * @create 2010-7-3 下午01:02:53 */ public class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException { int sum = 0; for (IntWritable value : values) { sum += value.get(); } context.write(key, new IntWritable(sum)); }
}
3,生成测试数据: /* * 生成测试数据,存放在C:\cygwin\home\Administrator\hadoop-0.20.2\test-data目录下,生成5个.txt文件 */ package com.sun.hadoop;
import java.io.File; import java.io.FileOutputStream; import java.io.OutputStream; import java.util.Random;
/**
 * Generates tab-separated test data files ("productId&lt;TAB&gt;quantity") for
 * the sale-total MapReduce job.
 *
 * <p>Note: the class name keeps the original's spelling ("Date" for "Data")
 * so existing references continue to compile.
 *
 * @author sunjun
 * @create 2010-7-3 下午01:49:15
 */
public class GenerateDate {

    /** Alphabet used to build random 4-character product ids. */
    private static char[] chars = "ABC123".toCharArray();

    /** Output directory (cygwin Hadoop install) for the generated .txt files. */
    private static final String OUTPUT_DIR =
            "C:\\cygwin\\home\\Administrator\\hadoop-0.20.2\\test-data\\";

    /**
     * Writes five files, 1.txt through 5.txt, each with 10000 random records.
     */
    public static void main(String[] args) {
        for (int i = 1; i < 6; i++) {
            generate(i + ".txt");
        }
        System.out.println("over.");
    }

    /**
     * Builds 10000 lines of "XXXX&lt;TAB&gt;n" (random 4-char product id,
     * quantity in [0, 100)) and writes them to OUTPUT_DIR/fileName.
     *
     * @param fileName name of the file to create inside OUTPUT_DIR
     */
    private static void generate(String fileName) {
        // One Random for the whole file: the original allocated a fresh
        // Random per record, which is wasteful and risks repeated seeds.
        Random random = new Random();
        StringBuilder str = new StringBuilder();
        int len = chars.length;
        int count = 10000;
        for (int i = 0; i < count; i++) {
            for (int j = 0; j < 4; j++) {
                str.append(chars[random.nextInt(len)]);
            }
            str.append("\t").append(random.nextInt(100));
            if (i < count - 1) {
                str.append("\n");
            }
        }
        OutputStream output = null;
        try {
            output = new FileOutputStream(new File(OUTPUT_DIR + fileName));
            // NOTE(review): getBytes() uses the platform default charset, as
            // the original did — confirm this matches what the job reads.
            output.write(str.toString().getBytes());
            output.flush();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Always release the stream; the original leaked it on a failed
            // write because close() was only reached on the success path.
            if (output != null) {
                try {
                    output.close();
                } catch (Exception ignored) {
                    // Best-effort close; nothing more to do on failure.
                }
            }
        }
    }
}
4,测试代码: /* * 该程序运行需要传递两个参数:输入目录,输出目录 */ package com.sun.hadoop;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.util.GenericOptionsParser;
/**
 * Driver for the "sale total" job: sums the quantity per product id using
 * MyMapper and MyReducer.
 *
 * <p>Usage: TestMy &lt;input dir&gt; &lt;output dir&gt; (Hadoop generic
 * options such as -D and -fs may precede the two paths).
 *
 * @author sunjun
 * @create 2010-7-3 下午01:08:07
 */
public class TestMy {

    public static void main(String[] args) {
        Configuration config = new Configuration();
        // GenericOptionsParser consumes Hadoop generic options; only the
        // remaining args are the input/output paths.
        String[] otherArgs = new GenericOptionsParser(config, args)
                .getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: wordcount <in> <out>");
            System.exit(2);
        }
        try {
            Job job = new Job(config, "sale total");
            job.setJarByClass(TestMy.class);

            job.setMapperClass(MyMapper.class);
            job.setReducerClass(MyReducer.class);

            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);

            // BUG FIX: use otherArgs (paths left after option parsing), not
            // raw args — with generic options present, args[0]/args[1] would
            // point at the options instead of the input/output directories.
            FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
            FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

            System.exit(job.waitForCompletion(true) ? 0 : 1);
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            // Restore the interrupt status so outer code can observe it.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
        // Reached only when job setup/submission threw: report failure
        // (the original fell off main here and exited with status 0).
        System.exit(1);
    }
}
在hadoop上运行: 1,在独立模式的hadoop上运行 先确保配置文件是配置的独立模式,而不是伪分布模式 将编译的classes文件复制到C:\cygwin\home\Administrator\hadoop-0.20.2
|
|
| | | |