当前位置: 移动技术网 > IT编程>数据库>其他数据库 > Java实现MapReduce Wordcount案例

Java实现MapReduce Wordcount案例

2019年11月02日  | 移动技术网IT编程  | 我要评论

先改pom.xml:

<!-- Maven POM for the WordCount example. Element names, namespace URIs and
     property references are case-sensitive. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
	<modelVersion>4.0.0</modelVersion>
	<groupId>com.mcq</groupId>
	<artifactId>mr-1101</artifactId>
	<version>0.0.1-snapshot</version>
	<dependencies>
		<!-- tools.jar from the local JDK (JDK 8 layout), resolved via JAVA_HOME -->
		<dependency>
			<groupId>jdk.tools</groupId>
			<artifactId>jdk.tools</artifactId>
			<version>1.8</version>
			<scope>system</scope>
			<systemPath>${JAVA_HOME}/lib/tools.jar</systemPath>
		</dependency>
		<!-- RELEASE is a floating meta-version; pin a concrete version for reproducible builds -->
		<dependency>
			<groupId>junit</groupId>
			<artifactId>junit</artifactId>
			<version>RELEASE</version>
		</dependency>
		<dependency>
			<groupId>org.apache.logging.log4j</groupId>
			<artifactId>log4j-core</artifactId>
			<version>2.8.2</version>
		</dependency>
		<!-- Hadoop artifacts: keep all three on the same version -->
		<dependency>
			<groupId>org.apache.hadoop</groupId>
			<artifactId>hadoop-common</artifactId>
			<version>2.7.2</version>
		</dependency>
		<dependency>
			<groupId>org.apache.hadoop</groupId>
			<artifactId>hadoop-client</artifactId>
			<version>2.7.2</version>
		</dependency>
		<dependency>
			<groupId>org.apache.hadoop</groupId>
			<artifactId>hadoop-hdfs</artifactId>
			<version>2.7.2</version>
		</dependency>
	</dependencies>
</project>

在resources文件夹下添加文件 log4j.properties:

# log4j 1.x configuration (org.apache.log4j.* classes). Keys and class names are case-sensitive.
# Root logger: level INFO, writes to the "stdout" appender only.
log4j.rootLogger=INFO, stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d %p [%c] - %m%n
# The "logfile" appender is defined here but is not attached to the root logger above.
log4j.appender.logfile=org.apache.log4j.FileAppender
log4j.appender.logfile.File=target/spring.log
log4j.appender.logfile.layout=org.apache.log4j.PatternLayout
log4j.appender.logfile.layout.ConversionPattern=%d %p [%c] - %m%n

 

 wordcountdriver.java:

package com.mcq;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver for the WordCount job: configures the job, submits it and waits for it to finish.
 *
 * <p>Usage: {@code wordcountdriver <input path> <output path>}
 */
public class wordcountdriver {
	/**
	 * Configures and runs the WordCount job, then exits with 0 on success or 1 on failure.
	 *
	 * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
	 * @throws IOException            if the job cannot be created or submitted
	 * @throws ClassNotFoundException if a job class cannot be loaded
	 * @throws InterruptedException   if the thread is interrupted while waiting for the job
	 */
	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		// Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
		if (args.length < 2) {
			System.err.println("Usage: wordcountdriver <input path> <output path>");
			System.exit(2);
		}
		System.out.println("hello");
		Configuration conf = new Configuration();
		// 1. Obtain the job object.
		Job job = Job.getInstance(conf);
		// 2. Locate the jar through the driver class.
		job.setJarByClass(wordcountdriver.class);
		// 3. Register the mapper and reducer classes.
		job.setMapperClass(wordcountmapper.class);
		job.setReducerClass(wordcountreducer.class);
		// 4. Key/value types emitted by the map phase.
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(IntWritable.class);
		// 5. Key/value types of the final output.
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		// 6. Input and output paths.
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		// 7. Submit the job and block until it completes; true = print progress to the console.
		boolean succeeded = job.waitForCompletion(true);
		// Propagate the job result so scripts and schedulers can detect a failed run.
		System.exit(succeeded ? 0 : 1);
	}
}

 wordcountmapper.java:

package com.mcq;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Map phase of WordCount: emits {@code (word, 1)} for every word of every input line.
 *
 * <p>Type parameters of {@code Mapper}:
 * <ul>
 *   <li>KEYIN: key of the input record, the offset of the line in the input
 *       ({@code LongWritable})</li>
 *   <li>VALUEIN: value of the input record, the text of the line ({@code Text})</li>
 *   <li>KEYOUT: type of the emitted key, the word ({@code Text})</li>
 *   <li>VALUEOUT: type of the emitted value, the count ({@code IntWritable})</li>
 * </ul>
 */
public class wordcountmapper extends Mapper<LongWritable, Text, Text, IntWritable> {
	// Reused across map() calls to avoid allocating a new writable per word.
	private final IntWritable v = new IntWritable(1);
	private final Text k = new Text();

	/**
	 * Splits one line into words and writes {@code (word, 1)} for each of them.
	 *
	 * @param key     offset of the line in the input (unused)
	 * @param value   the line of text
	 * @param context sink for the emitted pairs
	 * @throws IOException          if writing to the context fails
	 * @throws InterruptedException if the task is interrupted while writing
	 */
	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		// 1. Read the line.
		String line = value.toString();
		// 2. Tokenize on any run of whitespace. Unlike split(" "), this never yields
		//    empty tokens for blank lines or repeated/leading spaces, so no "" key is counted.
		StringTokenizer tokens = new StringTokenizer(line);
		// 3. Emit each word with a count of one.
		while (tokens.hasMoreTokens()) {
			k.set(tokens.nextToken());
			context.write(k, v);
		}
	}
}

 wordcountreducer.java:

package com.mcq;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Reduce phase of WordCount: sums the counts emitted for each word.
 *
 * <p>KEYIN and VALUEIN are the key and value types emitted by the map phase.
 */
public class wordcountreducer extends Reducer<Text, IntWritable, Text, IntWritable> {
	// Reused across reduce() calls to avoid allocating a new writable per key.
	private final IntWritable v = new IntWritable();

	/**
	 * Adds up all counts for one word and writes {@code (word, total)}.
	 *
	 * @param key     the word
	 * @param values  the counts emitted for this word
	 * @param context sink for the result
	 * @throws IOException          if writing to the context fails
	 * @throws InterruptedException if the task is interrupted while writing
	 */
	@Override
	protected void reduce(Text key, Iterable<IntWritable> values, Context context)
			throws IOException, InterruptedException {
		int sum = 0;
		for (IntWritable value : values) {
			sum += value.get();
		}
		v.set(sum);
		context.write(key, v);
	}
}

在 Run Configurations 的程序参数(Program arguments)里加上 e:/mrtest/in.txt e:/mrtest/out.txt,分别作为输入路径和输出路径。注意输出路径 e:/mrtest/out.txt 是一个目录,运行前不能已存在,否则作业会报错。

 

 

运行时遇到了个bug,参考https://blog.csdn.net/qq_40310148/article/details/86617512解决了

 

在集群上运行:

用maven打成jar包,需要添加一些打包依赖:

	<!-- Packaging setup: compile for Java 8 and, during the package phase, also build a
	     jar-with-dependencies whose manifest names the driver class as the entry point. -->
	<build>
		<plugins>
			<plugin>
				<artifactId>maven-compiler-plugin</artifactId>
				<version>2.3.2</version>
				<configuration>
					<source>1.8</source>
					<target>1.8</target>
				</configuration>
			</plugin>
			<plugin>
				<artifactId>maven-assembly-plugin</artifactId>
				<configuration>
					<descriptorRefs>
						<descriptorRef>jar-with-dependencies</descriptorRef>
					</descriptorRefs>
					<archive>
						<manifest>
							<!-- fully qualified name of the driver class -->
							<mainClass>com.mcq.wordcountdriver</mainClass>
						</manifest>
					</archive>
				</configuration>
				<executions>
					<execution>
						<id>make-assembly</id>
						<phase>package</phase>
						<goals>
							<goal>single</goal>
						</goals>
					</execution>
				</executions>
			</plugin>
		</plugins>
	</build>

 注意上面 manifest 里的主类要填驱动类的全限定类名(包名 + 类名),可以在类名上点击右键选择 Copy Qualified Name 来复制。

将程序打成jar包(具体操作:右键工程名 → Run As → Maven install,然后target文件夹会产生两个jar包,我们把不带依赖的那个包拷贝到hadoop集群上,因为集群已经配好相关依赖了),上传到集群。

输入以下命令运行

hadoop jar mr-1101-0.0.1-snapshot.jar com.mcq.wordcountdriver /xiaocao.txt /output

注意这里输入输出的路径是集群上(HDFS)的路径,并且输出目录 /output 在运行前不能已存在,否则作业会报错。

如对本文有疑问, 点击进行留言回复!!

相关文章:

验证码:
移动技术网