I have been configuring Hadoop nodes at work with the downloaded tar ball from Apache. I have even created a basic RPM from it so that it is easy to install with Puppet on the CentOS servers.
I simply install Hadoop in the /opt directory by untarring it and creating a symlink /opt/hadoop that points to /opt/hadoop-2.4.1.
I added a few lines to the .bashrc of the user that runs Hadoop:
# Hadoop environment for the Hadoop user's ~/.bashrc.
# All HADOOP_*_HOME variables point at the /opt/hadoop symlink so that
# upgrading is just re-pointing the symlink.
export JAVA_HOME=/usr/lib/jvm/jre
export HADOOP_INSTALL=/opt/hadoop
export HADOOP_MAPRED_HOME="$HADOOP_INSTALL"
export HADOOP_COMMON_HOME="$HADOOP_INSTALL"
export HADOOP_HDFS_HOME="$HADOOP_INSTALL"
export YARN_HOME="$HADOOP_INSTALL"
# NOTE: these were fused onto one line ("...lib/nativeexport PATH=..."),
# which silently set HADOOP_COMMON_LIB_NATIVE_DIR to the wrong value
# ("/opt/hadoop/lib/nativeexport"); they must be two separate statements.
export HADOOP_COMMON_LIB_NATIVE_DIR="$HADOOP_INSTALL/lib/native"
export PATH="$PATH:$HADOOP_INSTALL/sbin:$HADOOP_INSTALL/bin:$JAVA_HOME/bin"
I have also configured some Hadoop configuration files in /opt/hadoop/etc/hadoop:
mapred-site.xml
<?xml version="1.0"?>
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
</configuration>
yarn-site.xml
<?xml version="1.0"?>
<configuration>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.resourcemanager.address</name>
<value>127.0.0.1:8032</value>
</property>
<property>
<name>yarn.resourcemanager.scheduler.address</name>
<value>127.0.0.1:8030</value>
</property>
<property>
<name>yarn.resourcemanager.resource-tracker.address</name>
<value>127.0.0.1:8031</value>
</property>
</configuration>
core-site.xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>fs.default.name</name>
<value>hdfs://localhost:9000</value>
</property>
</configuration>
hdfs-site.xml
<?xml version="1.0"?>
<configuration>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
<property>
<name>dfs.name.dir</name>
<value>file:///data/namenode</value>
</property>
<property>
<name>dfs.data.dir</name>
<value>file:///data/datanode</value>
</property>
</configuration>