Tuesday, November 12, 2013

Configuring Hue 3.0


Configuring Hue with Hadoop and Hive running in pseudo-distributed mode.

Prerequisites:

  • Hue 3.0
  • Apache Hadoop 2.2
  • Apache Hive 0.12.0
Note:
Hue 3.0 works only with Hive 0.12.0.
Hadoop must expose the WebHDFS REST interface (enabled in hdfs-site.xml below).
Hive must include HiveServer2, which Hue connects to for running queries.
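
Before going further, it is worth confirming both installs. A quick sanity check, assuming the /usr/local/hadoop-2.2.0 and /usr/local/hive-0.12.0 paths used throughout this post:

$ /usr/local/hadoop-2.2.0/bin/hadoop version
$ ls /usr/local/hive-0.12.0/bin/hiveserver2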

Configurations:


core-site.xml

<configuration>
  <property>
    <!-- fs.default.name is deprecated in Hadoop 2.x -->
    <name>fs.defaultFS</name>
    <value>hdfs://localhost:9000</value>
  </property>
  <property>
    <name>hadoop.tmp.dir</name>
    <value>/usr/local/hadoop-2.2.0/tmp</value>
  </property>
  <!-- Hue WebHDFS proxy user settings -->
  <property>
    <name>hadoop.proxyuser.hue.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.hue.groups</name>
    <value>*</value>
  </property>
</configuration>
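
The two proxyuser properties let the user Hue runs as (hue by default) impersonate the logged-in Hue user when it talks to WebHDFS. After HDFS is restarted, you can sanity-check impersonation with a doas request; hue and hduser below are assumptions, substitute the users on your machine:

$ curl -s "http://localhost:50070/webhdfs/v1/?op=GETFILESTATUS&user.name=hue&doas=hduser"

With security off, this should return a JSON FileStatus for the HDFS root rather than an authorization error.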

hdfs-site.xml

<configuration>
  <property>
    <name>dfs.replication</name>
    <value>1</value>
  </property>
  <property>
    <!-- dfs.permissions is deprecated in Hadoop 2.x; disabling permissions is fine for a dev box, not for production -->
    <name>dfs.permissions.enabled</name>
    <value>false</value>
  </property>
  <property>
    <name>dfs.webhdfs.enabled</name>
    <value>true</value>
  </property>
</configuration>
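
dfs.webhdfs.enabled is the setting Hue depends on. After restarting HDFS, you can verify that the REST interface answers, independently of Hue:

$ curl -s "http://localhost:50070/webhdfs/v1/?op=LISTSTATUS"

A JSON FileStatuses listing of the HDFS root means WebHDFS is up.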



mapred-site.xml

<configuration>
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
</configuration>
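
With mapreduce.framework.name set to yarn, a quick way to prove that MapReduce jobs really run on YARN is to submit one of the bundled examples; the jar path assumes the Hadoop 2.2.0 tarball layout:

$ /usr/local/hadoop-2.2.0/bin/hadoop jar \
    /usr/local/hadoop-2.2.0/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.2.0.jar pi 2 5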



yarn-site.xml

<configuration>
<!-- Site specific YARN configuration properties -->
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <property>
    <name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
    <value>org.apache.hadoop.mapred.ShuffleHandler</value>
  </property>
  <property>
    <name>yarn.resourcemanager.resource-tracker.address</name>
    <value>localhost:8025</value>
  </property>
  <property>
    <name>yarn.resourcemanager.scheduler.address</name>
    <value>localhost:8030</value>
  </property>
  <property>
    <name>yarn.resourcemanager.address</name>
    <value>localhost:8032</value>
  </property>
  <property>
    <name>yarn.resourcemanager.scheduler.class</name>
    <value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler</value>
    <description>Explicitly select the scheduler; the CapacityScheduler is already the default in Hadoop 2.2</description>
  </property>
  <property>
    <name>yarn.nodemanager.address</name>
    <value>localhost:8050</value>
    <description>The NodeManagers bind to this port</description>
  </property>
</configuration>
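
Note that yarn.resourcemanager.address (8032 here, the default IPC port) must match resourcemanager_port in hue.ini below. Once YARN is restarted, the ResourceManager REST API that Hue's Job Browser uses can be checked directly:

$ curl -s http://localhost:8088/ws/v1/cluster/info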


hue.ini

#####################################
# DEVELOPMENT EDITION
#####################################

# Hue configuration file
# ===================================
#
# For complete documentation about the contents of this file, run
#       $ <hue_root>/build/env/bin/hue config_help
#
# All .ini files under the current directory are treated equally.  Their
# contents are merged to form the Hue configuration, which can
# be viewed in Hue at
#       http://<hue_host>:<hue_port>/dump_config


###########################################################################
# General configuration for core Desktop features (authentication, etc)
###########################################################################

[desktop]

  send_dbug_messages=1

  # To show database transactions, set database_logging to 1
  database_logging=0

  # Set this to a random string, the longer the better.
  # This is used for secure hashing in the session store.
  secret_key=abcdefghijklmnopqrstuvwxyz1234567890

  # Webserver listens on this address and port
  http_host=0.0.0.0
  http_port=8000

  # Time zone name
  time_zone=America/Los_Angeles

  # Enable or disable Django debug mode
  ## django_debug_mode=true

  # Enable or disable backtrace for server error
  ## http_500_debug_mode=true

  # Server email for internal error messages
  ## django_server_email='hue@localhost.localdomain'

  # Email backend
  ## django_email_backend=django.core.mail.backends.smtp.EmailBackend

  # Webserver runs as this user
  ## server_user=hue
  ## server_group=hue

  # If set to false, runcpserver will not actually start the web server.
  # Used if Apache is being used as a WSGI container.
  ## enable_server=yes

  # Number of threads used by the CherryPy web server
  ## cherrypy_server_threads=10

  # Filename of SSL Certificate
  ## ssl_certificate=

  # Filename of SSL RSA Private Key
  ## ssl_private_key=

  # Default encoding for site data
  ## default_site_encoding=utf-8

  # Help improve Hue with anonymous usage analytics.
  # Use Google Analytics to see how many times an application or specific section of an application is used, nothing more.
  ## collect_usage=true

  # Administrators
  # ----------------
  [[django_admins]]
    ## [[[admin1]]]
    ## name=john
    ## email=john@doe.com

  # UI customizations
  # -------------------
  [[custom]]

  # Top banner HTML code
  ## banner_top_html=

  # Configuration options for user authentication into the web application
  # ------------------------------------------------------------------------
  [[auth]]

    # Authentication backend. Common settings are:
    # - django.contrib.auth.backends.ModelBackend (entirely Django backend)
    # - desktop.auth.backend.AllowAllBackend (allows everyone)
    # - desktop.auth.backend.AllowFirstUserDjangoBackend
    #     (Default. Relies on Django and user manager, after the first login)
    # - desktop.auth.backend.LdapBackend
    # - desktop.auth.backend.PamBackend
    # - desktop.auth.backend.SpnegoDjangoBackend
    # - desktop.auth.backend.RemoteUserDjangoBackend
    # - desktop.auth.backend.OAuthBackend
    # - libsaml.backend.SAML2Backend
    ## backend=desktop.auth.backend.AllowFirstUserDjangoBackend

    # Backend to synchronize user-group membership with
    ## user_group_membership_synchronization_backend=desktop.auth.backend.LdapSynchronizationBackend

    ## pam_service=login

    # When using the desktop.auth.backend.RemoteUserDjangoBackend, this sets
    # the normalized name of the header that contains the remote user.
    # The HTTP header in the request is converted to a key by converting
    # all characters to uppercase, replacing any hyphens with underscores
    # and adding an HTTP_ prefix to the name. So, for example, if the header
    # is called Remote-User that would be configured as HTTP_REMOTE_USER
    #
    # Defaults to HTTP_REMOTE_USER
    ## remote_user_header=HTTP_REMOTE_USER

  # Configuration options for connecting to LDAP and Active Directory
  # -------------------------------------------------------------------
  [[ldap]]

    # The search base for finding users and groups
    ## base_dn="DC=mycompany,DC=com"

    # The NT domain to connect to (only for use with Active Directory)
    ## nt_domain=mycompany.com

    # URL of the LDAP server
    ## ldap_url=ldap://auth.mycompany.com

    # A PEM-format file containing certificates for the CA's that
    # Hue will trust for authentication over TLS.
    # The certificate for the CA that signed the
    # LDAP server certificate must be included among these certificates.
    # See more here http://www.openldap.org/doc/admin24/tls.html.
    ## ldap_cert=
    ## use_start_tls=true

    # Distinguished name of the user to bind as -- not necessary if the LDAP server
    # supports anonymous searches
    ## bind_dn="CN=ServiceAccount,DC=mycompany,DC=com"

    # Password of the bind user -- not necessary if the LDAP server supports
    # anonymous searches
    ## bind_password=

    # Pattern for searching for usernames -- Use <username> for the parameter
    # For use when using LdapBackend for Hue authentication
    ## ldap_username_pattern="uid=<username>,ou=People,dc=mycompany,dc=com"

    # Create users in Hue when they try to login with their LDAP credentials
    # For use when using LdapBackend for Hue authentication
    ## create_users_on_login = true

    # Use search bind authentication.
    ## search_bind_authentication=true

    [[[users]]]

      # Base filter for searching for users
      ## user_filter="objectclass=*"

      # The username attribute in the LDAP schema
      ## user_name_attr=sAMAccountName

    [[[groups]]]

      # Base filter for searching for groups
      ## group_filter="objectclass=*"

      # The group name attribute in the LDAP schema
      ## group_name_attr=cn

  # Configuration options for specifying the Desktop Database.  For more info,
  # see http://docs.djangoproject.com/en/1.1/ref/settings/#database-engine
  # ------------------------------------------------------------------------
  [[database]]
    # Database engine is typically one of:
    # postgresql_psycopg2, mysql, or sqlite3
    #
    # Note that for sqlite3, 'name', below is a filename;
    # for other backends, it is the database name.
    ## engine=sqlite3
    ## host=
    ## port=
    ## user=
    ## password=
    ## name=desktop/desktop.db

  # Configuration options for specifying the Desktop session.
  # For more info, see https://docs.djangoproject.com/en/1.4/topics/http/sessions/
  # ------------------------------------------------------------------------
  [[session]]
    # The cookie containing the users' session ID will expire after this amount of time in seconds.
    ## ttl=60*60*24*14

    # The cookie containing the users' session ID will be secure.
    # Should only be enabled with HTTPS.
    ## secure=false

  # Configuration options for connecting to an external SMTP server
  # ------------------------------------------------------------------------
  [[smtp]]

    # The SMTP server information for email notification delivery
    host=localhost
    port=25
    user=
    password=

    # Whether to use a TLS (secure) connection when talking to the SMTP server
    tls=no

    # Default email address to use for various automated notification from Hue
    ## default_from_email=hue@localhost


  # Configuration options for Kerberos integration for secured Hadoop clusters
  # ------------------------------------------------------------------------
  [[kerberos]]

    # Path to Hue's Kerberos keytab file
    ## hue_keytab=
    # Kerberos principal name for Hue
    ## hue_principal=hue/hostname.foo.com
    # Path to kinit
    ## kinit_path=/path/to/kinit


  # Configuration options for using OAuthBackend login
  # ------------------------------------------------------------------------
  [[oauth]]
    # The Consumer key of the application
    ## consumer_key=XXXXXXXXXXXXXXXXXXXXX

    # The Consumer secret of the application
    ## consumer_secret=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX

    # The Request token URL
    ## request_token_url=https://api.twitter.com/oauth/request_token

    # The Access token URL
    ## access_token_url=https://api.twitter.com/oauth/access_token

    # The Authorize URL
    ## authenticate_url=https://api.twitter.com/oauth/authorize


###########################################################################
# Settings to configure SAML
###########################################################################

[libsaml]
  # Xmlsec1 binary path. This program should be executable by the user running Hue.
  ## xmlsec_binary=/usr/local/bin/xmlsec1

  # Create users from SSO on login.
  ## create_users_on_login=true

  # Required attributes to ask for from IdP.
  # This requires a comma separated list.
  ## required_attributes=uid

  # Optional attributes to ask for from IdP.
  # This requires a comma separated list.
  ## optional_attributes=

  # IdP metadata in the form of a file. This is generally an XML file containing metadata that the Identity Provider generates.
  ## metadata_file=

  # Private key to encrypt metadata with.
  ## key_file=

  # Signed certificate to send along with encrypted metadata.
  ## cert_file=

  # A mapping from attributes in the response from the IdP to django user attributes.
  ## user_attribute_mapping={'uid':'username'}

  # Have Hue initiated authn requests be signed and provide a certificate.
  ## authn_requests_signed=false

  # Have Hue initiated logout requests be signed and provide a certificate.
  ## logout_requests_signed=false

###########################################################################
# Settings to configure your Hadoop cluster.
###########################################################################

[hadoop]

  # Configuration for HDFS NameNode
  # ------------------------------------------------------------------------
  [[hdfs_clusters]]
    # HA support by using HttpFs

    [[[default]]]
      # Enter the filesystem uri
      fs_defaultfs=hdfs://localhost:9000

      # NameNode logical name.
      ## logical_name=

      # Use WebHdfs/HttpFs as the communication mechanism.
      # This should be the web service root URL, such as
      # http://namenode:50070/webhdfs/v1
      webhdfs_url=http://localhost:50070/webhdfs/v1

      # Change this if your HDFS cluster is Kerberos-secured
      ## security_enabled=false

      # Settings about this HDFS cluster. If you install HDFS in a
      # different location, you need to set the following.

      # Defaults to $HADOOP_HDFS_HOME or /usr/lib/hadoop-hdfs
      hadoop_hdfs_home=/usr/local/hadoop-2.2.0

      # Defaults to $HADOOP_BIN or /usr/bin/hadoop
      hadoop_bin=/usr/local/hadoop-2.2.0/bin/hadoop

      # Defaults to $HADOOP_CONF_DIR or /etc/hadoop/conf
      hadoop_conf_dir=/usr/local/hadoop-2.2.0/etc/hadoop

  # Configuration for YARN (MR2)
  # ------------------------------------------------------------------------
  [[yarn_clusters]]

    [[[default]]]
      # Enter the host on which you are running the ResourceManager
      resourcemanager_host=localhost

      # The port where the ResourceManager IPC listens on
      resourcemanager_port=8032

      # Whether to submit jobs to this cluster
      submit_to=True

      # Change this if your YARN cluster is Kerberos-secured
      ## security_enabled=false

      # Settings about this MR2 cluster. If you install MR2 in a
      # different location, you need to set the following.

      # Defaults to $HADOOP_MR2_HOME or /usr/lib/hadoop-mapreduce
      hadoop_mapred_home=/usr/local/hadoop-2.2.0

      # Defaults to $HADOOP_BIN or /usr/bin/hadoop
      hadoop_bin=/usr/local/hadoop-2.2.0/bin/hadoop

      # Defaults to $HADOOP_CONF_DIR or /etc/hadoop/conf
      hadoop_conf_dir=/usr/local/hadoop-2.2.0/etc/hadoop

      # URL of the ResourceManager API
      resourcemanager_api_url=http://localhost:8088

      # URL of the ProxyServer API
      proxy_api_url=http://localhost:8088

      # URL of the HistoryServer API
      history_server_api_url=http://localhost:19888

  # Configuration for MapReduce (MR1)
  # ------------------------------------------------------------------------
  #[[mapred_clusters]]

  #  [[[default]]]
      # Enter the host on which you are running the Hadoop JobTracker
      #jobtracker_host=localhost

      # The port where the JobTracker IPC listens on
      #jobtracker_port=54311

      # JobTracker logical name.
      ## logical_name=

      # Thrift plug-in port for the JobTracker
      ## thrift_port=9290

      # Whether to submit jobs to this cluster
      #submit_to=True

      # Change this if your MapReduce cluster is Kerberos-secured
      ## security_enabled=false

      # Settings about this MR1 cluster. If you install MR1 in a
      # different location, you need to set the following.

      # Defaults to $HADOOP_MR1_HOME or /usr/lib/hadoop-0.20-mapreduce
      #hadoop_mapred_home=/usr/local/hadoop

      # Defaults to $HADOOP_BIN or /usr/bin/hadoop
      #hadoop_bin=/usr/local/hadoop/bin/hadoop

      # Defaults to $HADOOP_CONF_DIR or /etc/hadoop/conf
      #hadoop_conf_dir=/usr/local/hadoop/conf

    # HA support by specifying multiple clusters
    # e.g.

    # [[[ha]]]
      # Enter the host on which you are running the failover JobTracker
      # jobtracker_host=localhost-ha


###########################################################################
# Settings to configure liboozie
###########################################################################

#[liboozie]
  # The URL where the Oozie service runs on. This is required in order for
  # users to submit jobs.
  ## oozie_url=http://localhost:11000/oozie

  # Requires FQDN in oozie_url if enabled
  ## security_enabled=false

  # Location on HDFS where the workflows/coordinator are deployed when submitted.
  ## remote_deployement_dir=/user/hue/oozie/deployments


###########################################################################
# Settings to configure the Oozie app
###########################################################################

#[oozie]
  # Location on local FS where the examples are stored.
  ## local_data_dir=..../examples

  # Location on local FS where the data for the examples is stored.
  ## sample_data_dir=...thirdparty/sample_data

  # Location on HDFS where the oozie examples and workflows are stored.
  ## remote_data_dir=/user/hue/oozie/workspaces

  # Maximum number of Oozie workflows or coordinators to retrieve in one API call.
  ## oozie_jobs_count=100


###########################################################################
# Settings to configure Beeswax with Hive
###########################################################################

[beeswax]

  # Host where Hive server Thrift daemon is running.
  # If Kerberos security is enabled, use fully-qualified domain name (FQDN).
  hive_server_host=localhost

  # Port where HiveServer2 Thrift server runs on.
  hive_server_port=10000

  # Hive configuration directory, where hive-site.xml is located
  hive_conf_dir=/usr/local/hive-0.12.0/conf

  # Timeout in seconds for thrift calls to Hive service
  ## server_conn_timeout=120

  # Path to HiveServer2 start script
  hive_server_bin=/usr/local/hive-0.12.0/bin/hiveserver2

  # Set a LIMIT clause when browsing a partitioned table.
  # A positive value will be set as the LIMIT. If 0 or negative, do not set any limit.
  ## browse_partitioned_table_limit=250


###########################################################################
# Settings to configure Pig
###########################################################################

#[pig]
  # Location of piggybank.jar on local filesystem.
  ## local_sample_dir=/usr/share/hue/apps/pig/examples

  # Location piggybank.jar will be copied to in HDFS.
  ## remote_data_dir=/user/hue/pig/examples


###########################################################################
# Settings to configure Sqoop
###########################################################################

#[sqoop]
  # Sqoop server URL
  ## server_url=http://localhost:12000/sqoop


###########################################################################
# Settings to configure Proxy
###########################################################################

[proxy]
  # Comma-separated list of regular expressions,
  # which match 'host:port' of requested proxy target.
  ## whitelist=(localhost|127\.0\.0\.1):(50030|50070|50060|50075)

  # Comma-separated list of regular expressions,
  # which match any prefix of 'host:port/path' of requested proxy target.
  # This does not support matching GET parameters.
  ## blacklist=()


###########################################################################
# Settings to configure Impala
###########################################################################

#[impala]
  # Host of the Impala Server (one of the Impalad)
  ## server_host=localhost

  # Port of the Impala Server
  ## server_port=21050

  # Kerberos principal
  ## impala_principal=impala/hostname.foo.com

  # Turn on/off impersonation mechanism when talking to Impala
  ## impersonation_enabled=False


###########################################################################
# Settings to configure Hbase
###########################################################################

#[hbase]
  # Comma-separated list of HBase Thrift servers for
  # clusters in the format of '(name|host:port)'.
  ## hbase_clusters=(Cluster|localhost:9090)

  # Hard limit of rows or columns per row fetched before truncating.
  ## truncate_limit = 500


###########################################################################
# Settings to configure Solr Search
###########################################################################

[search]

  # URL of the Solr Server
  ## solr_url=http://localhost:8983/solr/

  # Requires FQDN in solr_url if enabled
  ## security_enabled=false

  ## Query sent when no term is entered
  ## empty_query=*:*


###########################################################################
# Settings to configure Job Designer
###########################################################################

[jobsub]

  # Location on local FS where examples and template are stored.
  ## local_data_dir=..../data

  # Location on local FS where sample data is stored
  ## sample_data_dir=...thirdparty/sample_data


###########################################################################
# Settings to configure Job Browser
###########################################################################

[jobbrowser]
  # Share submitted jobs information with all users. If set to false,
  # submitted jobs are visible only to the owner and administrators.
  ## share_jobs=true


###########################################################################
# Settings to configure the Zookeeper application.
###########################################################################

[zookeeper]

  [[clusters]]

    [[[default]]]
      # Zookeeper ensemble. Comma separated list of Host/Port.
      # e.g. localhost:2181,localhost:2182,localhost:2183
      ## host_ports=localhost:2181

      # The URL of the REST contrib service (required for znode browsing)
      ## rest_url=http://localhost:9998


###########################################################################
# Settings for the User Admin application
###########################################################################

[useradmin]
  # The name of the default user group that users will be a member of
  ## default_user_group=default
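
With the configuration in place, everything can be brought up in order: HDFS, YARN, the MR2 history server, HiveServer2, and finally Hue. The commands below are a sketch assuming the tarball layouts used throughout this post; <hue_root> stands for wherever you built Hue:

$ /usr/local/hadoop-2.2.0/sbin/start-dfs.sh
$ /usr/local/hadoop-2.2.0/sbin/start-yarn.sh
$ /usr/local/hadoop-2.2.0/sbin/mr-jobhistory-daemon.sh start historyserver
$ /usr/local/hive-0.12.0/bin/hiveserver2 &
$ <hue_root>/build/env/bin/supervisor

Hue should then be reachable at http://localhost:8000. HiveServer2 can also be tested outside of Hue with beeline:

$ /usr/local/hive-0.12.0/bin/beeline -u jdbc:hive2://localhost:10000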



Permissions

  • The HDFS directory identified by hadoop.tmp.dir has to be writable by everyone (see the commands below).
  • The embedded metastore directory created by Hive must also be writable by everyone.
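
For example, with the paths used in this post (the metastore_db location depends on the directory HiveServer2 was started from, so treat it as an assumption):

# local hadoop.tmp.dir
$ chmod -R a+w /usr/local/hadoop-2.2.0/tmp
# scratch space used by Hive queries on HDFS
$ /usr/local/hadoop-2.2.0/bin/hadoop fs -chmod -R 1777 /tmp
# embedded Derby metastore
$ chmod -R a+w /usr/local/hive-0.12.0/metastore_db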
Todo
     Coming soon: user authentication configuration in Hue.
