Monday, September 5, 2016

Apache Nutch




ubuntu@node2:~$ docker exec -it hbase bash
root@45883500b170:/# 
root@45883500b170:/# 
root@45883500b170:/# 
root@45883500b170:/# useradd nutch -m -s /bin/bash
root@45883500b170:/# passwd nutch
Enter new UNIX password: 
Retype new UNIX password: 
passwd: password updated successfully
root@45883500b170:/# 
root@45883500b170:/# 
root@45883500b170:/# 
root@45883500b170:/# exit
exit
ubuntu@node2:~$ docker exec -it --user nutch hbase bash
nutch@45883500b170:/$ 
nutch@45883500b170:/$ 
nutch@45883500b170:/$ pwd          
/
nutch@45883500b170:/$ cd
nutch@45883500b170:~$ pwd
/home/nutch
nutch@45883500b170:~$ tar xzvf /software/apache-nutch-2.3.1-src.tar.gz 
apache-nutch-2.3.1/conf/
apache-nutch-2.3.1/docs/
apache-nutch-2.3.1/docs/api/
apache-nutch-2.3.1/docs/api/org/
apache-nutch-2.3.1/docs/api/org/apache/
apache-nutch-2.3.1/docs/api/org/apache/nutch/
apache-nutch-2.3.1/docs/api/org/apache/nutch/analysis/
apache-nutch-2.3.1/docs/api/org/apache/nutch/analysis/lang/
apache-nutch-2.3.1/docs/api/org/apache/nutch/analysis/lang/class-use/
apache-nutch-2.3.1/docs/api/org/apache/nutch/api/
apache-nutch-2.3.1/docs/api/org/apache/nutch/api/class-use/
apache-nutch-2.3.1/docs/api/org/apache/nutch/api/impl/
apache-nutch-2.3.1/docs/api/org/apache/nutch/api/impl/class-use/
apache-nutch-2.3.1/docs/api/org/apache/nutch/api/impl/db/
apache-nutch-2.3.1/docs/api/org/apache/nutch/api/impl/db/class-use/
apache-nutch-2.3.1/docs/api/org/apache/nutch/api/misc/
apache-nutch-2.3.1/docs/api/org/apache/nutch/api/misc/class-use/
apache-nutch-2.3.1/docs/api/org/apache/nutch/api/model/
apache-nutch-2.3.1/docs/api/org/apache/nutch/api/model/request/
apache-nutch-2.3.1/docs/api/org/apache/nutch/api/model/request/class-use/
apache-nutch-2.3.1/docs/api/org/apache/nutch/api/model/response/
apache-nutch-2.3.1/docs/api/org/apache/nutch/api/model/response/class-use/
apache-nutch-2.3.1/docs/api/org/apache/nutch/api/resources/
...

Enable the Gora HBase backend in $NUTCH_HOME/ivy/ivy.xml:

    <dependency org="org.apache.gora" name="gora-hbase" rev="0.6.1" conf="*->default" />
    <dependency org="org.apache.hbase" name="hbase-common" rev="0.98.8-hadoop2" conf="*->default" />


Set HBase as the default Gora datastore in $NUTCH_HOME/conf/gora.properties:

############################
# HBaseStore properties    #
############################
gora.datastore.default=org.apache.gora.hbase.store.HBaseStore
gora.datastore.autocreateschema=true
gora.datastore.scanner.caching=1000
hbase.client.autoflush.default=false


nutch@45883500b170:~/apache-nutch-2.3.1$ ant clean
Buildfile: /home/nutch/apache-nutch-2.3.1/build.xml
Trying to override old definition of task javac
  [taskdef] Could not load definitions from resource org/sonar/ant/antlib.xml. It could not be found.

clean-build:
   [delete] Deleting directory /home/nutch/apache-nutch-2.3.1/build

clean-lib:

clean-dist:

clean-runtime:

clean:

BUILD SUCCESSFUL
Total time: 0 seconds
nutch@45883500b170:~/apache-nutch-2.3.1$ ant runtime
Buildfile: /home/nutch/apache-nutch-2.3.1/build.xml
Trying to override old definition of task javac
  [taskdef] Could not load definitions from resource org/sonar/ant/antlib.xml. It could not be found.

ivy-probe-antlib:

ivy-download:
  [taskdef] Could not load definitions from resource org/sonar/ant/antlib.xml. It could not be found.

ivy-download-unchecked:

ivy-init-antlib:

ivy-init:

init:
    [mkdir] Created dir: /home/nutch/apache-nutch-2.3.1/build
    [mkdir] Created dir: /home/nutch/apache-nutch-2.3.1/build/classes
    [mkdir] Created dir: /home/nutch/apache-nutch-2.3.1/build/release
    [mkdir] Created dir: /home/nutch/apache-nutch-2.3.1/build/test
    [mkdir] Created dir: /home/nutch/apache-nutch-2.3.1/build/test/classes

clean-lib:

resolve-default:
[ivy:resolve] :: Apache Ivy 2.4.0 - 20141213170938 :: http://ant.apache.org/ivy/ ::
[ivy:resolve] :: loading settings :: file = /home/nutch/apache-nutch-2.3.1/ivy/ivysettings.xml
[ivy:resolve] downloading http://repo1.maven.org/maven2/org/apache/solr/solr-solrj/4.6.0/solr-solrj-4.6.0.jar ...
[ivy:resolve] ...........
[ivy:resolve] .............................
[ivy:resolve] . (393kB)
[ivy:resolve] .. (0kB)
[ivy:resolve]  [SUCCESSFUL ] org.apache.solr#solr-solrj;4.6.0!solr-solrj.jar (4382ms)
[ivy:resolve] downloading http://repo1.maven.org/maven2/org/apache/hadoop/hadoop-common/2.5.2/hadoop-common-2.5.2.jar ...
[ivy:resolve] .................
[ivy:resolve] ...............................
[ivy:resolve] ................
[ivy:resolve] .......................
[ivy:resolve] ........................
[ivy:resolve] ........................
[ivy:resolve] .........................
[ivy:resolve] .......................
[ivy:resolve] ............................
[ivy:resolve] ......................
[ivy:resolve] ............................
[ivy:resolve] ......................
[ivy:resolve] ............ (2894kB)
[ivy:resolve] .. (0kB)
[ivy:resolve]  [SUCCESSFUL ] org.apache.hadoop#hadoop-common;2.5.2!hadoop-common.jar (21544ms)
[ivy:resolve] downloading http://repo1.maven.org/maven2/org/apache/hadoop/hadoop-hdfs/2.5.2/hadoop-hdfs-2.5.2.jar ...
[ivy:resolve] ...................................
[ivy:resolve] ...................................
[ivy:resolve] ......................................
[ivy:resolve] .......................................
[ivy:resolve] ....................................
[ivy:resolve] .......................................
[ivy:resolve] ..........................................
[ivy:resolve] .......................................
[ivy:resolve] ..................................
[ivy:resolve] ........................................
[ivy:resolve] ..................................
[ivy:resolve] .........................................
[ivy:resolve] .............................................
[ivy:resolve] ...................................
[ivy:resolve] ......................
[ivy:resolve] .........................................
[ivy:resolve] ..........................................
[ivy:resolve] .............................................
[ivy:resolve] ........................................
[ivy:resolve] .....................................
[ivy:resolve] ............. (6928kB)
[ivy:resolve] .. (0kB)
[ivy:resolve]  [SUCCESSFUL ] org.apache.hadoop#hadoop-hdfs;2.5.2!hadoop-hdfs.jar (33894ms)
[ivy:resolve] downloading http://repo1.maven.org/maven2/org/apache/hadoop/hadoop-mapreduce-client-core/2.5.2/hadoop-mapreduce-client-core-2.5.2.jar ...
[ivy:resolve] ....................
[ivy:resolve] .......................
[ivy:resolve] .........................
[ivy:resolve] ..............................
[ivy:resolve] ...............
[ivy:resolve] ...................
[ivy:resolve] ................. (1463kB)
[ivy:resolve] .. (0kB)
[ivy:resolve]  [SUCCESSFUL ] org.apache.hadoop#hadoop-mapreduce-client-core;2.5.2!hadoop-mapreduce-client-core.jar (12531ms)
[ivy:resolve] downloading http://repo1.maven.org/maven2/org/apache/hadoop/hadoop-mapreduce-client-jobclient/2.5.2/hadoop-mapreduce-client-jobclient-2.5.2.jar ...
[ivy:resolve] .. (34kB)
[ivy:resolve] .. (0kB)
[ivy:resolve]  [SUCCESSFUL ] org.apache.hadoop#hadoop-mapreduce-client-jobclient;2.5.2!hadoop-mapreduce-client-jobclient.jar (1075ms)
[ivy:resolve] downloading http://maven.restlet.org/org/restlet/jse/org.restlet/2.2.3/org.restlet-2.2.3.jar ...
[ivy:resolve] ......................
[ivy:resolve] ..........................
[ivy:resolve] ......................... (670kB)
[ivy:resolve] .. (0kB)
[ivy:resolve]  [SUCCESSFUL ] org.restlet.jse#org.restlet;2.2.3!org.restlet.jar (7877ms)
[ivy:resolve] downloading http://maven.restlet.org/org/restlet/jse/org.restlet.ext.jackson/2.2.3/org.restlet.ext.jackson-2.2.3.jar ...
[ivy:resolve] ... (7kB)
[ivy:resolve] .. (0kB)
[ivy:resolve]  [SUCCESSFUL ] org.restlet.jse#org.restlet.ext.jackson;2.2.3!org.restlet.ext.jackson.jar (2971ms)
[ivy:resolve] downloading http://maven.restlet.org/org/restlet/jse/org.restlet.ext.jaxrs/2.2.3/org.restlet.ext.jaxrs-2.2.3.jar ...
[ivy:resolve] ...................
[ivy:resolve] ............ (305kB)
[ivy:resolve] .. (0kB)
[ivy:resolve]  [SUCCESSFUL ] org.restlet.jse#org.restlet.ext.jaxrs;2.2.3!org.restlet.ext.jaxrs.jar (5760ms)
[ivy:resolve] downloading http://repo1.maven.org/maven2/junit/junit/4.11/junit-4.11.jar ...
[ivy:resolve] ....................... (239kB)
[ivy:resolve] .. (0kB)
[ivy:resolve]  [SUCCESSFUL ] junit#junit;4.11!junit.jar (718ms)
[ivy:resolve] downloading http://repo1.maven.org/maven2/org/hsqldb/hsqldb/2.2.8/hsqldb-2.2.8.jar ...
[ivy:resolve] ............................
[ivy:resolve] .........................
[ivy:resolve] ....................


Configure Nutch

$NUTCH_HOME/runtime/local/conf/nutch-site.xml :

<configuration>
  <property>
    <name>http.agent.name</name>
    <value>Nutty Spider</value>
  </property>
  <property>
    <name>storage.data.store.class</name>
    <value>org.apache.gora.hbase.store.HBaseStore</value>
    <description>Default class for storing data</description>
  </property>
  <property>
    <name>plugin.includes</name>
    <value>protocol-httpclient|urlfilter-regex|parse-(text|tika|js)|index-(basic|anchor)|query-(basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)|indexer-elastic</value>
  </property>
  <property>
    <name>db.ignore.external.links</name>
    <value>true</value>
  </property>
  <property>
    <name>elastic.host</name>
    <value>10.0.2.41</value>
  </property>
  <property>
    <name>elastic.port</name>
    <value>9300</value>
  </property>
  <property>
    <name>elastic.cluster</name>
    <value>elasticsearch</value>
  </property>
  <property>
    <name>elastic.index</name>
    <value>nutchindex</value>
  </property>
  <property>
    <name>parser.character.encoding.default</name>
    <value>utf-8</value>
  </property>
  <property>
    <name>http.content.limit</name>
    <value>6553600</value>
  </property>
  <property>
    <name>elastic.max.bulk.docs</name>
    <value>250</value>
    <description>Maximum size of the bulk in number of documents.</description>
  </property>
  <property>
    <name>elastic.max.bulk.size</name>
    <value>2500500</value>
    <description>Maximum size of the bulk in bytes.</description>
  </property>
</configuration>


Simple test


nutch@45883500b170:~/apache-nutch-2.3.1/runtime/local/bin$ ./nutch inject ~/nutch/testseed 
InjectorJob: starting at 2016-08-30 09:48:49
InjectorJob: Injecting urlDir: /home/nutch/nutch/testseed
InjectorJob: Using class org.apache.gora.hbase.store.HBaseStore as the Gora storage class.
InjectorJob: total number of urls rejected by filters: 0
InjectorJob: total number of urls injected after normalization and filtering: 0
Injector: finished at 2016-08-30 09:48:58, elapsed: 00:00:08


Crawling the web and indexing by Elasticsearch


Elasticsearch ports:
9300 - native Java transport port (the value configured as elastic.port in nutch-site.xml)
9200 - RESTful HTTP API

nutch@45883500b170:~$ cat seed/urls.txt 
https://en.wikipedia.org
nutch@45883500b170:~$ nutch inject seed/urls.txt 
InjectorJob: starting at 2016-08-30 10:24:37
InjectorJob: Injecting urlDir: seed/urls.txt
InjectorJob: Using class org.apache.gora.hbase.store.HBaseStore as the Gora storage class.
InjectorJob: total number of urls rejected by filters: 0
InjectorJob: total number of urls injected after normalization and filtering: 1
Injector: finished at 2016-08-30 10:24:41, elapsed: 00:00:03
nutch@45883500b170:~$ nutch generate -topN 40
GeneratorJob: starting at 2016-08-30 10:25:02
GeneratorJob: Selecting best-scoring urls due for fetch.
GeneratorJob: starting
GeneratorJob: filtering: true
GeneratorJob: normalizing: true
GeneratorJob: topN: 40
GeneratorJob: finished at 2016-08-30 10:25:07, time elapsed: 00:00:04
GeneratorJob: generated batch id: 1472552702-144817008 containing 1 URLs
nutch@45883500b170:~$ nutch fetch -all
FetcherJob: starting at 2016-08-30 10:25:16
FetcherJob: fetching all
FetcherJob: threads: 10
FetcherJob: parsing: false
FetcherJob: resuming: false
FetcherJob : timelimit set for : -1
Using queue mode : byHost
Fetcher: threads: 10
QueueFeeder finished: total 1 records. Hit by time limit :0
Fetcher: throughput threshold: -1
Fetcher: throughput threshold sequence: 5
fetching https://en.wikipedia.org/ (queue crawl delay=5000ms)
-finishing thread FetcherThread2, activeThreads=8
-finishing thread FetcherThread6, activeThreads=6
-finishing thread FetcherThread5, activeThreads=5
-finishing thread FetcherThread7, activeThreads=7
-finishing thread FetcherThread8, activeThreads=7
-finishing thread FetcherThread4, activeThreads=4
-finishing thread FetcherThread3, activeThreads=3
-finishing thread FetcherThread1, activeThreads=2
-finishing thread FetcherThread9, activeThreads=1
-finishing thread FetcherThread0, activeThreads=0
0/0 spinwaiting/active, 1 pages, 0 errors, 0.2 0 pages/s, 0 0 kb/s, 0 URLs in 0 queues
-activeThreads=0
Using queue mode : byHost
Fetcher: threads: 10
QueueFeeder finished: total 0 records. Hit by time limit :0
Fetcher: throughput threshold: -1
Fetcher: throughput threshold sequence: 5
-finishing thread FetcherThread9, activeThreads=9
-finishing thread FetcherThread1, activeThreads=8
-finishing thread FetcherThread2, activeThreads=7
-finishing thread FetcherThread0, activeThreads=1
-finishing thread FetcherThread7, activeThreads=2
-finishing thread FetcherThread6, activeThreads=3
-finishing thread FetcherThread5, activeThreads=4
-finishing thread FetcherThread4, activeThreads=5
-finishing thread FetcherThread3, activeThreads=6
-finishing thread FetcherThread8, activeThreads=0
0/0 spinwaiting/active, 0 pages, 0 errors, 0.0 0 pages/s, 0 0 kb/s, 0 URLs in 0 queues
-activeThreads=0
FetcherJob: finished at 2016-08-30 10:25:32, time elapsed: 00:00:16
nutch@45883500b170:~$ 








No comments:

Post a Comment