Setting Up a Nutch 2.3.1 Source Development Environment



Downloading the Source

Download the source from http://nutch.apache.org/downloads.html, unpack it to get the directory apache-nutch-2.3.1, and change into that directory. A possible command sequence is shown below.
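For example, on Linux the release can be fetched from the Apache archive; the exact mirror path below is an assumption, so substitute whatever URL the downloads page gives you:

    # URL assumed from the usual Apache archive layout; verify on the downloads page
    wget https://archive.apache.org/dist/nutch/2.3.1/apache-nutch-2.3.1-src.tar.gz
    tar -zxvf apache-nutch-2.3.1-src.tar.gz
    cd apache-nutch-2.3.1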

Modifying the Configuration Files

Edit conf/nutch-site.xml:

    <!-- Put site-specific property overrides in this file. -->
    <configuration>
      <!-- plugin.folders points at the source tree so plugins resolve when
           running in development mode inside the IDE. Comment this out or
           remove it when deploying the built runtime (just for development,
           not for production). -->
      <property>
        <name>plugin.folders</name>
        <value>./src/plugin</value>
      </property>
      <!-- Gora storage backend for the crawl data. The official docs
           recommend HBase; this project defaults to MongoDB. The related
           settings in gora.properties must be configured to match. -->
      <property>
        <name>storage.data.store.class</name>
        <value>org.apache.gora.mongodb.store.MongoStore</value>
        <description>Default class for storing data</description>
      </property>
      <property>
        <name>http.agent.name</name>
        <value>Your Nutch Spider</value>
      </property>
    </configuration>
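When you later run from the built runtime instead of the IDE, drop the override; if I recall the stock nutch-default.xml correctly, plugins are then loaded from the plugins directory of the deployed runtime, i.e. the default is roughly:

    <property>
      <name>plugin.folders</name>
      <value>plugins</value>
    </property>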

Edit ivy/ivy.xml and uncomment the MongoDB dependency:

    <!-- Uncomment this to use MongoDB as Gora backend. -->
    <dependency org="org.apache.gora" name="gora-mongodb" rev="0.6.1"
                conf="*->default" />

Edit conf/gora.properties to configure MongoDB:

    ############################
    # MongoDBStore properties  #
    ############################
    gora.datastore.default=org.apache.gora.mongodb.store.MongoStore
    gora.mongodb.override_hadoop_configuration=false
    gora.mongodb.mapping.file=/gora-mongodb-mapping.xml
    gora.mongodb.servers=localhost:27017
    gora.mongodb.db=nutchFocuse
    #gora.mongodb.login=login
    #gora.mongodb.secret=secret
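To confirm the store is reachable before running a crawl, a quick connectivity check can help. This is a minimal sketch, assuming the legacy mongo-java-driver (the 2.x API that gora-mongodb 0.6.1 pulls in via ivy) is on the classpath and that the host, port, and database name match gora.properties above; the class name MongoCheck is just for illustration:

    import com.mongodb.DB;
    import com.mongodb.MongoClient;

    public class MongoCheck {
        public static void main(String[] args) throws Exception {
            // Same server as gora.mongodb.servers
            MongoClient client = new MongoClient("localhost", 27017);
            // Same database as gora.mongodb.db
            DB db = client.getDB("nutchFocuse");
            System.out.println("Connected; collections: " + db.getCollectionNames());
            client.close();
        }
    }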

Building the Project and Importing into IntelliJ IDEA

In the project directory run ant clean, ant, and ant eclipse in turn (the full sequence is shown below). When they finish, open IntelliJ IDEA and choose Import Project -> select the apache-nutch-2.3.1 directory -> Import project from external model (pick Eclipse), then click Next through the remaining screens.
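Run these from the apache-nutch-2.3.1 root; the ant eclipse target should generate the Eclipse project metadata (.project and .classpath) that IDEA's Eclipse importer reads:

    cd apache-nutch-2.3.1
    ant clean
    ant
    ant eclipse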

Adjusting the Dependency Order

In Project Structure -> Modules -> Dependencies, make sure the first three entries are, in order: conf, Module source, and 1.8 (the JDK), so that the configuration files under conf are picked up ahead of everything else on the classpath.

Running a Test

Create a folder named urls under the project directory and, inside it, a file seed.txt that holds the seed URLs; a sample seed file is shown below. With that in place the project setup is complete (the original post showed a screenshot of the resulting directory layout here).
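A seed file is just plain text with one URL per line; the entry below is only an example:

    http://nutch.apache.org/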

To make debugging easier, I "translated" the bin/crawl script more or less line by line into a Java class, Crawl:

    /**
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */
    package org.apache.nutch.crawl;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;
    import org.apache.nutch.fetcher.FetcherJob;
    import org.apache.nutch.indexer.IndexingJob;
    import org.apache.nutch.indexer.solr.SolrDeleteDuplicates;
    import org.apache.nutch.metadata.Nutch;
    import org.apache.nutch.parse.ParserJob;
    import org.apache.nutch.util.NutchConfiguration;
    import org.apache.nutch.util.StringUtil;
    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;

    import java.util.Random;

    public class Crawl extends Configured implements Tool {

        public static final Logger LOG = LoggerFactory.getLogger(Crawl.class);

        /*
         * Perform complete crawling and indexing (to Solr) given a set of
         * root urls and the -solr parameter respectively. More information
         * and usage parameters can be found below.
         */
        public static void main(String args[]) throws Exception {
            Configuration conf = NutchConfiguration.create();
            String[] parameter = new String[3];
            parameter[0] = "urls";          // seed directory
            parameter[1] = "testcrawlid";   // crawl id
            // parameter[2] = "http://localhost:8080/solr";
            // parameter[3] = "1";
            parameter[2] = "1";             // number of rounds
            int res = ToolRunner.run(conf, new Crawl(), parameter);
            System.exit(res);
        }

        @Override
        public int run(String[] args) throws Exception {
            if (args.length < 3) {
                System.out.println("Usage: crawl <seedDir> <crawlID> [<solrUrl>] <numberOfRounds>");
                return -1;
            }
            String seedDir = args[0];
            String crawlId = args[1];
            String limit = "", solrUrl = "";
            if (args.length == 3) {
                limit = args[2];
            } else if (args.length == 4) {
                solrUrl = args[2];
                limit = args[3];
            } else {
                System.out.println("Parameter count mismatch; check the input arguments.");
            }
            if (StringUtil.isEmpty(seedDir)) {
                System.out.println("Missing seedDir : crawl <seedDir> <crawlID> [<solrURL>] <numberOfRounds>");
            }
            if (StringUtil.isEmpty(crawlId)) {
                System.out.println("Missing crawlID : crawl <seedDir> <crawlID> [<solrURL>] <numberOfRounds>");
            }
            if (StringUtil.isEmpty(solrUrl)) {
                System.out.println("No SOLRURL specified. Skipping indexing.");
            }
            if (StringUtil.isEmpty(limit)) {
                System.out.println("Missing numberOfRounds : crawl <seedDir> <crawlID> [<solrURL>] <numberOfRounds>");
            }

            // MODIFY THE PARAMETERS BELOW TO YOUR NEEDS

            // set the number of slave nodes
            int numSlaves = 1;
            // and the total number of available tasks;
            // sets the Hadoop parameter "mapred.reduce.tasks"
            int numTasks = numSlaves << 1;
            // number of urls to fetch in one iteration (250K per task?)
            // int sizeFetchlist = numSlaves * 5;
            int sizeFetchlist = 10;
            // time limit for fetching
            String timeLimitFetch = "180";
            // Adds <days> to the current time to facilitate crawling urls
            // already fetched sooner than db.default.fetch.interval.
            int addDays = 0;

            getConf().set("mapred.reduce.tasks", String.valueOf(numTasks));
            getConf().set("mapred.child.java.opts", "-Xmx1000m");
            getConf().set("mapred.reduce.tasks.speculative.execution", "false");
            getConf().set("mapred.map.tasks.speculative.execution", "false");
            getConf().set("mapred.compress.map.output", "true");

            InjectorJob injector = new InjectorJob(getConf());
            GeneratorJob generator = new GeneratorJob(getConf());
            FetcherJob fetcher = new FetcherJob(getConf());
            ParserJob parse = new ParserJob(getConf());
            DbUpdaterJob dbUpdaterJob = new DbUpdaterJob(getConf());
            IndexingJob indexingJob = new IndexingJob();
            SolrDeleteDuplicates solrDeleteDuplicates = new SolrDeleteDuplicates();

            // initialize crawlDb
            getConf().set(Nutch.CRAWL_ID_KEY, crawlId);
            int res;
            String[] injectParameter = new String[3];
            injectParameter[0] = seedDir;
            injectParameter[1] = "-crawlId";
            injectParameter[2] = crawlId;
            System.out.println("initial injection");
            res = ToolRunner.run(getConf(), injector, injectParameter);
            print(res, "inject");

            for (int i = 0; i < Integer.parseInt(limit); i++) {
                System.out.println("Begin Generate");
                String batchId = System.currentTimeMillis() + "-" + new Random().nextInt(32767);
                // generate a new batch
                String[] generateParameter = new String[10];
                generateParameter[0] = "-topN";
                generateParameter[1] = String.valueOf(sizeFetchlist);
                generateParameter[2] = "-noNorm";
                generateParameter[3] = "-noFilter";
                generateParameter[4] = "-adddays";
                generateParameter[5] = String.valueOf(addDays);
                generateParameter[6] = "-crawlId";
                generateParameter[7] = crawlId;
                generateParameter[8] = "-batchId";
                generateParameter[9] = batchId;
                res = ToolRunner.run(getConf(), generator, generateParameter);
                print(res, "generate");

                System.out.println("Begin Fetch");
                String[] fetchParameter = new String[5];
                fetchParameter[0] = batchId;
                fetchParameter[1] = "-crawlId";
                fetchParameter[2] = crawlId;
                fetchParameter[3] = "-threads";
                fetchParameter[4] = "10"; // number of fetcher threads
                getConf().set("fetcher.timelimit.mins", timeLimitFetch);
                res = ToolRunner.run(getConf(), fetcher, fetchParameter);
                print(res, "fetch");

                // If parsing during fetch is enabled in the configuration,
                // pages are already parsed by the fetch step and this
                // separate parse step need not be repeated.
                System.out.println("parse begin");
                String[] parseParameter = new String[3];
                parseParameter[0] = batchId;
                parseParameter[1] = "-crawlId";
                parseParameter[2] = crawlId;
                getConf().set("mapred.skip.attempts.to.start.skipping", "2");
                getConf().set("mapred.skip.map.max.skip.records", "1");
                res = ToolRunner.run(getConf(), parse, parseParameter);
                if (res == 0) {
                    System.out.println("parse finish");
                } else {
                    System.out.println("parse failed");
                }

                // update the db with this batch
                System.out.println("begin updatedb");
                String[] updatedbParameter = new String[3];
                updatedbParameter[0] = batchId;
                updatedbParameter[1] = "-crawlId";
                updatedbParameter[2] = crawlId;
                res = ToolRunner.run(getConf(), dbUpdaterJob, updatedbParameter);
                print(res, "updatedb");

                if (StringUtil.isEmpty(solrUrl)) {
                    System.out.println("Skipping indexing tasks: no SOLR url provided.");
                } else {
                    System.out.println("begin Indexing");
                    getConf().set("solr.server.url", solrUrl);
                    String[] indexingParameter = new String[3];
                    indexingParameter[0] = "-all";
                    indexingParameter[1] = "-crawlId";
                    indexingParameter[2] = crawlId;
                    res = ToolRunner.run(getConf(), indexingJob, indexingParameter);
                    print(res, "indexing");

                    System.out.println("begin SOLR dedup");
                    String[] solrdedupParameter = new String[1];
                    solrdedupParameter[0] = solrUrl;
                    res = ToolRunner.run(getConf(), solrDeleteDuplicates, solrdedupParameter);
                    print(res, "solr Delete Duplicates");
                }
            }
            return 0;
        }

        public static void print(int res, String name) {
            if (res == 0) {
                System.out.println(name + " finish");
            } else if (res == 1) {
                System.out.println(name + " finish but no more URLs to fetch now, escaping loop");
            } else {
                System.out.println(name + " failed");
            }
        }
    }

Start MongoDB first, then run the Crawl class directly; my configuration assumes MongoDB is running on the local machine. If you want to run each Nutch phase on its own (inject, generate, fetch, and so on), set it up as follows, taking inject as an example; the other phases are analogous. In IDEA, click Edit Configurations..., click the + in the top-left corner, choose Application, and fill in the main class and program arguments (the original post showed a screenshot of this dialog).
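As an alternative to a run configuration, a single phase can also be driven from a small main class. This is a minimal sketch mirroring how the Crawl class above invokes InjectorJob; the class name InjectRunner is my own, and the seed directory and crawl id are the values used earlier:

    package org.apache.nutch.crawl;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.util.ToolRunner;
    import org.apache.nutch.util.NutchConfiguration;

    public class InjectRunner {
        public static void main(String[] args) throws Exception {
            Configuration conf = NutchConfiguration.create();
            // Same arguments you would type into the IDEA run configuration:
            // <seedDir> -crawlId <id>
            String[] params = {"urls", "-crawlId", "testcrawlid"};
            System.exit(ToolRunner.run(conf, new InjectorJob(conf), params));
        }
    }

GeneratorJob, FetcherJob, and the other jobs can be driven the same way, passing the arguments that Crawl assembles for each phase.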
