Preface
We had a requirement to scrape weather news articles. The project itself is written in Java, and since the crawling needs were fairly small, we simply did the crawling in Java as well.
Java has a crawler framework called WebMagic, and that is what this hands-on exercise uses.
The page to collect is https://www.weather.com.cn/index/jqzdtqsj/index.shtml; opening it shows an article list, with pagination.
Configuring the Maven dependencies
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.10.3</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.10.3</version>
</dependency>
 
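Before writing the real processors, a quick smoke test confirms that the dependencies resolve and the target page is reachable. The sketch below is not part of the original project; the class name and the //title/text() selector are purely illustrative:

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

// Minimal sketch: fetch the list page once and print its <title>.
public class SmokeTest implements PageProcessor {

    @Override
    public void process(Page page) {
        // Any selector works here; the page title is enough to prove the fetch worked
        System.out.println(page.getHtml().xpath("//title/text()").get());
    }

    @Override
    public Site getSite() {
        // Same retry/sleep policy as the real spider below
        return Site.me().setRetryTimes(3).setSleepTime(100);
    }

    public static void main(String[] args) {
        Spider.create(new SmokeTest())
                .addUrl("https://www.weather.com.cn/index/jqzdtqsj/index.shtml")
                .run();
    }
}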
Crawler entry class
package com.sktk.weather.app.spider;

import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

import javax.annotation.Resource;

// Entry processor: reads the total page count from the first list page,
// then hands every list-page URL to the detail spider.
@Component
@Slf4j
public class WeatherArticleSpider implements PageProcessor {

    @Resource
    private WeatherArticleDetailSpider weatherArticleDetailSpider;

    @Override
    public void process(Page page) {
        // The pagination widget renders the total as a label like "共5页";
        // strip the first and last characters to keep just the number.
        String pageNum = page.getHtml().xpath("/html/body/div[3]/div[1]/div/div/span[3]/text()").get();
        pageNum = pageNum.substring(1);
        pageNum = pageNum.substring(0, pageNum.length() - 1);
        int pageNumInt = Integer.parseInt(pageNum);

        for (int i = 1; i <= pageNumInt; i++) {
            // Page 1 is index.shtml; page n (n > 1) is index_n.shtml
            String nextUrl = "https://www.weather.com.cn/index/jqzdtqsj/index.shtml";
            if (i != 1) {
                nextUrl = "https://www.weather.com.cn/index/jqzdtqsj/index_" + i + ".shtml";
            }
            weatherArticleDetailSpider.start(nextUrl);
        }
    }

    public void start() {
        Spider.create(this)
                .addUrl("https://www.weather.com.cn/index/jqzdtqsj/index.shtml")
                .run();
    }
}
 
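The two substring calls above assume the pagination label always wraps the digits in exactly one leading and one trailing character (a label like 共5页). A more defensive variant is sketched below; parsePageCount is a hypothetical helper for this class, not part of the original code:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Hypothetical helper: pull the page count out of a label such as "共5页".
// Falls back to a single page if the label is missing or contains no digits.
private int parsePageCount(String label) {
    if (label == null) {
        return 1;
    }
    Matcher m = Pattern.compile("\\d+").matcher(label);
    return m.find() ? Integer.parseInt(m.group()) : 1;
}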
List and detail crawler class
package com.sktk.weather.app.spider;

import com.sktk.weather.app.entity.po.Article;
import com.sktk.weather.app.service.IArticleService;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

import javax.annotation.Resource;
import java.util.List;

// Handles both list pages (title + intro, saved immediately)
// and detail pages (full fields, handed to the pipeline).
@Component
public class WeatherArticleDetailSpider implements PageProcessor {

    private Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    @Resource
    private WeatherArticlePipeline myPipeline;

    @Resource
    private IArticleService articleService;

    @Override
    public void process(Page page) {
        // Each <dl> on a list page is one article entry
        List<Selectable> articleList = page.getHtml().xpath("/html/body/div[3]/div[1]/dl").nodes();
        for (int i = 1; i <= articleList.size(); i++) {
            // XPath positions are 1-based
            String title = page.getHtml().xpath("/html/body/div[3]/div[1]/dl[" + i + "]/dd/h3/a/text()").toString();
            String intro = page.getHtml().xpath("/html/body/div[3]/div[1]/dl[" + i + "]/dd/p/text()").toString();

            Article article = new Article();
            article.setTitle(title);
            article.setIntro(intro);
            articleService.insertOne(article);
        }

        // Queue every detail-page link found on the current page
        page.addTargetRequests(page.getHtml().links()
                .regex("(https?://www\\.weather\\.com\\.cn/index/\\d{4}/\\d{2}/\\d+\\.shtml)")
                .all());

        // Detail-page fields; on a list page these XPaths match nothing
        String title = page.getHtml().xpath("/html/body/div[4]/div/div[1]/div[1]/div[1]/p/text()").toString();
        String content = page.getHtml().xpath("/html/body/div[4]/div/div[1]/div[1]/div[2]").toString();
        String publishTime = page.getHtml().xpath("/html/body/div[4]/div/div[1]/div[1]/div[1]/div[1]/div[1]/span[1]/text()").toString();
        String source = page.getHtml().xpath("/html/body/div[4]/div/div[1]/div[1]/div[1]/div[1]/div[1]/span[2]/a/text()").toString();

        page.putField("title", title);
        page.putField("content", content);
        page.putField("publishTime", publishTime);
        page.putField("source", source);

        // List pages yield no detail title, so skip the pipeline for them
        if (page.getResultItems().get("title") == null) {
            page.setSkip(true);
        }
    }

    @Override
    public Site getSite() {
        return site;
    }

    public void start(String url) {
        Spider.create(this)
                .addUrl(url)
                .addPipeline(myPipeline)
                .run();
    }
}
 
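Note that this single processor sees two kinds of pages: the list page it was started with and the detail pages queued by addTargetRequests. The code above tells them apart indirectly, by whether the detail XPaths came back empty. An alternative, sketched here under the assumption that the detail-URL regex above is accurate, is to branch on the URL explicitly inside process():

// Sketch: decide list vs. detail by URL instead of by empty extraction results.
boolean isDetail = page.getUrl()
        .regex("https?://www\\.weather\\.com\\.cn/index/\\d{4}/\\d{2}/\\d+\\.shtml")
        .match();
if (isDetail) {
    // ... extract title/content/publishTime/source and putField(...)
} else {
    // ... walk the <dl> entries and queue the detail links
    page.setSkip(true); // list pages produce no fields for the pipeline
}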
Custom Pipeline class
package com.sktk.weather.app.spider;

import com.sktk.weather.app.entity.po.Article;
import com.sktk.weather.app.service.IArticleService;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

import javax.annotation.Resource;
import java.util.Map;

// Receives the fields extracted on detail pages and persists them.
@Slf4j
@Component
public class WeatherArticlePipeline implements Pipeline {

    @Resource
    private IArticleService articleService;

    @Override
    public void process(ResultItems resultItems, Task task) {
        Article article = new Article();
        for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
            if ("title".equals(entry.getKey())) {
                article.setTitle(entry.getValue().toString());
            }
            if ("content".equals(entry.getKey())) {
                article.setContent(entry.getValue().toString());
            }
            if ("publishTime".equals(entry.getKey())) {
                article.setPublishTime(entry.getValue().toString());
            }
            if ("source".equals(entry.getKey())) {
                article.setSource(entry.getValue().toString());
            }
        }
        articleService.insertOne(article);
    }
}
 
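Since the field keys are fixed, looping over every entry is more ceremony than needed. ResultItems.get(String) looks a key up directly and returns null when it is absent, so the same method could be sketched as:

// Sketch: read the known keys directly instead of iterating over all entries.
@Override
public void process(ResultItems resultItems, Task task) {
    Article article = new Article();
    article.setTitle(resultItems.get("title"));
    article.setContent(resultItems.get("content"));
    article.setPublishTime(resultItems.get("publishTime"));
    article.setSource(resultItems.get("source"));
    articleService.insertOne(article);
}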
Scheduled task class
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

import javax.annotation.Resource;

// Kicks off a full crawl at 23:00 every day.
@Component
public class DailySyncArticleTask {

    @Resource
    private WeatherArticleSpider weatherArticleSpider;

    @Scheduled(cron = "0 0 23 * * *")
    public void execute() {
        weatherArticleSpider.start();
    }
}
 
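One thing to remember: @Scheduled only fires when scheduling is enabled in the Spring context, typically via @EnableScheduling on a configuration class. A minimal sketch (the class name SchedulingConfig is illustrative):

import org.springframework.context.annotation.Configuration;
import org.springframework.scheduling.annotation.EnableScheduling;

// Without this (or an equivalent @EnableScheduling elsewhere),
// the cron expression above never triggers.
@Configuration
@EnableScheduling
public class SchedulingConfig {
}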
Conclusion
That wraps up this hands-on exercise: a Java crawler that walks a paginated article list and scrapes the detail page behind each entry.