Preface

A project of ours needed to scrape weather news articles. The project itself is written in Java, and since the crawling requirement was small, it made sense to simply do the scraping in Java as well.

Java has a crawler library called webmagic, and this walkthrough is built on it.

The target URL is https://www.weather.com.cn/index/jqzdtqsj/index.shtml. Opening it shows an article list with pagination: the first page is index.shtml, and page N (for N ≥ 2) follows the index_N.shtml pattern, which the entry class below relies on.

Configuring the Maven Dependencies

<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.10.3</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.10.3</version>
</dependency>

The Crawler Entry Class

package com.sktk.weather.app.spider;

import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

import javax.annotation.Resource;

/**
 * @author yang
 */
@Component
@Slf4j
public class WeatherArticleSpider implements PageProcessor {

    @Resource
    private WeatherArticleDetailSpider weatherArticleDetailSpider;

    @Override
    public void process(Page page) {

        // Use XPath to grab the total page count; the raw text looks like "共2页" ("2 pages in total")
        String pageNum = page.getHtml().xpath("/html/body/div[3]/div[1]/div/div/span[3]/text()").get();

        // Guard against layout changes: bail out if the pager text is missing
        if (pageNum == null) {
            log.warn("Could not locate the page-count element, aborting crawl");
            return;
        }

        // Strip the leading "共" and trailing "页" to get the bare number
        pageNum = pageNum.substring(1);
        pageNum = pageNum.substring(0, pageNum.length() - 1);
        int pageNumInt = Integer.parseInt(pageNum);

        for (int i = 1; i <= pageNumInt; i++) {

            // Build the URL for each page by appending the page number
            // The first page uses the plain index URL
            String nextUrl = "https://www.weather.com.cn/index/jqzdtqsj/index.shtml";
            // Subsequent pages follow the index_N.shtml pattern
            if (i != 1) {
                nextUrl = "https://www.weather.com.cn/index/jqzdtqsj/index_" + i + ".shtml";
            }

            // Crawl each page of the list
            weatherArticleDetailSpider.start(nextUrl);
        }
    }

    public void start() {
        Spider.create(this)
                // Add the initial URL to crawl
                .addUrl("https://www.weather.com.cn/index/jqzdtqsj/index.shtml")
                .run();
    }
}
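
Before wiring this into the scheduled task shown later, it can be handy to trigger the crawl by hand. Here is a minimal sketch assuming a Spring Boot application; the SpiderRunner class is hypothetical and not part of the original project:

package com.sktk.weather.app.spider;

import org.springframework.boot.CommandLineRunner;
import org.springframework.stereotype.Component;

import javax.annotation.Resource;

/**
 * Hypothetical runner for testing the crawl once at application startup.
 */
@Component
public class SpiderRunner implements CommandLineRunner {

    @Resource
    private WeatherArticleSpider weatherArticleSpider;

    @Override
    public void run(String... args) {
        // Kicks off the full crawl (list pages, pagination, detail pages)
        weatherArticleSpider.start();
    }
}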

The List and Detail Page Crawler

package com.sktk.weather.app.spider;

import com.sktk.weather.app.entity.po.Article;
import com.sktk.weather.app.service.IArticleService;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

import javax.annotation.Resource;
import java.util.List;

/**
 * @author yang
 */
@Component
public class WeatherArticleDetailSpider implements PageProcessor {

    private Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    @Resource
    private WeatherArticlePipeline myPipeline;

    @Resource
    private IArticleService articleService;

    @Override
    public void process(Page page) {

        // First grab the article list (empty on detail pages, so the loop below is a no-op there)
        List<Selectable> articleList = page.getHtml().xpath("/html/body/div[3]/div[1]/dl").nodes();

        // Iterate over the list items (XPath indexes are 1-based)
        for (int i = 1; i <= articleList.size(); i++) {
            // Pull the title and intro of each list item
            String title = page.getHtml().xpath("/html/body/div[3]/div[1]/dl[" + i + "]/dd/h3/a/text()").toString();
            String intro = page.getHtml().xpath("/html/body/div[3]/div[1]/dl[" + i + "]/dd/p/text()").toString();

            // Persist the list entry
            Article article = new Article();
            article.setTitle(title);
            article.setIntro(intro);
            articleService.insertOne(article);
        }

        // Match all article detail links on the page with a regex and queue them for crawling
        page.addTargetRequests(page.getHtml().links()
                .regex("(https?://www\\.weather\\.com\\.cn/index/\\d{4}/\\d{2}/\\d+\\.shtml)")
                .all());

        // Extract the detail fields via XPath (these elements only exist on detail pages)
        String title = page.getHtml().xpath("/html/body/div[4]/div/div[1]/div[1]/div[1]/p/text()").toString();
        String content = page.getHtml().xpath("/html/body/div[4]/div/div[1]/div[1]/div[2]").toString();
        String publishTime = page.getHtml().xpath("/html/body/div[4]/div/div[1]/div[1]/div[1]/div[1]/div[1]/span[1]/text()").toString();
        String source = page.getHtml().xpath("/html/body/div[4]/div/div[1]/div[1]/div[1]/div[1]/div[1]/span[2]/a/text()").toString();

        page.putField("title", title);
        page.putField("content", content);
        page.putField("publishTime", publishTime);
        page.putField("source", source);

        // Skip the result if no title was found, i.e. this is a list page rather than a detail page
        if (page.getResultItems().get("title") == null) {
            page.setSkip(true);
        }
    }

    @Override
    public Site getSite() {
        return site;
    }

    public void start(String url) {
        Spider.create(this)
                .addUrl(url)
                // Use a Pipeline to save each result
                .addPipeline(myPipeline)
                .run();
    }
}
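
One practical note: the Site above only configures retries and request delay. If the server rejects the default HTTP client or the page charset is misdetected, webmagic's Site also lets you set those explicitly. A hedged tweak, not part of the original code; the charset and user agent values are assumptions:

private Site site = Site.me()
        .setRetryTimes(3)
        .setSleepTime(100)
        // Assumption: the site serves UTF-8
        .setCharset("utf-8")
        // Assumption: a browser-like user agent reduces the chance of being blocked
        .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64)");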

The Custom Pipeline Class

package com.sktk.weather.app.spider;

import com.sktk.weather.app.entity.po.Article;
import com.sktk.weather.app.service.IArticleService;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

import javax.annotation.Resource;
import java.util.Map;

/**
 * @author yang
 */
@Slf4j
@Component
public class WeatherArticlePipeline implements Pipeline {

    @Resource
    private IArticleService articleService;

    @Override
    public void process(ResultItems resultItems, Task task) {

        Article article = new Article();

        // Map each extracted field onto the entity
        for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {

            if ("title".equals(entry.getKey())) {
                article.setTitle(entry.getValue().toString());
            }

            if ("content".equals(entry.getKey())) {
                article.setContent(entry.getValue().toString());
            }

            if ("publishTime".equals(entry.getKey())) {
                article.setPublishTime(entry.getValue().toString());
            }

            if ("source".equals(entry.getKey())) {
                article.setSource(entry.getValue().toString());
            }
        }

        articleService.insertOne(article);
    }
}
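
The Article entity and IArticleService used throughout are never shown in the post. For completeness, here is a minimal sketch of what they would need to look like, with the fields inferred from the setters called above; the id field and the Lombok usage are assumptions:

package com.sktk.weather.app.entity.po;

import lombok.Data;

/**
 * Minimal sketch of the Article entity, fields inferred from the crawler code.
 */
@Data
public class Article {
    private Long id;            // assumed primary key
    private String title;
    private String intro;       // list-page summary
    private String content;     // detail-page body HTML
    private String publishTime;
    private String source;
}

package com.sktk.weather.app.service;

import com.sktk.weather.app.entity.po.Article;

public interface IArticleService {
    void insertOne(Article article);
}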

The Scheduled Task Class

import com.sktk.weather.app.spider.WeatherArticleSpider;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

import javax.annotation.Resource;

@Component
public class DailySyncArticleTask {

    @Resource
    private WeatherArticleSpider weatherArticleSpider;

    /**
     * Sync the weather articles every day at 11 PM
     */
    @Scheduled(cron = "0 0 23 * * *")
    public void execute() {
        weatherArticleSpider.start();
    }
}
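
One caveat: @Scheduled only fires when scheduling is enabled in the Spring context. Assuming a standard Spring Boot setup (the WeatherApplication class name here is illustrative), that means annotating a configuration class with @EnableScheduling:

package com.sktk.weather.app;

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.scheduling.annotation.EnableScheduling;

@SpringBootApplication
@EnableScheduling   // required for @Scheduled tasks to run
public class WeatherApplication {

    public static void main(String[] args) {
        SpringApplication.run(WeatherApplication.class, args);
    }
}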

Conclusion

That wraps up this hands-on Java crawler covering an article list, pagination, and detail pages.
