前言
有个需求,需要抓取天气的资讯文章,而该项目是用 Java 写的,由于爬虫需求也较小,所以就索性直接用 Java 来爬。
Java 有个爬虫框架叫 webmagic,本次实践就是基于这个包完成的。
我们要采集的网址是 https://www.weather.com.cn/index/jqzdtqsj/index.shtml 打开这个网址可以看到是个列表,并且有分页。
配置 Maven 依赖
1 2 3 4 5 6 7 8 9 10
| <dependency> <groupId>us.codecraft</groupId> <artifactId>webmagic-core</artifactId> <version>0.10.3</version> </dependency> <dependency> <groupId>us.codecraft</groupId> <artifactId>webmagic-extension</artifactId> <version>0.10.3</version> </dependency>
|
爬虫启动入口类
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
| package com.sktk.weather.app.spider;
import lombok.extern.slf4j.Slf4j; import org.springframework.stereotype.Component; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor;
import javax.annotation.Resource;
@Component
@Slf4j
public class WeatherArticleSpider implements PageProcessor {

    @Resource
    private WeatherArticleDetailSpider weatherArticleDetailSpider;

    /**
     * Reads the total page count from the list page's pager span
     * (text shaped like "共N页" — first and last characters wrap the digits),
     * then crawls every list page in order via the detail spider.
     */
    @Override
    public void process(Page page) {
        String pagerText =
                page.getHtml().xpath("/html/body/div[3]/div[1]/div/div/span[3]/text()").get();
        // Guard: xpath get() returns null when the pager is absent or the
        // layout changed; the original code would NPE on substring here.
        if (pagerText == null || pagerText.length() < 3) {
            log.warn("Pager text missing or too short on {}; skipping pagination", page.getUrl());
            return;
        }
        // Strip the single leading and trailing character around the digits.
        String pageNum = pagerText.substring(1, pagerText.length() - 1);
        int pageNumInt;
        try {
            pageNumInt = Integer.parseInt(pageNum.trim());
        } catch (NumberFormatException e) {
            log.warn("Unparseable page count '{}' on {}", pageNum, page.getUrl());
            return;
        }
        for (int i = 1; i <= pageNumInt; i++) {
            // Page 1 is index.shtml; subsequent pages are index_<n>.shtml.
            String nextUrl = (i == 1)
                    ? "https://www.weather.com.cn/index/jqzdtqsj/index.shtml"
                    : "https://www.weather.com.cn/index/jqzdtqsj/index_" + i + ".shtml";
            // Blocks until that list page (and its queued detail pages) finish.
            weatherArticleDetailSpider.start(nextUrl);
        }
    }

    /** Entry point: crawl the first list page, which fans out to all pages. */
    public void start() {
        Spider.create(this)
                .addUrl("https://www.weather.com.cn/index/jqzdtqsj/index.shtml")
                .run();
    }
}
|
爬取列表和详情类
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82
| package com.sktk.weather.app.spider;
import com.sktk.weather.app.entity.po.Article; import com.sktk.weather.app.service.IArticleService; import org.springframework.stereotype.Component; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.selector.Selectable;
import javax.annotation.Resource; import java.util.List;
@Component
public class WeatherArticleDetailSpider implements PageProcessor {

    /** Retry failed downloads up to 3 times; pause 100 ms between requests. */
    private Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    @Resource
    private WeatherArticlePipeline myPipeline;

    @Resource
    private IArticleService articleService;

    /**
     * Handles both page kinds on one processor:
     * list pages (each {@code <dl>} holds a title + intro, saved immediately)
     * and detail pages (title/content/time/source handed to the pipeline).
     * On a detail page the {@code <dl>} selection is empty, so the list loop
     * is a no-op; on a list page the detail-title xpath is null, so we skip.
     */
    @Override
    public void process(Page page) {
        // List page: extract the region once, then query inside each node
        // instead of re-running an absolute xpath per index (webmagic's
        // recommended extract-region-then-extract pattern).
        List<Selectable> articleList = page.getHtml().xpath("/html/body/div[3]/div[1]/dl").nodes();
        for (Selectable node : articleList) {
            Article article = new Article();
            article.setTitle(node.xpath("//dd/h3/a/text()").toString());
            article.setIntro(node.xpath("//dd/p/text()").toString());
            articleService.insertOne(article);
        }

        // Queue every detail-page link found on this page.
        page.addTargetRequests(page.getHtml().links()
                .regex("(https?://www\\.weather\\.com\\.cn/index/\\d{4}/\\d{2}/\\d+\\.shtml)")
                .all());

        // Detail page: probe the title first; if absent this is a list page
        // (or the layout changed), so skip before wasting three more queries.
        String title = page.getHtml()
                .xpath("/html/body/div[4]/div/div[1]/div[1]/div[1]/p/text()").toString();
        if (title == null) {
            page.setSkip(true); // nothing for the pipeline on this page
            return;
        }
        page.putField("title", title);
        page.putField("content", page.getHtml()
                .xpath("/html/body/div[4]/div/div[1]/div[1]/div[2]").toString());
        page.putField("publishTime", page.getHtml()
                .xpath("/html/body/div[4]/div/div[1]/div[1]/div[1]/div[1]/div[1]/span[1]/text()").toString());
        page.putField("source", page.getHtml()
                .xpath("/html/body/div[4]/div/div[1]/div[1]/div[1]/div[1]/div[1]/span[2]/a/text()").toString());
    }

    @Override
    public Site getSite() {
        return site;
    }

    /** Runs a one-shot crawl of {@code url}, persisting via the custom pipeline. */
    public void start(String url) {
        Spider.create(this)
                .addUrl(url)
                .addPipeline(myPipeline)
                .run();
    }
}
|
自定义 Pipeline 类
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
| package com.sktk.weather.app.spider;
import com.sktk.weather.app.entity.po.Article; import com.sktk.weather.app.service.IArticleService; import lombok.extern.slf4j.Slf4j; import org.springframework.stereotype.Component; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.pipeline.Pipeline;
import javax.annotation.Resource; import java.util.Map;
@Slf4j
@Component
public class WeatherArticlePipeline implements Pipeline {

    @Resource
    private IArticleService articleService;

    /**
     * Persists one crawled detail page. The four fields are read directly
     * from {@link ResultItems} instead of scanning the whole entry set with
     * key comparisons; a null value leaves the field unset rather than
     * throwing the NullPointerException the original
     * {@code entry.getValue().toString()} would produce.
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        Article article = new Article();
        article.setTitle(asString(resultItems.get("title")));
        article.setContent(asString(resultItems.get("content")));
        article.setPublishTime(asString(resultItems.get("publishTime")));
        article.setSource(asString(resultItems.get("source")));
        articleService.insertOne(article);
    }

    /** Null-safe toString: null stays null. */
    private static String asString(Object value) {
        return value == null ? null : value.toString();
    }
}
|
定时任务类
1 2 3 4 5 6 7 8 9 10 11 12 13 14
| @Component public class DailySyncArticleTask {
@Resource private WeatherArticleSpider weatherArticleSpider;
@Scheduled(cron = "0 0 23 * * *") public void execute() { weatherArticleSpider.start(); } }
|
结语
至此,一个用 Java 写的爬取文章列表+分页+详情的爬虫实践就结束了。
参考资料
webmagic爬取分页列表数据