From 4dad209648c4284dad1ddd9e122eea70be37f439 Mon Sep 17 00:00:00 2001 From: 王明元 <97082371@qq.com> Date: Wed, 2 Nov 2022 15:14:31 +0800 Subject: [PATCH] 2022年11月2日15:13:39 优化 --- src/main/java/cn/fw/freya/service/crawl/impl/BilibiliCrawl.java | 6 +++--- src/main/java/cn/fw/freya/service/crawl/impl/DongCheDiCrawl.java | 6 +++--- src/main/java/cn/fw/freya/service/crawl/impl/DouYinCrawl.java | 8 ++++---- src/main/java/cn/fw/freya/service/crawl/impl/KuaiShouCrawl.java | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------- 4 files changed, 76 insertions(+), 26 deletions(-) diff --git a/src/main/java/cn/fw/freya/service/crawl/impl/BilibiliCrawl.java b/src/main/java/cn/fw/freya/service/crawl/impl/BilibiliCrawl.java index f686c04..bcbab13 100644 --- a/src/main/java/cn/fw/freya/service/crawl/impl/BilibiliCrawl.java +++ b/src/main/java/cn/fw/freya/service/crawl/impl/BilibiliCrawl.java @@ -56,7 +56,7 @@ public class BilibiliCrawl implements CrawlStrategy { private final LivePoolDao livePoolDao; private final AccountDao accountDao; private final Common common; - public final static ConcurrentHashMap DRIVER_MAP = new ConcurrentHashMap<>(); + public static final ConcurrentHashMap DRIVER_MAP = new ConcurrentHashMap<>(); private final AccountService accountService; @Override @@ -206,7 +206,7 @@ public class BilibiliCrawl implements CrawlStrategy { sb.setLength(0); }); try { - if (videoPoolList.size() > 0) { + if (!videoPoolList.isEmpty()) { videoPoolDao.saveAll(videoPoolList);// 将收集到的视频信息保存 } else { final VideoPool nullVideo = VideoPool.builder() @@ -394,7 +394,7 @@ public class BilibiliCrawl implements CrawlStrategy { .build()); }); try { - if (livePoolList.size() > 0) { + if (!livePoolList.isEmpty()) { livePoolDao.saveAll(livePoolList.stream() .filter(item -> item.getOpenTime().compareTo(startTime) >= 0 && item.getOpenTime().compareTo(endTime) <= 0) .collect(Collectors.toList()) diff --git a/src/main/java/cn/fw/freya/service/crawl/impl/DongCheDiCrawl.java b/src/main/java/cn/fw/freya/service/crawl/impl/DongCheDiCrawl.java index 5c63728..8a67111 100644 --- a/src/main/java/cn/fw/freya/service/crawl/impl/DongCheDiCrawl.java +++ b/src/main/java/cn/fw/freya/service/crawl/impl/DongCheDiCrawl.java @@ -56,7 +56,7 @@ public class DongCheDiCrawl implements CrawlStrategy { private final LivePoolDao livePoolDao; private final AccountDao accountDao; private final Common common; - public final static ConcurrentHashMap DRIVER_MAP = new ConcurrentHashMap<>(); + public static final ConcurrentHashMap DRIVER_MAP = new ConcurrentHashMap<>(); private final AccountService accountService; /** @@ -231,7 +231,7 @@ public class DongCheDiCrawl implements CrawlStrategy { .build()); }); try { - if (videoPoolList.size() > 0) { + if (!videoPoolList.isEmpty()) { videoPoolDao.saveAll(videoPoolList);// 将收集到的视频信息保存 } else { final VideoPool nullVideo = VideoPool.builder() @@ -353,7 +353,7 @@ public class DongCheDiCrawl implements CrawlStrategy { Date endTime = DateUtil.getThisDayMaxTime(previousDay); Date startTime = DateUtil.getThisDayMinTime(previousDay); try { - if (livePoolList.size() > 0) { + if (!livePoolList.isEmpty()) { livePoolDao.saveAll(livePoolList.stream() .filter(item -> item.getOpenTime().compareTo(startTime) >= 0 && item.getOpenTime().compareTo(endTime) <= 0) .collect(Collectors.toList()) diff --git a/src/main/java/cn/fw/freya/service/crawl/impl/DouYinCrawl.java b/src/main/java/cn/fw/freya/service/crawl/impl/DouYinCrawl.java index e5ad293..b8363b7 100644 --- a/src/main/java/cn/fw/freya/service/crawl/impl/DouYinCrawl.java +++ b/src/main/java/cn/fw/freya/service/crawl/impl/DouYinCrawl.java @@ -54,7 +54,7 @@ public class DouYinCrawl implements CrawlStrategy { private final LiveOverviewDao liveDataDao; private final LivePoolDao livePoolDao; private final AccountDao accountDao; - public final static ConcurrentHashMap DRIVER_MAP = new ConcurrentHashMap<>(); + public static final ConcurrentHashMap DRIVER_MAP = new ConcurrentHashMap<>(); private final Common common; private final AccountService accountService; private final int WAIT_SECONDS = 5; @@ -233,7 +233,7 @@ public class DouYinCrawl implements CrawlStrategy { } }); try { - if (videoPoolList.size() > 0) { + if (!videoPoolList.isEmpty()) { videoPoolDao.saveAll(videoPoolList);// 将收集到的视频信息保存 } else { videoPoolDao.save(VideoPool.builder() @@ -389,7 +389,7 @@ public class DouYinCrawl implements CrawlStrategy { throw new BusinessException("外部try-catch, DouyinCrawl->getAllVideoMsg()发生异常"); } try { - if (videoPoolList.size() > 0) { + if (!videoPoolList.isEmpty()) { videoPoolDao.saveAll(videoPoolList);// 将收集到的视频信息保存 } else { videoPoolDao.save(VideoPool.builder() @@ -608,7 +608,7 @@ public class DouYinCrawl implements CrawlStrategy { Date endTime = DateUtil.getThisDayMaxTime(previousDay); Date startTime = DateUtil.getThisDayMinTime(previousDay); try { - if (livePoolList.size() > 0) { + if (!livePoolList.isEmpty()) { livePoolDao.saveAll(livePoolList.stream() .filter(item -> item.getOpenTime().compareTo(startTime) >= 0 && item.getOpenTime().compareTo(endTime) <= 0) .collect(Collectors.toList()) diff --git a/src/main/java/cn/fw/freya/service/crawl/impl/KuaiShouCrawl.java b/src/main/java/cn/fw/freya/service/crawl/impl/KuaiShouCrawl.java index fe9319b..855c23d 100644 --- a/src/main/java/cn/fw/freya/service/crawl/impl/KuaiShouCrawl.java +++ b/src/main/java/cn/fw/freya/service/crawl/impl/KuaiShouCrawl.java @@ -80,7 +80,7 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { private final LivePoolDao livePoolDao; private final AccountDao accountDao; private final Common common; - public final static ConcurrentHashMap DRIVER_MAP = new ConcurrentHashMap<>(); + public static final ConcurrentHashMap DRIVER_MAP = new ConcurrentHashMap<>(); private final AccountService accountService; private final String playbackBaseUrl = "https://live.kuaishou.com/playback/"; private final ConcurrentHashMap sig3Map = new ConcurrentHashMap<>(); @@ -327,10 +327,10 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { total = Objects.requireNonNull(obj).getInteger("totalCount"); videoJsonArray.addAll(Optional.ofNullable(obj.getJSONArray("photoList")).orElse(new JSONArray())); page++; - JSONArray photoList = obj.getJSONArray("photoList"); + /*JSONArray photoList = obj.getJSONArray("photoList"); String publishTime = JSON.parseObject(JSON.toJSONString(photoList.get(photoList.size() - 1))).getString("publishTime"); if (Objects.requireNonNull(DateUtil.parse(publishTime)).compareTo(DateUtil.getMonthFirstDay(DateUtil.getThisDayMinTime(previousDay))) < 0) - break; + break;*/ } while (total > 10 * (page - 1)); /*HttpCookies cookies = HttpCookies.custom(); CookieStore cookieStore = new BasicCookieStore(); @@ -581,7 +581,7 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { .build()); }); try { - if (livePoolList.size() > 0) { + if (!livePoolList.isEmpty()) { livePoolDao.saveAll(livePoolList.stream() .filter(item -> item.getOpenTime().compareTo(startTime) >= 0 && item.getOpenTime().compareTo(endTime) <= 0) .collect(Collectors.toList()) @@ -850,21 +850,23 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { } int maxPageNum = 0; if (videoDimension) { - List pageLabels = new WebDriverWait(driver, 15, 300).until(driver1 -> - driver1.findElements(By.xpath("//ul[@class='el-pager']/li")));// 获取分页页码标签元素列表 + List pageLabels = this.findPageLabels(driver);// 获取分页页码标签元素列表 + Map labelMap = this.processPageElement(pageLabels);// 将标签元素处理成K(页码), V(标签element) if (!CollectionUtils.isEmpty(pageLabels)) { maxPageNum = Integer.parseInt(pageLabels.get(pageLabels.size() - 1).getText());// 最大页码 - for (WebElement item : pageLabels) { + for (int i = 0; i < maxPageNum; i++) { try { - /*if (Objects.equals(item.getText(), "...")) { - pageLabels = new WebDriverWait(driver, 15, 300).until(driver1 -> - driver1.findElements(By.xpath("//ul[@class='el-pager']/li"))); - item.click(); - }*/ - item.click(); + WebElement pageLabel = this.findPageLabelFromMap(labelMap, i + 1); + if (Objects.isNull(pageLabel)) { + LockSupport.parkNanos(TimeUnit.SECONDS.toNanos(1)); + pageLabels = this.findPageLabels(driver); + labelMap = this.processPageElement(pageLabels); + pageLabel = this.findPageLabelFromMap(labelMap, i + 1); + } + pageLabel.click(); LockSupport.parkNanos(TimeUnit.SECONDS.toNanos(2)); } catch (Exception e) { - // this.exitBrowser(accountNo, uuid); + log.info("异常发生, 信息为: {}", e.getMessage(), e); } } } @@ -888,8 +890,6 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { } for (int i = 0; i < sigList.size(); i++) { sig3Map.put(accountNo + "#" + type + "#" + (i + 1), sigList.get(i)); - if (sigList.size() < maxPageNum && i > 4) - break; } } catch (Exception e) { this.exitBrowser(accountNo, uuid); @@ -900,6 +900,56 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { } /** + * 获取对应页码的WebElement对象 + * + * @param labelMap K(页码), V(标签element) + * @param page 要获取的页码 + * @return 页码对应的WebElement对象 + */ + private WebElement findPageLabelFromMap(Map labelMap, Integer page) { + WebElement webElement = null; + try { + webElement = labelMap.get(page); + } catch (Exception e) { + log.error(e.getMessage(), e); + } + if (Objects.nonNull(webElement)) + return webElement; + return null; + } + + /** + * 寻找页面页码标签集合 + * + * @param driver 驱动 + * @return 页面页码标签元素 + */ + private List findPageLabels(WebDriver driver) { + return new WebDriverWait(driver, 15, 300).until(driver1 -> + driver1.findElements(By.xpath("//ul[@class='el-pager']/li"))); + } + + /** + * 将获取到的页码标签处理成 K(页码), V(标签element) 格式 + * + * @param pageLabels 本次获取到的页码标签 + * @return K(页码), V(标签element) + */ + private Map processPageElement(List pageLabels) { + HashMap map = new HashMap<>(); + pageLabels.forEach(item -> { + int pageNum; + try { + pageNum = Integer.parseInt(item.getText()); + } catch (NumberFormatException e) { + pageNum = -1; + } + map.put(pageNum, item); + }); + return map; + } + + /** * 读取http日志获取数据接口全路径地址 * * @param responseReceived 收到的响应 -- libgit2 0.22.2