Commit 4dad209648c4284dad1ddd9e122eea70be37f439
1 parent a3142deb
2022年11月2日15:13:39 优化
Showing 4 changed files with 76 additions and 26 deletions
src/main/java/cn/fw/freya/service/crawl/impl/BilibiliCrawl.java
@@ -56,7 +56,7 @@ public class BilibiliCrawl implements CrawlStrategy { | @@ -56,7 +56,7 @@ public class BilibiliCrawl implements CrawlStrategy { | ||
56 | private final LivePoolDao livePoolDao; | 56 | private final LivePoolDao livePoolDao; |
57 | private final AccountDao accountDao; | 57 | private final AccountDao accountDao; |
58 | private final Common common; | 58 | private final Common common; |
59 | - public final static ConcurrentHashMap<String, WebDriver> DRIVER_MAP = new ConcurrentHashMap<>(); | 59 | + public static final ConcurrentHashMap<String, WebDriver> DRIVER_MAP = new ConcurrentHashMap<>(); |
60 | private final AccountService accountService; | 60 | private final AccountService accountService; |
61 | 61 | ||
62 | @Override | 62 | @Override |
@@ -206,7 +206,7 @@ public class BilibiliCrawl implements CrawlStrategy { | @@ -206,7 +206,7 @@ public class BilibiliCrawl implements CrawlStrategy { | ||
206 | sb.setLength(0); | 206 | sb.setLength(0); |
207 | }); | 207 | }); |
208 | try { | 208 | try { |
209 | - if (videoPoolList.size() > 0) { | 209 | + if (!videoPoolList.isEmpty()) { |
210 | videoPoolDao.saveAll(videoPoolList);// 将收集到的视频信息保存 | 210 | videoPoolDao.saveAll(videoPoolList);// 将收集到的视频信息保存 |
211 | } else { | 211 | } else { |
212 | final VideoPool nullVideo = VideoPool.builder() | 212 | final VideoPool nullVideo = VideoPool.builder() |
@@ -394,7 +394,7 @@ public class BilibiliCrawl implements CrawlStrategy { | @@ -394,7 +394,7 @@ public class BilibiliCrawl implements CrawlStrategy { | ||
394 | .build()); | 394 | .build()); |
395 | }); | 395 | }); |
396 | try { | 396 | try { |
397 | - if (livePoolList.size() > 0) { | 397 | + if (!livePoolList.isEmpty()) { |
398 | livePoolDao.saveAll(livePoolList.stream() | 398 | livePoolDao.saveAll(livePoolList.stream() |
399 | .filter(item -> item.getOpenTime().compareTo(startTime) >= 0 && item.getOpenTime().compareTo(endTime) <= 0) | 399 | .filter(item -> item.getOpenTime().compareTo(startTime) >= 0 && item.getOpenTime().compareTo(endTime) <= 0) |
400 | .collect(Collectors.toList()) | 400 | .collect(Collectors.toList()) |
src/main/java/cn/fw/freya/service/crawl/impl/DongCheDiCrawl.java
@@ -56,7 +56,7 @@ public class DongCheDiCrawl implements CrawlStrategy { | @@ -56,7 +56,7 @@ public class DongCheDiCrawl implements CrawlStrategy { | ||
56 | private final LivePoolDao livePoolDao; | 56 | private final LivePoolDao livePoolDao; |
57 | private final AccountDao accountDao; | 57 | private final AccountDao accountDao; |
58 | private final Common common; | 58 | private final Common common; |
59 | - public final static ConcurrentHashMap<String, WebDriver> DRIVER_MAP = new ConcurrentHashMap<>(); | 59 | + public static final ConcurrentHashMap<String, WebDriver> DRIVER_MAP = new ConcurrentHashMap<>(); |
60 | private final AccountService accountService; | 60 | private final AccountService accountService; |
61 | 61 | ||
62 | /** | 62 | /** |
@@ -231,7 +231,7 @@ public class DongCheDiCrawl implements CrawlStrategy { | @@ -231,7 +231,7 @@ public class DongCheDiCrawl implements CrawlStrategy { | ||
231 | .build()); | 231 | .build()); |
232 | }); | 232 | }); |
233 | try { | 233 | try { |
234 | - if (videoPoolList.size() > 0) { | 234 | + if (!videoPoolList.isEmpty()) { |
235 | videoPoolDao.saveAll(videoPoolList);// 将收集到的视频信息保存 | 235 | videoPoolDao.saveAll(videoPoolList);// 将收集到的视频信息保存 |
236 | } else { | 236 | } else { |
237 | final VideoPool nullVideo = VideoPool.builder() | 237 | final VideoPool nullVideo = VideoPool.builder() |
@@ -353,7 +353,7 @@ public class DongCheDiCrawl implements CrawlStrategy { | @@ -353,7 +353,7 @@ public class DongCheDiCrawl implements CrawlStrategy { | ||
353 | Date endTime = DateUtil.getThisDayMaxTime(previousDay); | 353 | Date endTime = DateUtil.getThisDayMaxTime(previousDay); |
354 | Date startTime = DateUtil.getThisDayMinTime(previousDay); | 354 | Date startTime = DateUtil.getThisDayMinTime(previousDay); |
355 | try { | 355 | try { |
356 | - if (livePoolList.size() > 0) { | 356 | + if (!livePoolList.isEmpty()) { |
357 | livePoolDao.saveAll(livePoolList.stream() | 357 | livePoolDao.saveAll(livePoolList.stream() |
358 | .filter(item -> item.getOpenTime().compareTo(startTime) >= 0 && item.getOpenTime().compareTo(endTime) <= 0) | 358 | .filter(item -> item.getOpenTime().compareTo(startTime) >= 0 && item.getOpenTime().compareTo(endTime) <= 0) |
359 | .collect(Collectors.toList()) | 359 | .collect(Collectors.toList()) |
src/main/java/cn/fw/freya/service/crawl/impl/DouYinCrawl.java
@@ -54,7 +54,7 @@ public class DouYinCrawl implements CrawlStrategy { | @@ -54,7 +54,7 @@ public class DouYinCrawl implements CrawlStrategy { | ||
54 | private final LiveOverviewDao liveDataDao; | 54 | private final LiveOverviewDao liveDataDao; |
55 | private final LivePoolDao livePoolDao; | 55 | private final LivePoolDao livePoolDao; |
56 | private final AccountDao accountDao; | 56 | private final AccountDao accountDao; |
57 | - public final static ConcurrentHashMap<String, WebDriver> DRIVER_MAP = new ConcurrentHashMap<>(); | 57 | + public static final ConcurrentHashMap<String, WebDriver> DRIVER_MAP = new ConcurrentHashMap<>(); |
58 | private final Common common; | 58 | private final Common common; |
59 | private final AccountService accountService; | 59 | private final AccountService accountService; |
60 | private final int WAIT_SECONDS = 5; | 60 | private final int WAIT_SECONDS = 5; |
@@ -233,7 +233,7 @@ public class DouYinCrawl implements CrawlStrategy { | @@ -233,7 +233,7 @@ public class DouYinCrawl implements CrawlStrategy { | ||
233 | } | 233 | } |
234 | }); | 234 | }); |
235 | try { | 235 | try { |
236 | - if (videoPoolList.size() > 0) { | 236 | + if (!videoPoolList.isEmpty()) { |
237 | videoPoolDao.saveAll(videoPoolList);// 将收集到的视频信息保存 | 237 | videoPoolDao.saveAll(videoPoolList);// 将收集到的视频信息保存 |
238 | } else { | 238 | } else { |
239 | videoPoolDao.save(VideoPool.builder() | 239 | videoPoolDao.save(VideoPool.builder() |
@@ -389,7 +389,7 @@ public class DouYinCrawl implements CrawlStrategy { | @@ -389,7 +389,7 @@ public class DouYinCrawl implements CrawlStrategy { | ||
389 | throw new BusinessException("外部try-catch, DouyinCrawl->getAllVideoMsg()发生异常"); | 389 | throw new BusinessException("外部try-catch, DouyinCrawl->getAllVideoMsg()发生异常"); |
390 | } | 390 | } |
391 | try { | 391 | try { |
392 | - if (videoPoolList.size() > 0) { | 392 | + if (!videoPoolList.isEmpty()) { |
393 | videoPoolDao.saveAll(videoPoolList);// 将收集到的视频信息保存 | 393 | videoPoolDao.saveAll(videoPoolList);// 将收集到的视频信息保存 |
394 | } else { | 394 | } else { |
395 | videoPoolDao.save(VideoPool.builder() | 395 | videoPoolDao.save(VideoPool.builder() |
@@ -608,7 +608,7 @@ public class DouYinCrawl implements CrawlStrategy { | @@ -608,7 +608,7 @@ public class DouYinCrawl implements CrawlStrategy { | ||
608 | Date endTime = DateUtil.getThisDayMaxTime(previousDay); | 608 | Date endTime = DateUtil.getThisDayMaxTime(previousDay); |
609 | Date startTime = DateUtil.getThisDayMinTime(previousDay); | 609 | Date startTime = DateUtil.getThisDayMinTime(previousDay); |
610 | try { | 610 | try { |
611 | - if (livePoolList.size() > 0) { | 611 | + if (!livePoolList.isEmpty()) { |
612 | livePoolDao.saveAll(livePoolList.stream() | 612 | livePoolDao.saveAll(livePoolList.stream() |
613 | .filter(item -> item.getOpenTime().compareTo(startTime) >= 0 && item.getOpenTime().compareTo(endTime) <= 0) | 613 | .filter(item -> item.getOpenTime().compareTo(startTime) >= 0 && item.getOpenTime().compareTo(endTime) <= 0) |
614 | .collect(Collectors.toList()) | 614 | .collect(Collectors.toList()) |
src/main/java/cn/fw/freya/service/crawl/impl/KuaiShouCrawl.java
@@ -80,7 +80,7 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { | @@ -80,7 +80,7 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { | ||
80 | private final LivePoolDao livePoolDao; | 80 | private final LivePoolDao livePoolDao; |
81 | private final AccountDao accountDao; | 81 | private final AccountDao accountDao; |
82 | private final Common common; | 82 | private final Common common; |
83 | - public final static ConcurrentHashMap<String, WebDriver> DRIVER_MAP = new ConcurrentHashMap<>(); | 83 | + public static final ConcurrentHashMap<String, WebDriver> DRIVER_MAP = new ConcurrentHashMap<>(); |
84 | private final AccountService accountService; | 84 | private final AccountService accountService; |
85 | private final String playbackBaseUrl = "https://live.kuaishou.com/playback/"; | 85 | private final String playbackBaseUrl = "https://live.kuaishou.com/playback/"; |
86 | private final ConcurrentHashMap<String, String> sig3Map = new ConcurrentHashMap<>(); | 86 | private final ConcurrentHashMap<String, String> sig3Map = new ConcurrentHashMap<>(); |
@@ -327,10 +327,10 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { | @@ -327,10 +327,10 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { | ||
327 | total = Objects.requireNonNull(obj).getInteger("totalCount"); | 327 | total = Objects.requireNonNull(obj).getInteger("totalCount"); |
328 | videoJsonArray.addAll(Optional.ofNullable(obj.getJSONArray("photoList")).orElse(new JSONArray())); | 328 | videoJsonArray.addAll(Optional.ofNullable(obj.getJSONArray("photoList")).orElse(new JSONArray())); |
329 | page++; | 329 | page++; |
330 | - JSONArray photoList = obj.getJSONArray("photoList"); | 330 | + /*JSONArray photoList = obj.getJSONArray("photoList"); |
331 | String publishTime = JSON.parseObject(JSON.toJSONString(photoList.get(photoList.size() - 1))).getString("publishTime"); | 331 | String publishTime = JSON.parseObject(JSON.toJSONString(photoList.get(photoList.size() - 1))).getString("publishTime"); |
332 | if (Objects.requireNonNull(DateUtil.parse(publishTime)).compareTo(DateUtil.getMonthFirstDay(DateUtil.getThisDayMinTime(previousDay))) < 0) | 332 | if (Objects.requireNonNull(DateUtil.parse(publishTime)).compareTo(DateUtil.getMonthFirstDay(DateUtil.getThisDayMinTime(previousDay))) < 0) |
333 | - break; | 333 | + break;*/ |
334 | } while (total > 10 * (page - 1)); | 334 | } while (total > 10 * (page - 1)); |
335 | /*HttpCookies cookies = HttpCookies.custom(); | 335 | /*HttpCookies cookies = HttpCookies.custom(); |
336 | CookieStore cookieStore = new BasicCookieStore(); | 336 | CookieStore cookieStore = new BasicCookieStore(); |
@@ -581,7 +581,7 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { | @@ -581,7 +581,7 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { | ||
581 | .build()); | 581 | .build()); |
582 | }); | 582 | }); |
583 | try { | 583 | try { |
584 | - if (livePoolList.size() > 0) { | 584 | + if (!livePoolList.isEmpty()) { |
585 | livePoolDao.saveAll(livePoolList.stream() | 585 | livePoolDao.saveAll(livePoolList.stream() |
586 | .filter(item -> item.getOpenTime().compareTo(startTime) >= 0 && item.getOpenTime().compareTo(endTime) <= 0) | 586 | .filter(item -> item.getOpenTime().compareTo(startTime) >= 0 && item.getOpenTime().compareTo(endTime) <= 0) |
587 | .collect(Collectors.toList()) | 587 | .collect(Collectors.toList()) |
@@ -850,21 +850,23 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { | @@ -850,21 +850,23 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { | ||
850 | } | 850 | } |
851 | int maxPageNum = 0; | 851 | int maxPageNum = 0; |
852 | if (videoDimension) { | 852 | if (videoDimension) { |
853 | - List<WebElement> pageLabels = new WebDriverWait(driver, 15, 300).until(driver1 -> | ||
854 | - driver1.findElements(By.xpath("//ul[@class='el-pager']/li")));// 获取分页页码标签元素列表 | 853 | + List<WebElement> pageLabels = this.findPageLabels(driver);// 获取分页页码标签元素列表 |
854 | + Map<Integer, WebElement> labelMap = this.processPageElement(pageLabels);// 将标签元素处理成K(页码), V(标签element) | ||
855 | if (!CollectionUtils.isEmpty(pageLabels)) { | 855 | if (!CollectionUtils.isEmpty(pageLabels)) { |
856 | maxPageNum = Integer.parseInt(pageLabels.get(pageLabels.size() - 1).getText());// 最大页码 | 856 | maxPageNum = Integer.parseInt(pageLabels.get(pageLabels.size() - 1).getText());// 最大页码 |
857 | - for (WebElement item : pageLabels) { | 857 | + for (int i = 0; i < maxPageNum; i++) { |
858 | try { | 858 | try { |
859 | - /*if (Objects.equals(item.getText(), "...")) { | ||
860 | - pageLabels = new WebDriverWait(driver, 15, 300).until(driver1 -> | ||
861 | - driver1.findElements(By.xpath("//ul[@class='el-pager']/li"))); | ||
862 | - item.click(); | ||
863 | - }*/ | ||
864 | - item.click(); | 859 | + WebElement pageLabel = this.findPageLabelFromMap(labelMap, i + 1); |
860 | + if (Objects.isNull(pageLabel)) { | ||
861 | + LockSupport.parkNanos(TimeUnit.SECONDS.toNanos(1)); | ||
862 | + pageLabels = this.findPageLabels(driver); | ||
863 | + labelMap = this.processPageElement(pageLabels); | ||
864 | + pageLabel = this.findPageLabelFromMap(labelMap, i + 1); | ||
865 | + } | ||
866 | + pageLabel.click(); | ||
865 | LockSupport.parkNanos(TimeUnit.SECONDS.toNanos(2)); | 867 | LockSupport.parkNanos(TimeUnit.SECONDS.toNanos(2)); |
866 | } catch (Exception e) { | 868 | } catch (Exception e) { |
867 | - // this.exitBrowser(accountNo, uuid); | 869 | + log.info("异常发生, 信息为: {}", e.getMessage(), e); |
868 | } | 870 | } |
869 | } | 871 | } |
870 | } | 872 | } |
@@ -888,8 +890,6 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { | @@ -888,8 +890,6 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { | ||
888 | } | 890 | } |
889 | for (int i = 0; i < sigList.size(); i++) { | 891 | for (int i = 0; i < sigList.size(); i++) { |
890 | sig3Map.put(accountNo + "#" + type + "#" + (i + 1), sigList.get(i)); | 892 | sig3Map.put(accountNo + "#" + type + "#" + (i + 1), sigList.get(i)); |
891 | - if (sigList.size() < maxPageNum && i > 4) | ||
892 | - break; | ||
893 | } | 893 | } |
894 | } catch (Exception e) { | 894 | } catch (Exception e) { |
895 | this.exitBrowser(accountNo, uuid); | 895 | this.exitBrowser(accountNo, uuid); |
@@ -900,6 +900,56 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { | @@ -900,6 +900,56 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { | ||
900 | } | 900 | } |
901 | 901 | ||
902 | /** | 902 | /** |
903 | + * 获取对应页码的WebElement对象 | ||
904 | + * | ||
905 | + * @param labelMap K(页码), V(标签element) | ||
906 | + * @param page 要获取的页码 | ||
907 | + * @return 页码对应的WebElement对象 | ||
908 | + */ | ||
909 | + private WebElement findPageLabelFromMap(Map<Integer, WebElement> labelMap, Integer page) { | ||
910 | + WebElement webElement = null; | ||
911 | + try { | ||
912 | + webElement = labelMap.get(page); | ||
913 | + } catch (Exception e) { | ||
914 | + log.error(e.getMessage(), e); | ||
915 | + } | ||
916 | + if (Objects.nonNull(webElement)) | ||
917 | + return webElement; | ||
918 | + return null; | ||
919 | + } | ||
920 | + | ||
921 | + /** | ||
922 | + * 寻找页面页码标签集合 | ||
923 | + * | ||
924 | + * @param driver 驱动 | ||
925 | + * @return 页面页码标签元素 | ||
926 | + */ | ||
927 | + private List<WebElement> findPageLabels(WebDriver driver) { | ||
928 | + return new WebDriverWait(driver, 15, 300).until(driver1 -> | ||
929 | + driver1.findElements(By.xpath("//ul[@class='el-pager']/li"))); | ||
930 | + } | ||
931 | + | ||
932 | + /** | ||
933 | + * 将获取到的页码标签处理成 K(页码), V(标签element) 格式 | ||
934 | + * | ||
935 | + * @param pageLabels 本次获取到的页码标签 | ||
936 | + * @return K(页码), V(标签element) | ||
937 | + */ | ||
938 | + private Map<Integer, WebElement> processPageElement(List<WebElement> pageLabels) { | ||
939 | + HashMap<Integer, WebElement> map = new HashMap<>(); | ||
940 | + pageLabels.forEach(item -> { | ||
941 | + int pageNum; | ||
942 | + try { | ||
943 | + pageNum = Integer.parseInt(item.getText()); | ||
944 | + } catch (NumberFormatException e) { | ||
945 | + pageNum = -1; | ||
946 | + } | ||
947 | + map.put(pageNum, item); | ||
948 | + }); | ||
949 | + return map; | ||
950 | + } | ||
951 | + | ||
952 | + /** | ||
903 | * 读取http日志获取数据接口全路径地址 | 953 | * 读取http日志获取数据接口全路径地址 |
904 | * | 954 | * |
905 | * @param responseReceived 收到的响应 | 955 | * @param responseReceived 收到的响应 |