Commit 4dad209648c4284dad1ddd9e122eea70be37f439
1 parent
a3142deb
2022年11月2日15:13:39 优化
Showing
4 changed files
with
76 additions
and
26 deletions
src/main/java/cn/fw/freya/service/crawl/impl/BilibiliCrawl.java
... | ... | @@ -56,7 +56,7 @@ public class BilibiliCrawl implements CrawlStrategy { |
56 | 56 | private final LivePoolDao livePoolDao; |
57 | 57 | private final AccountDao accountDao; |
58 | 58 | private final Common common; |
59 | - public final static ConcurrentHashMap<String, WebDriver> DRIVER_MAP = new ConcurrentHashMap<>(); | |
59 | + public static final ConcurrentHashMap<String, WebDriver> DRIVER_MAP = new ConcurrentHashMap<>(); | |
60 | 60 | private final AccountService accountService; |
61 | 61 | |
62 | 62 | @Override |
... | ... | @@ -206,7 +206,7 @@ public class BilibiliCrawl implements CrawlStrategy { |
206 | 206 | sb.setLength(0); |
207 | 207 | }); |
208 | 208 | try { |
209 | - if (videoPoolList.size() > 0) { | |
209 | + if (!videoPoolList.isEmpty()) { | |
210 | 210 | videoPoolDao.saveAll(videoPoolList);// 将收集到的视频信息保存 |
211 | 211 | } else { |
212 | 212 | final VideoPool nullVideo = VideoPool.builder() |
... | ... | @@ -394,7 +394,7 @@ public class BilibiliCrawl implements CrawlStrategy { |
394 | 394 | .build()); |
395 | 395 | }); |
396 | 396 | try { |
397 | - if (livePoolList.size() > 0) { | |
397 | + if (!livePoolList.isEmpty()) { | |
398 | 398 | livePoolDao.saveAll(livePoolList.stream() |
399 | 399 | .filter(item -> item.getOpenTime().compareTo(startTime) >= 0 && item.getOpenTime().compareTo(endTime) <= 0) |
400 | 400 | .collect(Collectors.toList()) | ... | ... |
src/main/java/cn/fw/freya/service/crawl/impl/DongCheDiCrawl.java
... | ... | @@ -56,7 +56,7 @@ public class DongCheDiCrawl implements CrawlStrategy { |
56 | 56 | private final LivePoolDao livePoolDao; |
57 | 57 | private final AccountDao accountDao; |
58 | 58 | private final Common common; |
59 | - public final static ConcurrentHashMap<String, WebDriver> DRIVER_MAP = new ConcurrentHashMap<>(); | |
59 | + public static final ConcurrentHashMap<String, WebDriver> DRIVER_MAP = new ConcurrentHashMap<>(); | |
60 | 60 | private final AccountService accountService; |
61 | 61 | |
62 | 62 | /** |
... | ... | @@ -231,7 +231,7 @@ public class DongCheDiCrawl implements CrawlStrategy { |
231 | 231 | .build()); |
232 | 232 | }); |
233 | 233 | try { |
234 | - if (videoPoolList.size() > 0) { | |
234 | + if (!videoPoolList.isEmpty()) { | |
235 | 235 | videoPoolDao.saveAll(videoPoolList);// 将收集到的视频信息保存 |
236 | 236 | } else { |
237 | 237 | final VideoPool nullVideo = VideoPool.builder() |
... | ... | @@ -353,7 +353,7 @@ public class DongCheDiCrawl implements CrawlStrategy { |
353 | 353 | Date endTime = DateUtil.getThisDayMaxTime(previousDay); |
354 | 354 | Date startTime = DateUtil.getThisDayMinTime(previousDay); |
355 | 355 | try { |
356 | - if (livePoolList.size() > 0) { | |
356 | + if (!livePoolList.isEmpty()) { | |
357 | 357 | livePoolDao.saveAll(livePoolList.stream() |
358 | 358 | .filter(item -> item.getOpenTime().compareTo(startTime) >= 0 && item.getOpenTime().compareTo(endTime) <= 0) |
359 | 359 | .collect(Collectors.toList()) | ... | ... |
src/main/java/cn/fw/freya/service/crawl/impl/DouYinCrawl.java
... | ... | @@ -54,7 +54,7 @@ public class DouYinCrawl implements CrawlStrategy { |
54 | 54 | private final LiveOverviewDao liveDataDao; |
55 | 55 | private final LivePoolDao livePoolDao; |
56 | 56 | private final AccountDao accountDao; |
57 | - public final static ConcurrentHashMap<String, WebDriver> DRIVER_MAP = new ConcurrentHashMap<>(); | |
57 | + public static final ConcurrentHashMap<String, WebDriver> DRIVER_MAP = new ConcurrentHashMap<>(); | |
58 | 58 | private final Common common; |
59 | 59 | private final AccountService accountService; |
60 | 60 | private final int WAIT_SECONDS = 5; |
... | ... | @@ -233,7 +233,7 @@ public class DouYinCrawl implements CrawlStrategy { |
233 | 233 | } |
234 | 234 | }); |
235 | 235 | try { |
236 | - if (videoPoolList.size() > 0) { | |
236 | + if (!videoPoolList.isEmpty()) { | |
237 | 237 | videoPoolDao.saveAll(videoPoolList);// 将收集到的视频信息保存 |
238 | 238 | } else { |
239 | 239 | videoPoolDao.save(VideoPool.builder() |
... | ... | @@ -389,7 +389,7 @@ public class DouYinCrawl implements CrawlStrategy { |
389 | 389 | throw new BusinessException("外部try-catch, DouyinCrawl->getAllVideoMsg()发生异常"); |
390 | 390 | } |
391 | 391 | try { |
392 | - if (videoPoolList.size() > 0) { | |
392 | + if (!videoPoolList.isEmpty()) { | |
393 | 393 | videoPoolDao.saveAll(videoPoolList);// 将收集到的视频信息保存 |
394 | 394 | } else { |
395 | 395 | videoPoolDao.save(VideoPool.builder() |
... | ... | @@ -608,7 +608,7 @@ public class DouYinCrawl implements CrawlStrategy { |
608 | 608 | Date endTime = DateUtil.getThisDayMaxTime(previousDay); |
609 | 609 | Date startTime = DateUtil.getThisDayMinTime(previousDay); |
610 | 610 | try { |
611 | - if (livePoolList.size() > 0) { | |
611 | + if (!livePoolList.isEmpty()) { | |
612 | 612 | livePoolDao.saveAll(livePoolList.stream() |
613 | 613 | .filter(item -> item.getOpenTime().compareTo(startTime) >= 0 && item.getOpenTime().compareTo(endTime) <= 0) |
614 | 614 | .collect(Collectors.toList()) | ... | ... |
src/main/java/cn/fw/freya/service/crawl/impl/KuaiShouCrawl.java
... | ... | @@ -80,7 +80,7 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { |
80 | 80 | private final LivePoolDao livePoolDao; |
81 | 81 | private final AccountDao accountDao; |
82 | 82 | private final Common common; |
83 | - public final static ConcurrentHashMap<String, WebDriver> DRIVER_MAP = new ConcurrentHashMap<>(); | |
83 | + public static final ConcurrentHashMap<String, WebDriver> DRIVER_MAP = new ConcurrentHashMap<>(); | |
84 | 84 | private final AccountService accountService; |
85 | 85 | private final String playbackBaseUrl = "https://live.kuaishou.com/playback/"; |
86 | 86 | private final ConcurrentHashMap<String, String> sig3Map = new ConcurrentHashMap<>(); |
... | ... | @@ -327,10 +327,10 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { |
327 | 327 | total = Objects.requireNonNull(obj).getInteger("totalCount"); |
328 | 328 | videoJsonArray.addAll(Optional.ofNullable(obj.getJSONArray("photoList")).orElse(new JSONArray())); |
329 | 329 | page++; |
330 | - JSONArray photoList = obj.getJSONArray("photoList"); | |
330 | + /*JSONArray photoList = obj.getJSONArray("photoList"); | |
331 | 331 | String publishTime = JSON.parseObject(JSON.toJSONString(photoList.get(photoList.size() - 1))).getString("publishTime"); |
332 | 332 | if (Objects.requireNonNull(DateUtil.parse(publishTime)).compareTo(DateUtil.getMonthFirstDay(DateUtil.getThisDayMinTime(previousDay))) < 0) |
333 | - break; | |
333 | + break;*/ | |
334 | 334 | } while (total > 10 * (page - 1)); |
335 | 335 | /*HttpCookies cookies = HttpCookies.custom(); |
336 | 336 | CookieStore cookieStore = new BasicCookieStore(); |
... | ... | @@ -581,7 +581,7 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { |
581 | 581 | .build()); |
582 | 582 | }); |
583 | 583 | try { |
584 | - if (livePoolList.size() > 0) { | |
584 | + if (!livePoolList.isEmpty()) { | |
585 | 585 | livePoolDao.saveAll(livePoolList.stream() |
586 | 586 | .filter(item -> item.getOpenTime().compareTo(startTime) >= 0 && item.getOpenTime().compareTo(endTime) <= 0) |
587 | 587 | .collect(Collectors.toList()) |
... | ... | @@ -850,21 +850,23 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { |
850 | 850 | } |
851 | 851 | int maxPageNum = 0; |
852 | 852 | if (videoDimension) { |
853 | - List<WebElement> pageLabels = new WebDriverWait(driver, 15, 300).until(driver1 -> | |
854 | - driver1.findElements(By.xpath("//ul[@class='el-pager']/li")));// 获取分页页码标签元素列表 | |
853 | + List<WebElement> pageLabels = this.findPageLabels(driver);// 获取分页页码标签元素列表 | |
854 | + Map<Integer, WebElement> labelMap = this.processPageElement(pageLabels);// 将标签元素处理成K(页码), V(标签element) | |
855 | 855 | if (!CollectionUtils.isEmpty(pageLabels)) { |
856 | 856 | maxPageNum = Integer.parseInt(pageLabels.get(pageLabels.size() - 1).getText());// 最大页码 |
857 | - for (WebElement item : pageLabels) { | |
857 | + for (int i = 0; i < maxPageNum; i++) { | |
858 | 858 | try { |
859 | - /*if (Objects.equals(item.getText(), "...")) { | |
860 | - pageLabels = new WebDriverWait(driver, 15, 300).until(driver1 -> | |
861 | - driver1.findElements(By.xpath("//ul[@class='el-pager']/li"))); | |
862 | - item.click(); | |
863 | - }*/ | |
864 | - item.click(); | |
859 | + WebElement pageLabel = this.findPageLabelFromMap(labelMap, i + 1); | |
860 | + if (Objects.isNull(pageLabel)) { | |
861 | + LockSupport.parkNanos(TimeUnit.SECONDS.toNanos(1)); | |
862 | + pageLabels = this.findPageLabels(driver); | |
863 | + labelMap = this.processPageElement(pageLabels); | |
864 | + pageLabel = this.findPageLabelFromMap(labelMap, i + 1); | |
865 | + } | |
866 | + pageLabel.click(); | |
865 | 867 | LockSupport.parkNanos(TimeUnit.SECONDS.toNanos(2)); |
866 | 868 | } catch (Exception e) { |
867 | - // this.exitBrowser(accountNo, uuid); | |
869 | + log.info("异常发生, 信息为: {}", e.getMessage(), e); | |
868 | 870 | } |
869 | 871 | } |
870 | 872 | } |
... | ... | @@ -888,8 +890,6 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { |
888 | 890 | } |
889 | 891 | for (int i = 0; i < sigList.size(); i++) { |
890 | 892 | sig3Map.put(accountNo + "#" + type + "#" + (i + 1), sigList.get(i)); |
891 | - if (sigList.size() < maxPageNum && i > 4) | |
892 | - break; | |
893 | 893 | } |
894 | 894 | } catch (Exception e) { |
895 | 895 | this.exitBrowser(accountNo, uuid); |
... | ... | @@ -900,6 +900,56 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { |
900 | 900 | } |
901 | 901 | |
902 | 902 | /** |
903 | + * Looks up the WebElement registered for the given page number. | |
904 | + * Any exception thrown by the map lookup is logged and treated as "not found". | |
905 | + * @param labelMap map of page number to pager label element | |
906 | + * @param page page number to look up | |
907 | + * @return the element mapped to {@code page}, or {@code null} if none is mapped or the lookup threw | |
908 | + */ | |
909 | + private WebElement findPageLabelFromMap(Map<Integer, WebElement> labelMap, Integer page) { | |
910 | + WebElement webElement = null; | |
911 | + try { | |
912 | + webElement = labelMap.get(page); | |
913 | + } catch (Exception e) { | |
914 | + log.error(e.getMessage(), e); | |
915 | + } | |
916 | + if (Objects.nonNull(webElement)) | |
917 | + return webElement; | |
918 | + return null; | |
919 | + } | |
920 | + | |
921 | + /** | |
922 | + * Finds the pager label elements: the {@code <li>} children of the {@code <ul class="el-pager">} list. | |
923 | + * NOTE(review): presumably 15 s timeout / 300 ms polling; findElements likely returns an empty list (not null), so until() may return immediately - confirm against Selenium docs. | |
924 | + * | |
925 | + * @param driver WebDriver used to query the page | |
926 | + * @return the label elements produced by the wait | |
927 | + */ | |
927 | + private List<WebElement> findPageLabels(WebDriver driver) { | |
928 | + return new WebDriverWait(driver, 15, 300).until(driver1 -> | |
929 | + driver1.findElements(By.xpath("//ul[@class='el-pager']/li"))); | |
930 | + } | |
931 | + | |
932 | + /** | |
933 | + * Indexes the pager labels by the page number parsed from each label's text. | |
934 | + * Labels whose text is not an integer are all stored under key -1, so only the last such label is kept. | |
935 | + * | |
936 | + * @param pageLabels pager label elements found on this pass | |
937 | + * @return map of page number to label element | |
937 | + */ | |
938 | + private Map<Integer, WebElement> processPageElement(List<WebElement> pageLabels) { | |
939 | + HashMap<Integer, WebElement> map = new HashMap<>(); | |
940 | + pageLabels.forEach(item -> { | |
941 | + int pageNum; | |
942 | + try { | |
943 | + pageNum = Integer.parseInt(item.getText()); | |
944 | + } catch (NumberFormatException e) { | |
945 | + pageNum = -1; | |
946 | + } | |
947 | + map.put(pageNum, item); | |
948 | + }); | |
949 | + return map; | |
950 | + } | |
951 | + | |
952 | + /** | |
903 | 953 | * 读取http日志获取数据接口全路径地址 |
904 | 954 | * |
905 | 955 | * @param responseReceived 收到的响应 | ... | ... |