Commit 4dad209648c4284dad1ddd9e122eea70be37f439

Authored by 王明元
1 parent a3142deb

2022年11月2日15:13:39 优化

src/main/java/cn/fw/freya/service/crawl/impl/BilibiliCrawl.java
@@ -56,7 +56,7 @@ public class BilibiliCrawl implements CrawlStrategy { @@ -56,7 +56,7 @@ public class BilibiliCrawl implements CrawlStrategy {
56 private final LivePoolDao livePoolDao; 56 private final LivePoolDao livePoolDao;
57 private final AccountDao accountDao; 57 private final AccountDao accountDao;
58 private final Common common; 58 private final Common common;
59 - public final static ConcurrentHashMap<String, WebDriver> DRIVER_MAP = new ConcurrentHashMap<>(); 59 + public static final ConcurrentHashMap<String, WebDriver> DRIVER_MAP = new ConcurrentHashMap<>();
60 private final AccountService accountService; 60 private final AccountService accountService;
61 61
62 @Override 62 @Override
@@ -206,7 +206,7 @@ public class BilibiliCrawl implements CrawlStrategy { @@ -206,7 +206,7 @@ public class BilibiliCrawl implements CrawlStrategy {
206 sb.setLength(0); 206 sb.setLength(0);
207 }); 207 });
208 try { 208 try {
209 - if (videoPoolList.size() > 0) { 209 + if (!videoPoolList.isEmpty()) {
210 videoPoolDao.saveAll(videoPoolList);// 将收集到的视频信息保存 210 videoPoolDao.saveAll(videoPoolList);// 将收集到的视频信息保存
211 } else { 211 } else {
212 final VideoPool nullVideo = VideoPool.builder() 212 final VideoPool nullVideo = VideoPool.builder()
@@ -394,7 +394,7 @@ public class BilibiliCrawl implements CrawlStrategy { @@ -394,7 +394,7 @@ public class BilibiliCrawl implements CrawlStrategy {
394 .build()); 394 .build());
395 }); 395 });
396 try { 396 try {
397 - if (livePoolList.size() > 0) { 397 + if (!livePoolList.isEmpty()) {
398 livePoolDao.saveAll(livePoolList.stream() 398 livePoolDao.saveAll(livePoolList.stream()
399 .filter(item -> item.getOpenTime().compareTo(startTime) >= 0 && item.getOpenTime().compareTo(endTime) <= 0) 399 .filter(item -> item.getOpenTime().compareTo(startTime) >= 0 && item.getOpenTime().compareTo(endTime) <= 0)
400 .collect(Collectors.toList()) 400 .collect(Collectors.toList())
src/main/java/cn/fw/freya/service/crawl/impl/DongCheDiCrawl.java
@@ -56,7 +56,7 @@ public class DongCheDiCrawl implements CrawlStrategy { @@ -56,7 +56,7 @@ public class DongCheDiCrawl implements CrawlStrategy {
56 private final LivePoolDao livePoolDao; 56 private final LivePoolDao livePoolDao;
57 private final AccountDao accountDao; 57 private final AccountDao accountDao;
58 private final Common common; 58 private final Common common;
59 - public final static ConcurrentHashMap<String, WebDriver> DRIVER_MAP = new ConcurrentHashMap<>(); 59 + public static final ConcurrentHashMap<String, WebDriver> DRIVER_MAP = new ConcurrentHashMap<>();
60 private final AccountService accountService; 60 private final AccountService accountService;
61 61
62 /** 62 /**
@@ -231,7 +231,7 @@ public class DongCheDiCrawl implements CrawlStrategy { @@ -231,7 +231,7 @@ public class DongCheDiCrawl implements CrawlStrategy {
231 .build()); 231 .build());
232 }); 232 });
233 try { 233 try {
234 - if (videoPoolList.size() > 0) { 234 + if (!videoPoolList.isEmpty()) {
235 videoPoolDao.saveAll(videoPoolList);// 将收集到的视频信息保存 235 videoPoolDao.saveAll(videoPoolList);// 将收集到的视频信息保存
236 } else { 236 } else {
237 final VideoPool nullVideo = VideoPool.builder() 237 final VideoPool nullVideo = VideoPool.builder()
@@ -353,7 +353,7 @@ public class DongCheDiCrawl implements CrawlStrategy { @@ -353,7 +353,7 @@ public class DongCheDiCrawl implements CrawlStrategy {
353 Date endTime = DateUtil.getThisDayMaxTime(previousDay); 353 Date endTime = DateUtil.getThisDayMaxTime(previousDay);
354 Date startTime = DateUtil.getThisDayMinTime(previousDay); 354 Date startTime = DateUtil.getThisDayMinTime(previousDay);
355 try { 355 try {
356 - if (livePoolList.size() > 0) { 356 + if (!livePoolList.isEmpty()) {
357 livePoolDao.saveAll(livePoolList.stream() 357 livePoolDao.saveAll(livePoolList.stream()
358 .filter(item -> item.getOpenTime().compareTo(startTime) >= 0 && item.getOpenTime().compareTo(endTime) <= 0) 358 .filter(item -> item.getOpenTime().compareTo(startTime) >= 0 && item.getOpenTime().compareTo(endTime) <= 0)
359 .collect(Collectors.toList()) 359 .collect(Collectors.toList())
src/main/java/cn/fw/freya/service/crawl/impl/DouYinCrawl.java
@@ -54,7 +54,7 @@ public class DouYinCrawl implements CrawlStrategy { @@ -54,7 +54,7 @@ public class DouYinCrawl implements CrawlStrategy {
54 private final LiveOverviewDao liveDataDao; 54 private final LiveOverviewDao liveDataDao;
55 private final LivePoolDao livePoolDao; 55 private final LivePoolDao livePoolDao;
56 private final AccountDao accountDao; 56 private final AccountDao accountDao;
57 - public final static ConcurrentHashMap<String, WebDriver> DRIVER_MAP = new ConcurrentHashMap<>(); 57 + public static final ConcurrentHashMap<String, WebDriver> DRIVER_MAP = new ConcurrentHashMap<>();
58 private final Common common; 58 private final Common common;
59 private final AccountService accountService; 59 private final AccountService accountService;
60 private final int WAIT_SECONDS = 5; 60 private final int WAIT_SECONDS = 5;
@@ -233,7 +233,7 @@ public class DouYinCrawl implements CrawlStrategy { @@ -233,7 +233,7 @@ public class DouYinCrawl implements CrawlStrategy {
233 } 233 }
234 }); 234 });
235 try { 235 try {
236 - if (videoPoolList.size() > 0) { 236 + if (!videoPoolList.isEmpty()) {
237 videoPoolDao.saveAll(videoPoolList);// 将收集到的视频信息保存 237 videoPoolDao.saveAll(videoPoolList);// 将收集到的视频信息保存
238 } else { 238 } else {
239 videoPoolDao.save(VideoPool.builder() 239 videoPoolDao.save(VideoPool.builder()
@@ -389,7 +389,7 @@ public class DouYinCrawl implements CrawlStrategy { @@ -389,7 +389,7 @@ public class DouYinCrawl implements CrawlStrategy {
389 throw new BusinessException("外部try-catch, DouyinCrawl->getAllVideoMsg()发生异常"); 389 throw new BusinessException("外部try-catch, DouyinCrawl->getAllVideoMsg()发生异常");
390 } 390 }
391 try { 391 try {
392 - if (videoPoolList.size() > 0) { 392 + if (!videoPoolList.isEmpty()) {
393 videoPoolDao.saveAll(videoPoolList);// 将收集到的视频信息保存 393 videoPoolDao.saveAll(videoPoolList);// 将收集到的视频信息保存
394 } else { 394 } else {
395 videoPoolDao.save(VideoPool.builder() 395 videoPoolDao.save(VideoPool.builder()
@@ -608,7 +608,7 @@ public class DouYinCrawl implements CrawlStrategy { @@ -608,7 +608,7 @@ public class DouYinCrawl implements CrawlStrategy {
608 Date endTime = DateUtil.getThisDayMaxTime(previousDay); 608 Date endTime = DateUtil.getThisDayMaxTime(previousDay);
609 Date startTime = DateUtil.getThisDayMinTime(previousDay); 609 Date startTime = DateUtil.getThisDayMinTime(previousDay);
610 try { 610 try {
611 - if (livePoolList.size() > 0) { 611 + if (!livePoolList.isEmpty()) {
612 livePoolDao.saveAll(livePoolList.stream() 612 livePoolDao.saveAll(livePoolList.stream()
613 .filter(item -> item.getOpenTime().compareTo(startTime) >= 0 && item.getOpenTime().compareTo(endTime) <= 0) 613 .filter(item -> item.getOpenTime().compareTo(startTime) >= 0 && item.getOpenTime().compareTo(endTime) <= 0)
614 .collect(Collectors.toList()) 614 .collect(Collectors.toList())
src/main/java/cn/fw/freya/service/crawl/impl/KuaiShouCrawl.java
@@ -80,7 +80,7 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { @@ -80,7 +80,7 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle {
80 private final LivePoolDao livePoolDao; 80 private final LivePoolDao livePoolDao;
81 private final AccountDao accountDao; 81 private final AccountDao accountDao;
82 private final Common common; 82 private final Common common;
83 - public final static ConcurrentHashMap<String, WebDriver> DRIVER_MAP = new ConcurrentHashMap<>(); 83 + public static final ConcurrentHashMap<String, WebDriver> DRIVER_MAP = new ConcurrentHashMap<>();
84 private final AccountService accountService; 84 private final AccountService accountService;
85 private final String playbackBaseUrl = "https://live.kuaishou.com/playback/"; 85 private final String playbackBaseUrl = "https://live.kuaishou.com/playback/";
86 private final ConcurrentHashMap<String, String> sig3Map = new ConcurrentHashMap<>(); 86 private final ConcurrentHashMap<String, String> sig3Map = new ConcurrentHashMap<>();
@@ -327,10 +327,10 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { @@ -327,10 +327,10 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle {
327 total = Objects.requireNonNull(obj).getInteger("totalCount"); 327 total = Objects.requireNonNull(obj).getInteger("totalCount");
328 videoJsonArray.addAll(Optional.ofNullable(obj.getJSONArray("photoList")).orElse(new JSONArray())); 328 videoJsonArray.addAll(Optional.ofNullable(obj.getJSONArray("photoList")).orElse(new JSONArray()));
329 page++; 329 page++;
330 - JSONArray photoList = obj.getJSONArray("photoList"); 330 + /*JSONArray photoList = obj.getJSONArray("photoList");
331 String publishTime = JSON.parseObject(JSON.toJSONString(photoList.get(photoList.size() - 1))).getString("publishTime"); 331 String publishTime = JSON.parseObject(JSON.toJSONString(photoList.get(photoList.size() - 1))).getString("publishTime");
332 if (Objects.requireNonNull(DateUtil.parse(publishTime)).compareTo(DateUtil.getMonthFirstDay(DateUtil.getThisDayMinTime(previousDay))) < 0) 332 if (Objects.requireNonNull(DateUtil.parse(publishTime)).compareTo(DateUtil.getMonthFirstDay(DateUtil.getThisDayMinTime(previousDay))) < 0)
333 - break; 333 + break;*/
334 } while (total > 10 * (page - 1)); 334 } while (total > 10 * (page - 1));
335 /*HttpCookies cookies = HttpCookies.custom(); 335 /*HttpCookies cookies = HttpCookies.custom();
336 CookieStore cookieStore = new BasicCookieStore(); 336 CookieStore cookieStore = new BasicCookieStore();
@@ -581,7 +581,7 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { @@ -581,7 +581,7 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle {
581 .build()); 581 .build());
582 }); 582 });
583 try { 583 try {
584 - if (livePoolList.size() > 0) { 584 + if (!livePoolList.isEmpty()) {
585 livePoolDao.saveAll(livePoolList.stream() 585 livePoolDao.saveAll(livePoolList.stream()
586 .filter(item -> item.getOpenTime().compareTo(startTime) >= 0 && item.getOpenTime().compareTo(endTime) <= 0) 586 .filter(item -> item.getOpenTime().compareTo(startTime) >= 0 && item.getOpenTime().compareTo(endTime) <= 0)
587 .collect(Collectors.toList()) 587 .collect(Collectors.toList())
@@ -850,21 +850,23 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { @@ -850,21 +850,23 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle {
850 } 850 }
851 int maxPageNum = 0; 851 int maxPageNum = 0;
852 if (videoDimension) { 852 if (videoDimension) {
853 - List<WebElement> pageLabels = new WebDriverWait(driver, 15, 300).until(driver1 ->  
854 - driver1.findElements(By.xpath("//ul[@class='el-pager']/li")));// 获取分页页码标签元素列表 853 + List<WebElement> pageLabels = this.findPageLabels(driver);// 获取分页页码标签元素列表
  854 + Map<Integer, WebElement> labelMap = this.processPageElement(pageLabels);// 将标签元素处理成K(页码), V(标签element)
855 if (!CollectionUtils.isEmpty(pageLabels)) { 855 if (!CollectionUtils.isEmpty(pageLabels)) {
856 maxPageNum = Integer.parseInt(pageLabels.get(pageLabels.size() - 1).getText());// 最大页码 856 maxPageNum = Integer.parseInt(pageLabels.get(pageLabels.size() - 1).getText());// 最大页码
857 - for (WebElement item : pageLabels) { 857 + for (int i = 0; i < maxPageNum; i++) {
858 try { 858 try {
859 - /*if (Objects.equals(item.getText(), "...")) {  
860 - pageLabels = new WebDriverWait(driver, 15, 300).until(driver1 ->  
861 - driver1.findElements(By.xpath("//ul[@class='el-pager']/li")));  
862 - item.click();  
863 - }*/  
864 - item.click(); 859 + WebElement pageLabel = this.findPageLabelFromMap(labelMap, i + 1);
  860 + if (Objects.isNull(pageLabel)) {
  861 + LockSupport.parkNanos(TimeUnit.SECONDS.toNanos(1));
  862 + pageLabels = this.findPageLabels(driver);
  863 + labelMap = this.processPageElement(pageLabels);
  864 + pageLabel = this.findPageLabelFromMap(labelMap, i + 1);
  865 + }
  866 + pageLabel.click();
865 LockSupport.parkNanos(TimeUnit.SECONDS.toNanos(2)); 867 LockSupport.parkNanos(TimeUnit.SECONDS.toNanos(2));
866 } catch (Exception e) { 868 } catch (Exception e) {
867 - // this.exitBrowser(accountNo, uuid); 869 + log.info("异常发生, 信息为: {}", e.getMessage(), e);
868 } 870 }
869 } 871 }
870 } 872 }
@@ -888,8 +890,6 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { @@ -888,8 +890,6 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle {
888 } 890 }
889 for (int i = 0; i < sigList.size(); i++) { 891 for (int i = 0; i < sigList.size(); i++) {
890 sig3Map.put(accountNo + "#" + type + "#" + (i + 1), sigList.get(i)); 892 sig3Map.put(accountNo + "#" + type + "#" + (i + 1), sigList.get(i));
891 - if (sigList.size() < maxPageNum && i > 4)  
892 - break;  
893 } 893 }
894 } catch (Exception e) { 894 } catch (Exception e) {
895 this.exitBrowser(accountNo, uuid); 895 this.exitBrowser(accountNo, uuid);
@@ -900,6 +900,56 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { @@ -900,6 +900,56 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle {
900 } 900 }
901 901
902 /** 902 /**
  903 + * 获取对应页码的WebElement对象
  904 + *
  905 + * @param labelMap K(页码), V(标签element)
  906 + * @param page 要获取的页码
  907 + * @return 页码对应的WebElement对象
  908 + */
  909 + private WebElement findPageLabelFromMap(Map<Integer, WebElement> labelMap, Integer page) {
  910 + WebElement webElement = null;
  911 + try {
  912 + webElement = labelMap.get(page);
  913 + } catch (Exception e) {
  914 + log.error(e.getMessage(), e);
  915 + }
  916 + if (Objects.nonNull(webElement))
  917 + return webElement;
  918 + return null;
  919 + }
  920 +
  921 + /**
  922 + * 寻找页面页码标签集合
  923 + *
  924 + * @param driver 驱动
  925 + * @return 页面页码标签元素
  926 + */
  927 + private List<WebElement> findPageLabels(WebDriver driver) {
  928 + return new WebDriverWait(driver, 15, 300).until(driver1 ->
  929 + driver1.findElements(By.xpath("//ul[@class='el-pager']/li")));
  930 + }
  931 +
  932 + /**
  933 + * 将获取到的页码标签处理成 K(页码), V(标签element) 格式
  934 + *
  935 + * @param pageLabels 本次获取到的页码标签
  936 + * @return K(页码), V(标签element)
  937 + */
  938 + private Map<Integer, WebElement> processPageElement(List<WebElement> pageLabels) {
  939 + HashMap<Integer, WebElement> map = new HashMap<>();
  940 + pageLabels.forEach(item -> {
  941 + int pageNum;
  942 + try {
  943 + pageNum = Integer.parseInt(item.getText());
  944 + } catch (NumberFormatException e) {
  945 + pageNum = -1;
  946 + }
  947 + map.put(pageNum, item);
  948 + });
  949 + return map;
  950 + }
  951 +
  952 + /**
903 * 读取http日志获取数据接口全路径地址 953 * 读取http日志获取数据接口全路径地址
904 * 954 *
905 * @param responseReceived 收到的响应 955 * @param responseReceived 收到的响应