Commit 4dad209648c4284dad1ddd9e122eea70be37f439

Authored by 王明元
1 parent a3142deb

2022年11月2日15:13:39 优化

src/main/java/cn/fw/freya/service/crawl/impl/BilibiliCrawl.java
... ... @@ -56,7 +56,7 @@ public class BilibiliCrawl implements CrawlStrategy {
56 56 private final LivePoolDao livePoolDao;
57 57 private final AccountDao accountDao;
58 58 private final Common common;
59   - public final static ConcurrentHashMap<String, WebDriver> DRIVER_MAP = new ConcurrentHashMap<>();
  59 + public static final ConcurrentHashMap<String, WebDriver> DRIVER_MAP = new ConcurrentHashMap<>();
60 60 private final AccountService accountService;
61 61  
62 62 @Override
... ... @@ -206,7 +206,7 @@ public class BilibiliCrawl implements CrawlStrategy {
206 206 sb.setLength(0);
207 207 });
208 208 try {
209   - if (videoPoolList.size() > 0) {
  209 + if (!videoPoolList.isEmpty()) {
210 210 videoPoolDao.saveAll(videoPoolList);// 将收集到的视频信息保存
211 211 } else {
212 212 final VideoPool nullVideo = VideoPool.builder()
... ... @@ -394,7 +394,7 @@ public class BilibiliCrawl implements CrawlStrategy {
394 394 .build());
395 395 });
396 396 try {
397   - if (livePoolList.size() > 0) {
  397 + if (!livePoolList.isEmpty()) {
398 398 livePoolDao.saveAll(livePoolList.stream()
399 399 .filter(item -> item.getOpenTime().compareTo(startTime) >= 0 && item.getOpenTime().compareTo(endTime) <= 0)
400 400 .collect(Collectors.toList())
... ...
src/main/java/cn/fw/freya/service/crawl/impl/DongCheDiCrawl.java
... ... @@ -56,7 +56,7 @@ public class DongCheDiCrawl implements CrawlStrategy {
56 56 private final LivePoolDao livePoolDao;
57 57 private final AccountDao accountDao;
58 58 private final Common common;
59   - public final static ConcurrentHashMap<String, WebDriver> DRIVER_MAP = new ConcurrentHashMap<>();
  59 + public static final ConcurrentHashMap<String, WebDriver> DRIVER_MAP = new ConcurrentHashMap<>();
60 60 private final AccountService accountService;
61 61  
62 62 /**
... ... @@ -231,7 +231,7 @@ public class DongCheDiCrawl implements CrawlStrategy {
231 231 .build());
232 232 });
233 233 try {
234   - if (videoPoolList.size() > 0) {
  234 + if (!videoPoolList.isEmpty()) {
235 235 videoPoolDao.saveAll(videoPoolList);// 将收集到的视频信息保存
236 236 } else {
237 237 final VideoPool nullVideo = VideoPool.builder()
... ... @@ -353,7 +353,7 @@ public class DongCheDiCrawl implements CrawlStrategy {
353 353 Date endTime = DateUtil.getThisDayMaxTime(previousDay);
354 354 Date startTime = DateUtil.getThisDayMinTime(previousDay);
355 355 try {
356   - if (livePoolList.size() > 0) {
  356 + if (!livePoolList.isEmpty()) {
357 357 livePoolDao.saveAll(livePoolList.stream()
358 358 .filter(item -> item.getOpenTime().compareTo(startTime) >= 0 && item.getOpenTime().compareTo(endTime) <= 0)
359 359 .collect(Collectors.toList())
... ...
src/main/java/cn/fw/freya/service/crawl/impl/DouYinCrawl.java
... ... @@ -54,7 +54,7 @@ public class DouYinCrawl implements CrawlStrategy {
54 54 private final LiveOverviewDao liveDataDao;
55 55 private final LivePoolDao livePoolDao;
56 56 private final AccountDao accountDao;
57   - public final static ConcurrentHashMap<String, WebDriver> DRIVER_MAP = new ConcurrentHashMap<>();
  57 + public static final ConcurrentHashMap<String, WebDriver> DRIVER_MAP = new ConcurrentHashMap<>();
58 58 private final Common common;
59 59 private final AccountService accountService;
60 60 private final int WAIT_SECONDS = 5;
... ... @@ -233,7 +233,7 @@ public class DouYinCrawl implements CrawlStrategy {
233 233 }
234 234 });
235 235 try {
236   - if (videoPoolList.size() > 0) {
  236 + if (!videoPoolList.isEmpty()) {
237 237 videoPoolDao.saveAll(videoPoolList);// 将收集到的视频信息保存
238 238 } else {
239 239 videoPoolDao.save(VideoPool.builder()
... ... @@ -389,7 +389,7 @@ public class DouYinCrawl implements CrawlStrategy {
389 389 throw new BusinessException("外部try-catch, DouyinCrawl->getAllVideoMsg()发生异常");
390 390 }
391 391 try {
392   - if (videoPoolList.size() > 0) {
  392 + if (!videoPoolList.isEmpty()) {
393 393 videoPoolDao.saveAll(videoPoolList);// 将收集到的视频信息保存
394 394 } else {
395 395 videoPoolDao.save(VideoPool.builder()
... ... @@ -608,7 +608,7 @@ public class DouYinCrawl implements CrawlStrategy {
608 608 Date endTime = DateUtil.getThisDayMaxTime(previousDay);
609 609 Date startTime = DateUtil.getThisDayMinTime(previousDay);
610 610 try {
611   - if (livePoolList.size() > 0) {
  611 + if (!livePoolList.isEmpty()) {
612 612 livePoolDao.saveAll(livePoolList.stream()
613 613 .filter(item -> item.getOpenTime().compareTo(startTime) >= 0 && item.getOpenTime().compareTo(endTime) <= 0)
614 614 .collect(Collectors.toList())
... ...
src/main/java/cn/fw/freya/service/crawl/impl/KuaiShouCrawl.java
... ... @@ -80,7 +80,7 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle {
80 80 private final LivePoolDao livePoolDao;
81 81 private final AccountDao accountDao;
82 82 private final Common common;
83   - public final static ConcurrentHashMap<String, WebDriver> DRIVER_MAP = new ConcurrentHashMap<>();
  83 + public static final ConcurrentHashMap<String, WebDriver> DRIVER_MAP = new ConcurrentHashMap<>();
84 84 private final AccountService accountService;
85 85 private final String playbackBaseUrl = "https://live.kuaishou.com/playback/";
86 86 private final ConcurrentHashMap<String, String> sig3Map = new ConcurrentHashMap<>();
... ... @@ -327,10 +327,10 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle {
327 327 total = Objects.requireNonNull(obj).getInteger("totalCount");
328 328 videoJsonArray.addAll(Optional.ofNullable(obj.getJSONArray("photoList")).orElse(new JSONArray()));
329 329 page++;
330   - JSONArray photoList = obj.getJSONArray("photoList");
  330 + /*JSONArray photoList = obj.getJSONArray("photoList");
331 331 String publishTime = JSON.parseObject(JSON.toJSONString(photoList.get(photoList.size() - 1))).getString("publishTime");
332 332 if (Objects.requireNonNull(DateUtil.parse(publishTime)).compareTo(DateUtil.getMonthFirstDay(DateUtil.getThisDayMinTime(previousDay))) < 0)
333   - break;
  333 + break;*/
334 334 } while (total > 10 * (page - 1));
335 335 /*HttpCookies cookies = HttpCookies.custom();
336 336 CookieStore cookieStore = new BasicCookieStore();
... ... @@ -581,7 +581,7 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle {
581 581 .build());
582 582 });
583 583 try {
584   - if (livePoolList.size() > 0) {
  584 + if (!livePoolList.isEmpty()) {
585 585 livePoolDao.saveAll(livePoolList.stream()
586 586 .filter(item -> item.getOpenTime().compareTo(startTime) >= 0 && item.getOpenTime().compareTo(endTime) <= 0)
587 587 .collect(Collectors.toList())
... ... @@ -850,21 +850,23 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle {
850 850 }
851 851 int maxPageNum = 0;
852 852 if (videoDimension) {
853   - List<WebElement> pageLabels = new WebDriverWait(driver, 15, 300).until(driver1 ->
854   - driver1.findElements(By.xpath("//ul[@class='el-pager']/li")));// 获取分页页码标签元素列表
  853 + List<WebElement> pageLabels = this.findPageLabels(driver);// 获取分页页码标签元素列表
  854 + Map<Integer, WebElement> labelMap = this.processPageElement(pageLabels);// 将标签元素处理成K(页码), V(标签element)
855 855 if (!CollectionUtils.isEmpty(pageLabels)) {
856 856 maxPageNum = Integer.parseInt(pageLabels.get(pageLabels.size() - 1).getText());// 最大页码
857   - for (WebElement item : pageLabels) {
  857 + for (int i = 0; i < maxPageNum; i++) {
858 858 try {
859   - /*if (Objects.equals(item.getText(), "...")) {
860   - pageLabels = new WebDriverWait(driver, 15, 300).until(driver1 ->
861   - driver1.findElements(By.xpath("//ul[@class='el-pager']/li")));
862   - item.click();
863   - }*/
864   - item.click();
  859 + WebElement pageLabel = this.findPageLabelFromMap(labelMap, i + 1);
  860 + if (Objects.isNull(pageLabel)) {
  861 + LockSupport.parkNanos(TimeUnit.SECONDS.toNanos(1));
  862 + pageLabels = this.findPageLabels(driver);
  863 + labelMap = this.processPageElement(pageLabels);
  864 + pageLabel = this.findPageLabelFromMap(labelMap, i + 1);
  865 + }
  866 + pageLabel.click();
865 867 LockSupport.parkNanos(TimeUnit.SECONDS.toNanos(2));
866 868 } catch (Exception e) {
867   - // this.exitBrowser(accountNo, uuid);
  869 + log.info("异常发生, 信息为: {}", e.getMessage(), e);
868 870 }
869 871 }
870 872 }
... ... @@ -888,8 +890,6 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle {
888 890 }
889 891 for (int i = 0; i < sigList.size(); i++) {
890 892 sig3Map.put(accountNo + "#" + type + "#" + (i + 1), sigList.get(i));
891   - if (sigList.size() < maxPageNum && i > 4)
892   - break;
893 893 }
894 894 } catch (Exception e) {
895 895 this.exitBrowser(accountNo, uuid);
... ... @@ -900,6 +900,56 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle {
900 900 }
901 901  
902 902 /**
  903 + * Looks up the WebElement stored for a page number in the given label map.
  904 + *
  905 + * @param labelMap map of page number (key) to pager label element (value)
  906 + * @param page page number to look up
  907 + * @return the element stored under {@code page}, or {@code null} when the map has no entry for it or the lookup threw
  908 + */
  909 + private WebElement findPageLabelFromMap(Map<Integer, WebElement> labelMap, Integer page) {
  910 + WebElement webElement = null;
  911 + try {
  912 + webElement = labelMap.get(page);
  913 + } catch (Exception e) { // any lookup failure is logged and treated as "not found"
  914 + log.error(e.getMessage(), e);
  915 + }
  916 + if (Objects.nonNull(webElement))
  917 + return webElement;
  918 + return null; // NOTE(review): same result as returning webElement directly; the non-null check above is redundant
  919 + }
  920 +
  921 + /**
  922 + * Collects the pager label elements from the page, via a WebDriverWait built with (driver, 15, 300) - presumably a 15 s timeout and 300 ms poll interval; confirm against the Selenium version in use. NOTE(review): findElements presumably yields an empty list rather than null when nothing matches, so the wait may end on the first poll - confirm.
  923 + *
  924 + * @param driver the WebDriver to query
  925 + * @return the elements matching the XPath {@code //ul[@class='el-pager']/li}
  926 + */
  927 + private List<WebElement> findPageLabels(WebDriver driver) {
  928 + return new WebDriverWait(driver, 15, 300).until(driver1 ->
  929 + driver1.findElements(By.xpath("//ul[@class='el-pager']/li")));
  930 + }
  931 +
  932 + /**
  933 + * Indexes the pager label elements by page number: key = label text parsed as an integer, value = the element. Labels whose text is not an integer all share the key -1, so only the last such label is kept.
  934 + *
  935 + * @param pageLabels pager label elements found in this pass
  936 + * @return map of page number (key) to label element (value)
  937 + */
  938 + private Map<Integer, WebElement> processPageElement(List<WebElement> pageLabels) {
  939 + HashMap<Integer, WebElement> map = new HashMap<>();
  940 + pageLabels.forEach(item -> {
  941 + int pageNum;
  942 + try {
  943 + pageNum = Integer.parseInt(item.getText());
  944 + } catch (NumberFormatException e) {
  945 + pageNum = -1; // sentinel key for labels whose text is not a number
  946 + }
  947 + map.put(pageNum, item); // a later label with the same key replaces an earlier one
  948 + });
  949 + return map;
  950 + }
  951 +
  952 + /**
903 953 * 读取http日志获取数据接口全路径地址
904 954 *
905 955 * @param responseReceived 收到的响应
... ...