diff --git a/src/main/java/cn/fw/freya/FreyaApplication.java b/src/main/java/cn/fw/freya/FreyaApplication.java index 6b9a987..7d3b485 100644 --- a/src/main/java/cn/fw/freya/FreyaApplication.java +++ b/src/main/java/cn/fw/freya/FreyaApplication.java @@ -40,7 +40,7 @@ public class FreyaApplication { 6, 30, TimeUnit.SECONDS, - new LinkedBlockingQueue<>(1000), + new LinkedBlockingQueue<>(100000), new ThreadFactoryBuilder().setNamePrefix("wmy-thread-pool-").build(), new ThreadPoolExecutor.DiscardPolicy()); } diff --git a/src/main/java/cn/fw/freya/controller/KSController.java b/src/main/java/cn/fw/freya/controller/KSController.java index 0191221..bc54f1f 100644 --- a/src/main/java/cn/fw/freya/controller/KSController.java +++ b/src/main/java/cn/fw/freya/controller/KSController.java @@ -81,7 +81,7 @@ public class KSController { */ @GetMapping("/getNS_sig3") public String getNS_sig3(String accountNo, Integer type, boolean retryGet) { - return kuaishouCrawl.getNS_sig3(accountNo, type, retryGet); + return kuaishouCrawl.getNS_sig3(accountNo, type, 1, retryGet); } /** diff --git a/src/main/java/cn/fw/freya/service/crawl/impl/BilibiliCrawl.java b/src/main/java/cn/fw/freya/service/crawl/impl/BilibiliCrawl.java index 91b9c35..f686c04 100644 --- a/src/main/java/cn/fw/freya/service/crawl/impl/BilibiliCrawl.java +++ b/src/main/java/cn/fw/freya/service/crawl/impl/BilibiliCrawl.java @@ -254,7 +254,7 @@ public class BilibiliCrawl implements CrawlStrategy { ); String res = RequestUtil.get(config);// 发送GET请求 this.waitFor(0.5); - final JSONObject response = JSONObject.parseObject(res); + final JSONObject response = JSON.parseObject(res); if (this.verifyCookies(response)) { return null; } @@ -288,7 +288,7 @@ public class BilibiliCrawl implements CrawlStrategy { ); String res = RequestUtil.get(config);// 发送GET请求 this.waitFor(0.1); - final JSONObject response = JSONObject.parseObject(res); + final JSONObject response = JSON.parseObject(res); if (this.verifyCookies(response)) { return null; } @@ -333,7 +333,7 @@ public class BilibiliCrawl implements CrawlStrategy { ); String res = RequestUtil.get(config);// 发送GET请求 log.info(String.format("%s [%s]平台账户号为: %s的直播数据的原始数据为: %s", LocalDateTime.now(), this.getType().getName(), accountNo, res)); - final JSONObject response = JSONObject.parseObject(res); + final JSONObject response = JSON.parseObject(res); if (this.verifyCookies(response)) { return null; } @@ -438,7 +438,7 @@ public class BilibiliCrawl implements CrawlStrategy { .build() ); String res = RequestUtil.get(config);// 发送GET请求 - final JSONObject response = JSONObject.parseObject(res); + final JSONObject response = JSON.parseObject(res); if (this.verifyCookies(response)) { return null; } @@ -534,7 +534,7 @@ public class BilibiliCrawl implements CrawlStrategy { .build() ); String res = RequestUtil.get(config); - final JSONObject response = JSONObject.parseObject(res); + final JSONObject response = JSON.parseObject(res); if (this.verifyCookies(response)) { return null; } diff --git a/src/main/java/cn/fw/freya/service/crawl/impl/Common.java b/src/main/java/cn/fw/freya/service/crawl/impl/Common.java index 4bf5dc4..b31f18d 100644 --- a/src/main/java/cn/fw/freya/service/crawl/impl/Common.java +++ b/src/main/java/cn/fw/freya/service/crawl/impl/Common.java @@ -215,7 +215,7 @@ public class Common { */ public HttpResponse getHttpResponse(WebDriver driver, ResponseReceived responseReceived, String dataUrl) { HttpResponse response = null; - String baseUrl = JSONObject.parseObject(responseReceived.getResponse()).getString("url"); + String baseUrl = JSON.parseObject(responseReceived.getResponse()).getString("url"); boolean notStaticFiles = !baseUrl.endsWith(".png") && !baseUrl.endsWith(".jpg") && !baseUrl.endsWith(".css") diff --git a/src/main/java/cn/fw/freya/service/crawl/impl/DongCheDiCrawl.java b/src/main/java/cn/fw/freya/service/crawl/impl/DongCheDiCrawl.java index ab1ebf1..5c63728 100644 --- a/src/main/java/cn/fw/freya/service/crawl/impl/DongCheDiCrawl.java +++ b/src/main/java/cn/fw/freya/service/crawl/impl/DongCheDiCrawl.java @@ -152,7 +152,7 @@ public class DongCheDiCrawl implements CrawlStrategy { .build() ); String res = RequestUtil.get(config); - final JSONObject response = JSONObject.parseObject(res); + final JSONObject response = JSON.parseObject(res); if (this.verifyCookies(response)) { return null; } @@ -283,7 +283,7 @@ public class DongCheDiCrawl implements CrawlStrategy { log.info(String.format("%s [%s]平台账户号为: %s的直播数据的原始数据为: %s", LocalDateTime.now(), this.getType().getName(), accountNo, res)); JSONObject response = new JSONObject(); try { - response = JSONObject.parseObject(res); + response = JSON.parseObject(res); } catch (Exception e) { log.info(LocalDate.now() + " 暂未找到账户号为:" + accountNo + "的懂车帝直播数据"); final LivePool nullLive = LivePool.builder() @@ -413,7 +413,7 @@ public class DongCheDiCrawl implements CrawlStrategy { .build() ); String res1 = RequestUtil.get(config1); - final JSONObject response1 = JSONObject.parseObject(res1); + final JSONObject response1 = JSON.parseObject(res1); if (this.verifyCookies(response1)) { return null; } @@ -480,7 +480,7 @@ public class DongCheDiCrawl implements CrawlStrategy { .build() ); String res = RequestUtil.get(config); - final JSONObject response = JSONObject.parseObject(res); + final JSONObject response = JSON.parseObject(res); if (this.verifyCookies(response)) { return null; } @@ -520,7 +520,7 @@ public class DongCheDiCrawl implements CrawlStrategy { .build() ); String res = RequestUtil.get(config); - final JSONObject response = JSONObject.parseObject(res); + final JSONObject response = JSON.parseObject(res); if (this.verifyCookies(response)) { return null; } diff --git a/src/main/java/cn/fw/freya/service/crawl/impl/KuaiShouCrawl.java b/src/main/java/cn/fw/freya/service/crawl/impl/KuaiShouCrawl.java index 975f127..26b45ee 100644 --- a/src/main/java/cn/fw/freya/service/crawl/impl/KuaiShouCrawl.java +++ b/src/main/java/cn/fw/freya/service/crawl/impl/KuaiShouCrawl.java @@ -102,7 +102,14 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { final List accountList = accountDao.getAllKSAccount(); accountList.forEach(item -> Arrays.stream(DataTypeEnum.values()).forEach(item1 -> - threadPoolExecutor.execute(() -> this.task(item.getAccountNo(), item1.getValue())) + threadPoolExecutor.execute(() -> { + Integer dataType = item1.getValue(); + if (Objects.equals(dataType, 2)) { + this.task(item.getAccountNo(), dataType, 1); + } else { + this.task(item.getAccountNo(), dataType, null); + } + }) ) ); return true; @@ -114,9 +121,11 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { * @param accountNo 账户号 * @param dataType 数据类型 */ - public void task(String accountNo, Integer dataType) { + public void task(String accountNo, Integer dataType, Integer page) { String key = accountNo + "#" + dataType; - final String ns_sig3 = this.getNS_sig3(accountNo, dataType, true); + if (Objects.nonNull(page)) + key += "#" + page; + String ns_sig3 = this.getNS_sig3(accountNo, dataType, page, true); if (Objects.nonNull(ns_sig3)) sig3Map.put(key, ns_sig3); else @@ -248,6 +257,53 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { } /** + * 分页获取快手视频数据 + * + * @param accountNo 账号 + * @param page 第几页 + * @return 快手返回data + * @throws IOException 异常 + */ + private JSONObject getVideoDataByPage(String accountNo, Integer page) throws IOException { + HttpCookies cookies = HttpCookies.custom(); + CookieStore cookieStore = new BasicCookieStore(); + cookies.setCookieStore(cookieStore); + final String ns_sig3 = this.getNS_sig3(accountNo, DataTypeEnum.VIDEO.getValue(), page, false); + if (Objects.isNull(ns_sig3)) + return null; + Map params = new LinkedHashMap<>(); + params.put("count", 10); + params.put("page", page); + //params.put("total", 9007199000000000L + new Random().nextInt(999999999)); + params.put("kuaishou.web.cp.api_ph", this.getWebApiPh(accountNo)); + HttpConfig config = HttpConfig.custom() + .url("https://cp.kuaishou.com/rest/cp/creator/pc/analysis/photo/list?__NS_sig3=" + ns_sig3) + .context(cookies.getContext()) + .json(JsonUtils.objectToJson(params)) + .headers(HttpHeader + .defaultHeader() + .contentType("application/json") + .host("cp.kuaishou.com") + .cookie(this.getUserCookies(accountNo)) + .build() + ); + String res = RequestUtil.post(config);// 发送POST请求 + log.info(String.format("%s [%s]平台账户号为: %s的视频数据的原始数据为: %s", LocalDateTime.now(), this.getType().getName(), accountNo, res)); + final JSONObject response = JSON.parseObject(res); + if (this.verifyCookies(response)) { + return null; + } + if (!StringUtils.hasText(res)) { + throw new BusinessException("调用快手[视频]接口失败"); + } + if (Objects.equals(response.getInteger("result"), 500002)) { + threadPoolExecutor.execute(() -> this.task(accountNo, DataTypeEnum.VIDEO.getValue(), page)); + throw new BusinessException("获取数据失败, 尝试重新获取sig3签名信息"); + } + return Optional.ofNullable(response.getJSONObject("data")).orElse(new JSONObject()); + } + + /** * 获取所有视频作品信息 * * @param accountNo 账户号 @@ -260,7 +316,21 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { if (Objects.nonNull(hasFoundVideo)) { return hasFoundVideo; } - HttpCookies cookies = HttpCookies.custom(); + Date previousDay = DateUtil.getPreviousDay(new Date()); + int page = 1; + Integer total; + JSONArray videoJsonArray = new JSONArray(); + do { + JSONObject obj = this.getVideoDataByPage(accountNo, page); + total = Objects.requireNonNull(obj).getInteger("totalCount"); + videoJsonArray.addAll(Optional.ofNullable(obj.getJSONArray("photoList")).orElse(new JSONArray())); + page++; + JSONArray photoList = obj.getJSONArray("photoList"); + String publishTime = JSON.parseObject(JSON.toJSONString(photoList.get(photoList.size() - 1))).getString("publishTime"); + if (Objects.requireNonNull(DateUtil.parse(publishTime)).compareTo(DateUtil.getMonthFirstDay(DateUtil.getThisDayMinTime(previousDay))) < 0) + break; + } while (total > 10 * (page - 1)); + /*HttpCookies cookies = HttpCookies.custom(); CookieStore cookieStore = new BasicCookieStore(); cookies.setCookieStore(cookieStore); Date previousDay = DateUtil.getPreviousDay(new Date()); @@ -285,7 +355,7 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { ); String res = RequestUtil.post(config);// 发送POST请求 log.info(String.format("%s [%s]平台账户号为: %s的视频数据的原始数据为: %s", LocalDateTime.now(), this.getType().getName(), accountNo, res)); - final JSONObject response = JSONObject.parseObject(res); + final JSONObject response = JSON.parseObject(res); if (this.verifyCookies(response)) { return null; } @@ -297,6 +367,7 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { throw new BusinessException("获取数据失败, 尝试重新获取sig3签名信息"); } JSONArray videoJsonArray = Optional.ofNullable(Optional.ofNullable(response.getJSONObject("data")).orElse(new JSONObject()).getJSONArray("photoList")).orElse(new JSONArray()); + */ videoPoolDao.deleteByAccountNoAndDate(accountNo, previousDay, AccountTypeEnum.KS.getValue(), ResourceTypeEnum.VIDEO.getValue()); // 视频数据存库 List videoPoolList = new ArrayList<>(videoJsonArray.size()); @@ -354,7 +425,7 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { //} }); try { - if (videoPoolList.size() > 0) { + if (!videoPoolList.isEmpty()) { videoPoolDao.saveAll(videoPoolList);// 将收集到的视频信息保存 } else { final VideoPool nullVideo = VideoPool.builder() @@ -394,7 +465,7 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { Date endTime = DateUtil.getThisDayMaxTime(previousDay); Date startTime = DateUtil.getThisDayMinTime(previousDay); //Date startTime = DateUtil.getThisDayMinTime(new Date(previousDay.getTime() - 7 * 24 * 3600 * 1000L));// 补数据使用 - final String ns_sig3 = this.getNS_sig3(accountNo, DataTypeEnum.LIVE.getValue(), false); + final String ns_sig3 = this.getNS_sig3(accountNo, DataTypeEnum.LIVE.getValue(), null, false); if (Objects.isNull(ns_sig3)) return null; Map params = new LinkedHashMap<>(); @@ -418,7 +489,7 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { ); String res = RequestUtil.post(config);// 发送POST请求 log.info(String.format("%s [%s]平台账户号为: %s的直播数据的原始数据为: %s", LocalDateTime.now(), this.getType().getName(), accountNo, res)); - final JSONObject response = JSONObject.parseObject(res); + final JSONObject response = JSON.parseObject(res); if (this.verifyCookies(response)) { return null; } @@ -426,7 +497,7 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { throw new BusinessException("调用快手[视频]接口失败"); } if (Objects.equals(response.getInteger("result"), 500002)) { - threadPoolExecutor.execute(() -> this.task(accountNo, DataTypeEnum.LIVE.getValue())); + threadPoolExecutor.execute(() -> this.task(accountNo, DataTypeEnum.LIVE.getValue(), null)); throw new BusinessException("获取数据失败, 尝试重新获取sig3签名信息"); } JSONObject dataJSONObject = response.getJSONObject("data"); @@ -612,7 +683,7 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { if (!StringUtils.hasText(res)) { return objects; } - JSONObject resObj = JSONObject.parseObject(res); + JSONObject resObj = JSON.parseObject(res); try { return resObj.getJSONObject("data").getJSONObject("playbackFeeds").getJSONArray("list"); } catch (Exception e) { @@ -638,7 +709,7 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { cookies.setCookieStore(cookieStore); Map params = new HashMap<>(); params.put("kuaishou.web.cp.api_ph", this.getWebApiPh(accountNo)); - final String ns_sig3 = this.getNS_sig3(accountNo, DataTypeEnum.FANS.getValue(), false); + final String ns_sig3 = this.getNS_sig3(accountNo, DataTypeEnum.FANS.getValue(), null, false); if (Objects.isNull(ns_sig3)) return null; HttpConfig config = HttpConfig.custom() @@ -654,7 +725,7 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { .build() ); String res = RequestUtil.post(config);// 发送POST请求 - final JSONObject response = JSONObject.parseObject(res); + final JSONObject response = JSON.parseObject(res); if (this.verifyCookies(response)) { return null; } @@ -662,7 +733,7 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { throw new BusinessException("调用快手[直播]接口失败"); } if (Objects.equals(response.getInteger("result"), 500002)) { - threadPoolExecutor.execute(() -> this.task(accountNo, DataTypeEnum.FANS.getValue())); + threadPoolExecutor.execute(() -> this.task(accountNo, DataTypeEnum.FANS.getValue(), null)); throw new BusinessException("获取数据失败, 尝试重新获取sig3签名信息"); } JSONObject data = response.getJSONObject("data"); @@ -727,24 +798,29 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { * * @param accountNo 快手账户号 * @param type 密钥类型(1:粉丝, 2:短视频, 3:直播) + * @param page 数据页数(1:粉丝, 2:短视频, 3:直播) * @param retryGet 是否重新获取 */ - public String getNS_sig3(String accountNo, Integer type, boolean retryGet) { - final String key = accountNo + "#" + type; + public String getNS_sig3(String accountNo, Integer type, Integer page, boolean retryGet) { + String key = accountNo + "#" + type; String NS_sig3; if (!retryGet) { + if (Objects.nonNull(page)) + key += "#" + page; NS_sig3 = sig3Map.get(key); if (StringUtils.hasText(NS_sig3)) return NS_sig3; } + boolean videoDimension = Objects.equals(type, DataTypeEnum.VIDEO.getValue()); + boolean fansDimension = Objects.equals(type, DataTypeEnum.FANS.getValue()); final String uuid = UUID.randomUUID().toString().replace("-", ""); final WebDriver driver = this.getKSDriver(accountNo, uuid); String targetUrl = null; String dataUrl = null;// 数据接口地址 - if (Objects.equals(type, DataTypeEnum.FANS.getValue())) { + if (fansDimension) { targetUrl = "https://cp.kuaishou.com/profile"; dataUrl = "https://cp.kuaishou.com/rest/cp/creator/pc/home/infoV2"; - } else if (Objects.equals(type, DataTypeEnum.VIDEO.getValue())) { + } else if (videoDimension) { targetUrl = "https://cp.kuaishou.com/statistics/works"; dataUrl = "https://cp.kuaishou.com/rest/cp/creator/pc/analysis/photo/list"; } else if (Objects.equals(type, DataTypeEnum.LIVE.getValue())) { @@ -758,7 +834,7 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { throw new BusinessException("跳转页面发生异常"); } LockSupport.parkNanos(TimeUnit.SECONDS.toNanos(5)); - if (Objects.equals(type, DataTypeEnum.FANS.getValue())) { + if (fansDimension) { driver.get("https://cp.kuaishou.com/article/manage/video"); LockSupport.parkNanos(TimeUnit.SECONDS.toNanos(5)); if (Objects.equals(targetUrl, driver.getCurrentUrl())) { @@ -770,7 +846,29 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { this.exitBrowser(accountNo, uuid); return null; } + Integer maxPageNum = 0; + if (videoDimension) { + List pageLabels = new WebDriverWait(driver, 15, 300).until(driver1 -> + driver1.findElements(By.xpath("//ul[@class='el-pager']/li")));// 获取分页页码标签元素列表 + maxPageNum = Integer.valueOf(pageLabels.get(pageLabels.size() - 1).getText());// 最大页码 + if (!CollectionUtils.isEmpty(pageLabels)) { + for (WebElement item : pageLabels) { + try { + /*if (Objects.equals(item.getText(), "...")) { + pageLabels = new WebDriverWait(driver, 15, 300).until(driver1 -> + driver1.findElements(By.xpath("//ul[@class='el-pager']/li"))); + item.click(); + }*/ + item.click(); + LockSupport.parkNanos(TimeUnit.SECONDS.toNanos(2)); + } catch (Exception e) { + // this.exitBrowser(accountNo, uuid); + } + } + } + } try { + ArrayList sigList = new ArrayList<>(); final List responseReceivedEvents = common.processHttpTransferData(driver); for (ResponseReceived item : responseReceivedEvents) { String str = this.getDataUrl(item, dataUrl); @@ -778,16 +876,25 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { this.exitBrowser(accountNo, uuid); String[] split = str.split("="); NS_sig3 = split[1]; - sig3Map.put(key, NS_sig3); - return NS_sig3; + if (videoDimension) { + sigList.add(NS_sig3); + } else { + sig3Map.put(key, NS_sig3); + return NS_sig3; + } } } + for (int i = 0; i < sigList.size(); i++) { + sig3Map.put(accountNo + "#" + type + "#" + (i + 1), sigList.get(i)); + if (sigList.size() < maxPageNum && i > 4) + break; + } } catch (Exception e) { this.exitBrowser(accountNo, uuid); throw new BusinessException(e.getMessage()); } this.exitBrowser(accountNo, uuid); - return null; + return sig3Map.get(key); } /** @@ -798,7 +905,7 @@ public class KuaiShouCrawl implements CrawlStrategy, SmartLifecycle { * @return */ public String getDataUrl(ResponseReceived responseReceived, String dataUrl) { - String baseUrl = JSONObject.parseObject(responseReceived.getResponse()).getString("url"); + String baseUrl = JSON.parseObject(responseReceived.getResponse()).getString("url"); boolean notStaticFiles = !baseUrl.endsWith(".png") && !baseUrl.endsWith(".jpg") && !baseUrl.endsWith(".css") diff --git a/src/main/java/cn/fw/freya/service/rpc/ReportRpcService.java b/src/main/java/cn/fw/freya/service/rpc/ReportRpcService.java index 3efe999..8c996ac 100644 --- a/src/main/java/cn/fw/freya/service/rpc/ReportRpcService.java +++ b/src/main/java/cn/fw/freya/service/rpc/ReportRpcService.java @@ -55,7 +55,7 @@ public class ReportRpcService { if (!StringUtils.hasText(res)) { return false; } - JSONObject resObj = JSONObject.parseObject(res); + JSONObject resObj = JSON.parseObject(res); Boolean result = resObj.getBoolean("success"); if (Boolean.FALSE.equals(result)) { Integer status = Optional.ofNullable(resObj.getInteger("status")).orElse(-1); @@ -112,7 +112,7 @@ public class ReportRpcService { if (!StringUtils.hasText(res)) { return false; } - JSONObject resObj = JSONObject.parseObject(res); + JSONObject resObj = JSON.parseObject(res); Boolean result = resObj.getBoolean("success"); if (Boolean.FALSE.equals(result)) { Integer status = Optional.ofNullable(resObj.getInteger("status")).orElse(-1); @@ -174,7 +174,7 @@ public class ReportRpcService { if (!StringUtils.hasText(res)) { return false; } - JSONObject resObj = JSONObject.parseObject(res); + JSONObject resObj = JSON.parseObject(res); Boolean result = resObj.getBoolean("success"); if (Boolean.FALSE.equals(result)) { Integer status = Optional.ofNullable(resObj.getInteger("status")).orElse(-1);