feat: 更新引入案例相关源码
This commit is contained in:
171
code/02/bilibili.py
Normal file
171
code/02/bilibili.py
Normal file
@@ -0,0 +1,171 @@
|
||||
#!/usr/bin/env python3
|
||||
# coding:utf-8
|
||||
|
||||
import itertools
|
||||
import logging
|
||||
import math
|
||||
import pathlib
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from typing import Dict, List, TypeVar
|
||||
|
||||
import pandas as pd
|
||||
import requests
|
||||
import requests_html
|
||||
|
||||
# --------------------
|
||||
# Prerequisites
|
||||
# --------------------
|
||||
|
||||
# internal types
# _T is the value type of one API payload; APIData is a single JSON
# object from the API, Records is a page (list) of such objects.
_T = TypeVar("_T")
APIData = Dict[str, _T]
Records = List[APIData]

# api or data url
VIDEO_API_URL = "https://api.bilibili.com/x/space/arc/search"
VIDEO_PAGE_URL = "https://www.bilibili.com/video/{bvid}"

# request header
# the fragments below are joined into ONE User-Agent string; the list
# exists only for readability
USER_AGENTS = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2)",
    "AppleWebKit/537.36 (KHTML, like Gecko)",
    "Chrome/81.0.4044.92",
    "Safari/537.36",
    "Edg/81.0.416.53",
]
HEADERS = {"User-Agent": " ".join(USER_AGENTS)}

# uploader (space) id whose videos are crawled
MID_NUMBER = "533459953"
# shared query defaults: 30 items per page, all categories, newest first
params = {"ps": "30", "tid": "0", "order": "pubdate"}


# logger
logging.basicConfig(
    level=logging.INFO,
    format="[{asctime}]-[{levelname:<8}]-[{funcName}:{lineno}] - {message}",
    datefmt="%Y-%m-%d %H:%M:%S",
    style="{",
)
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# -----------------------
|
||||
# Query and Handle Method
|
||||
# -----------------------
|
||||
|
||||
|
||||
def fetch_page_number(mid: str) -> int:
    """Fetch the total number of result pages for an uploader's video list.

    Issues a single query for page 1 and converts the reported item
    count into a page count.

    Args:
        mid: uploader (space) id to query.

    Returns:
        Number of pages, rounded up; 0 when the uploader has no videos.

    Raises:
        requests.HTTPError: when the API responds with an error status.
    """
    payloads = {"mid": mid, "pn": 1, **params}
    with requests.Session() as sess:
        response = sess.get(
            url=VIDEO_API_URL,
            headers=HEADERS,
            params=payloads,
        )
        response.raise_for_status()

    count = int(response.json()["data"]["page"]["count"])
    # derive the page size from the shared query params ("ps") instead of
    # repeating the magic number 30 here
    page_size = int(params["ps"])
    return math.ceil(count / page_size)
|
||||
|
||||
|
||||
def fetch_video_data(mid: str, page: int) -> List[APIData]:
    """Fetch one page of video metadata from the space API.

    Args:
        mid: uploader (space) id to query.
        page: 1-based page number.

    Returns:
        The raw "vlist" records for that page.

    Raises:
        requests.HTTPError: when the API responds with an error status.
    """
    payload = {"mid": mid, "pn": str(page), **params}
    with requests.Session() as sess:
        response = sess.get(
            url=VIDEO_API_URL,
            headers=HEADERS,
            params=payload,
        )
        response.raise_for_status()

    records = response.json()["data"]["list"]["vlist"]
    # lazy %-formatting: the message is only built when INFO is enabled
    log.info("fetch video from '%s' at %s page.", mid, page)
    return records
|
||||
|
||||
|
||||
async def fetch_stats(bvid: str, asess) -> APIData:
    """Fetch like, coin, collect and share counts from a video page.

    Scrapes the rendered HTML of the video page; the rank badge is only
    present for videos that charted, so it is optional.

    Args:
        bvid: the video's bvid identifier.
        asess: a requests_html.AsyncHTMLSession used for the request.

    Returns:
        Mapping of stat name to text value; all-empty strings when the
        expected page elements are missing.

    Raises:
        requests.HTTPError: when the page responds with an error status.
    """
    info = {}
    stats = ["rank", "like", "coin", "collect", "share"]
    response = await asess.get(
        url=VIDEO_PAGE_URL.format(bvid=bvid),
        headers=HEADERS,
    )
    response.raise_for_status()
    html = response.html

    # rank badge only exists on videos that made the charts
    rank_node = html.find(".video-data .rank", first=True)
    if rank_node:
        info["rank"] = rank_node.text.strip()

    try:
        info["like"] = html.find(".ops .like", first=True).text.strip()
        info["coin"] = html.find(".ops .coin", first=True).text.strip()
        info["collect"] = html.find(".ops .collect", first=True).text.strip()
        info["share"] = html.find(".ops .share", first=True).text.strip()
    except AttributeError:
        # find() returned None for a selector — the page layout changed
        # or the stats are rendered client-side only; fall back to blanks
        log.warning("can't get stats from '%s', use default.", bvid)
        return {k: "" for k in stats}

    log.info("fetch stats from '%s'.", bvid)
    return info
|
||||
|
||||
|
||||
async def bundle(json, asess) -> APIData:
    """Merge one API record with the stats scraped from its video page.

    Args:
        json: a single video record from the space API.
        asess: async HTML session passed through to fetch_stats.

    Returns:
        A new dict combining the API fields with the scraped stats
        (stats win on key collisions).
    """
    stats = await fetch_stats(json["bvid"], asess)
    return {**json, **stats}
|
||||
|
||||
|
||||
def query(mid: str) -> Records:
    """Query all video records (API metadata + page stats) for one uploader.

    Pages of metadata are fetched concurrently with a small thread pool,
    then per-video stats are scraped with an async HTML session.

    Args:
        mid: uploader (space) id to crawl.

    Returns:
        One merged record per video.
    """
    log.info("querying data from '%s'...", mid)

    total_page = fetch_page_number(mid)
    with ThreadPoolExecutor(max_workers=2) as pool:
        # 'futures', not 'features': these are concurrent.futures handles
        futures = [
            pool.submit(fetch_video_data, mid=mid, page=page)
            for page in range(1, total_page + 1)
        ]
        # flatten the per-page lists; from_iterable avoids unpacking a
        # throwaway argument list
        jsons = itertools.chain.from_iterable([f.result() for f in futures])

    # async session for html request
    asess = requests_html.AsyncHTMLSession(workers=2)

    # compatible with requests-html async coroutine codes
    # see: https://github.com/psf/requests-html/issues/362
    # default args bind json/asess now, avoiding late-binding closures
    results = asess.run(
        *[lambda json=json, asess=asess: bundle(json, asess) for json in jsons]
    )
    return results
|
||||
|
||||
|
||||
def parse(jsons: Records) -> pd.DataFrame:
    """Flatten the raw JSON records into a tabular DataFrame."""
    frame = pd.json_normalize(jsons)
    return frame
|
||||
|
||||
|
||||
def main():
    """Crawl every video of MID_NUMBER and dump the result to a CSV."""
    output = pathlib.Path("~/Desktop/bilibili.csv").expanduser()

    records = query(MID_NUMBER)
    frame = parse(records)
    frame.to_csv(output, index=False)


if __name__ == "__main__":
    main()
|
||||
25
code/02/fetchVideoStat.js
Normal file
25
code/02/fetchVideoStat.js
Normal file
@@ -0,0 +1,25 @@
|
||||
(function () {
  // Stats we report for every video, in a fixed order.
  const cols = ["rank", "like", "coin", "collect", "share"];

  let stat;
  try {
    // Page state may be absent or shaped differently — any TypeError
    // here leaves `stat` undefined and we fall through to blanks.
    stat = window.__INITIAL_STATE__.videoData.stat;
  } catch (e) {}

  if (stat === undefined) {
    const blank = {};
    cols.forEach(function (col) {
      blank[col] = "";
    });
    return blank;
  }

  return {
    rank: stat.his_rank,
    like: stat.like,
    coin: stat.coin,
    collect: stat.favorite,
    share: stat.share,
  };
})();
|
||||
1414
code/02/main.ipynb
Normal file
1414
code/02/main.ipynb
Normal file
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user