diff options
author | Mole Shang <[email protected]> | 2023-07-25 09:27:26 +0800 |
---|---|---|
committer | Mole Shang <[email protected]> | 2023-08-05 23:19:46 +0800 |
commit | ed8f6df90b0c39835198d5b7af4bbd391362f180 (patch) | |
tree | 907ba31bac854eb5dc8a2781825e24c049b10580 /src/extractors | |
download | hinata-ed8f6df90b0c39835198d5b7af4bbd391362f180.tar.gz hinata-ed8f6df90b0c39835198d5b7af4bbd391362f180.tar.bz2 hinata-ed8f6df90b0c39835198d5b7af4bbd391362f180.zip |
hinata: initial commit
Diffstat (limited to 'src/extractors')
-rw-r--r-- | src/extractors/bilibili.c | 475 | ||||
-rw-r--r-- | src/extractors/bilibili.h | 94 | ||||
-rw-r--r-- | src/extractors/extractor.c | 24 | ||||
-rw-r--r-- | src/extractors/extractor.h | 32 |
4 files changed, 625 insertions, 0 deletions
diff --git a/src/extractors/bilibili.c b/src/extractors/bilibili.c new file mode 100644 index 0000000..874a605 --- /dev/null +++ b/src/extractors/bilibili.c @@ -0,0 +1,475 @@ +#include <cjson/cJSON.h> +#include <limits.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#ifdef _WIN32 +#include "c11threads.h" +#else +#include <threads.h> +#endif + +#include "../logger.h" +#include "../process_url.h" +#include "../utils.h" +#include "bilibili.h" +#include "extractor.h" + +static int get_multipagedata(char *pagedata, Multipage *multipage_struct, + bool *is_page) { + const char *patterns_str[1] = {"window.__INITIAL_STATE__=(.+?);\\(function"}; + const str_array_t patterns = {(char **)patterns_str, 1}; + str_array_t results = create_str_array(0); + + int r = regex_match(pagedata, patterns, &results); + if (!r) { + for (unsigned short i = 0; i < results.n; i++) { + // DEBUG_PRINT("%s\n", results.str[i]); + if (results.str[i]) { + multipage_struct->json = cJSON_Parse(get_str_element(&results, i)); + } + } + free_str_array(&results); + + cJSON *aid_obj = cJSON_GetObjectItem(multipage_struct->json, "aid"); + cJSON *bvid_obj = cJSON_GetObjectItem(multipage_struct->json, "bvid"); + cJSON *sections_obj = + cJSON_GetObjectItem(multipage_struct->json, "sections"); + cJSON *videoData_obj = + cJSON_GetObjectItem(multipage_struct->json, "videoData"); + if (aid_obj && bvid_obj && sections_obj && videoData_obj) { + multipage_struct->aid = aid_obj->valueint; + multipage_struct->bvid = bvid_obj->valuestring; + multipage_struct->sections = create_array( + sizeof(Multi_episode_data), cJSON_GetArraySize(sections_obj)); + + /* sections */ + if (!cJSON_GetArraySize(sections_obj)) { + DEBUG_PRINT("This video does not have sections, meaning that it is a " + "multi-p video with only one av/bvid\n"); + *is_page = 1; + } + + cJSON *e; + int i = 0; + cJSON_ArrayForEach(e, sections_obj) { + Multi_episode_data *section = + get_element(&multipage_struct->sections, i); + cJSON *season_id_obj = cJSON_GetObjectItem(e, "season_id"); + cJSON *episodes_obj = cJSON_GetObjectItem(e, "episodes"); + if (season_id_obj && episodes_obj) { + section->season_id = season_id_obj->valueint; + DEBUG_PRINT("sections[%d] season_id: %d\n", i, section->season_id); + + section->episodes = + create_array(sizeof(Episode), cJSON_GetArraySize(episodes_obj)); + + cJSON *e; + int j = 0; + cJSON_ArrayForEach(e, episodes_obj) { + cJSON *aid_obj = cJSON_GetObjectItem(e, "aid"); + cJSON *bvid_obj = cJSON_GetObjectItem(e, "bvid"); + cJSON *cid_obj = cJSON_GetObjectItem(e, "cid"); + cJSON *title_obj = cJSON_GetObjectItem(e, "title"); + if (aid_obj && bvid_obj && cid_obj && title_obj) { + Episode *episode = get_element(§ion->episodes, j); + episode->aid = aid_obj->valueint; + episode->bvid = bvid_obj->valuestring; + episode->cid = cid_obj->valueint; + episode->title = title_obj->valuestring; + DEBUG_PRINT("sections[%d].episodes[%d] aid: %d\n", i, j, + episode->aid); + DEBUG_PRINT("sections[%d].episodes[%d] bvid: %s\n", i, j, + episode->bvid); + DEBUG_PRINT("sections[%d].episodes[%d] cid: %d\n", i, j, + episode->cid); + DEBUG_PRINT("sections[%d].episodes[%d] title: %s\n", i, j, + episode->title); + + j++; + continue; + } + r = 1; + LOG("cJSON", "Read JSON.sections[%d].episodes[%d] failed.\n", i, j); + return r; + } + + i++; + continue; + } + r = 1; + LOG("cJSON", "Read JSON.sections[%d] failed.\n", i); + return r; + } + + /* videoData */ + Multipage_video_data *videoData = &multipage_struct->videoData; + cJSON *title_obj = cJSON_GetObjectItem(videoData_obj, "title"); + cJSON *pages_obj = cJSON_GetObjectItem(videoData_obj, "pages"); + if (title_obj && pages_obj) { + videoData->title = title_obj->valuestring; + DEBUG_PRINT("videoData.title: %s\n", videoData->title); + + videoData->pages = create_array(sizeof(Video_pages_data), + cJSON_GetArraySize(pages_obj)); + int i = 0; + cJSON *e; + cJSON_ArrayForEach(e, pages_obj) { + cJSON *cid_obj = cJSON_GetObjectItem(e, "cid"); + cJSON *part_obj = cJSON_GetObjectItem(e, "part"); + cJSON *page_obj = cJSON_GetObjectItem(e, "page"); + if (cid_obj && part_obj && page_obj) { + Video_pages_data *page = get_element(&videoData->pages, i); + page->cid = cid_obj->valueint; + page->part = part_obj->valuestring; + page->page = page_obj->valueint; + DEBUG_PRINT("videoData.pages[%d].cid: %d\n", i, page->cid); + DEBUG_PRINT("videoData.pages[%d].part: %s\n", i, page->part); + DEBUG_PRINT("videoData.pages[%d].page: %d\n", i, page->page); + + i++; + continue; + } + LOG("cJSON", "Read JSON.videodata.pages[%d] failed.\n", i); + return 1; + } + } else { + LOG("cJSON", "Read JSON.videodata failed.\n"); + return 1; + } + } else { + r = 1; + LOG("cJSON", "Parse pagedata JSON failed.\n"); + } + } + return r; +} + +static int get_dash(const char *api_resp, Dash *dash) { + dash->json = cJSON_Parse(api_resp); + + cJSON *code_obj = cJSON_GetObjectItem(dash->json, "code"); + cJSON *message_obj = cJSON_GetObjectItem(dash->json, "message"); + cJSON *dashinfo_obj = cJSON_GetObjectItem(dash->json, "data"); + if (cJSON_IsInvalid(dashinfo_obj)) { + dashinfo_obj = cJSON_GetObjectItem(dash->json, "result"); + } + + if (!code_obj || !message_obj || !dashinfo_obj) { + LOG("cJSON", "Parse API resp_json failed.\n"); + return 1; + } + dash->code = code_obj->valueint; + dash->message = code_obj->valuestring; + + /* dashinfo: "data" or "result" */ + DEBUG_PRINT("Key of dashinfo: %s\n", dashinfo_obj->string); + Dash_info *dashinfo = &dash->dashinfo; + cJSON *quality_obj = cJSON_GetObjectItem(dashinfo_obj, "quality"); + cJSON *accept_description_obj = + cJSON_GetObjectItem(dashinfo_obj, "accept_description"); + cJSON *accept_quality_obj = + cJSON_GetObjectItem(dashinfo_obj, "accept_quality"); + cJSON *dash_streams_obj = cJSON_GetObjectItem(dashinfo_obj, "dash"); + cJSON *format_obj = cJSON_GetObjectItem(dashinfo_obj, "format"); + cJSON *durl_obj = cJSON_GetObjectItem(dashinfo_obj, "durl"); // NOTE: Optional + + if (!quality_obj || !accept_description_obj || !accept_quality_obj || + !dash_streams_obj || !format_obj) { + LOG("cJSON", "Read API resp_json.%s failed.\n", dashinfo_obj->string); + return 1; + } + + dashinfo->quality = quality_obj->valueint; + DEBUG_PRINT("quality: %d\n", dashinfo->quality); + + dashinfo->format = format_obj->valuestring; + DEBUG_PRINT("format: %s\n", dashinfo->format); + + dashinfo->accept_description = + create_str_array(cJSON_GetArraySize(accept_quality_obj)); + str_array_t *ac_d = &dashinfo->accept_description; + for (unsigned char n = 0; n < cJSON_GetArraySize(accept_description_obj); + n++) { + cJSON *i = cJSON_GetArrayItem(accept_description_obj, n); + + if (!i) { + LOG("cJSON", "Read API resp_json.%s.accept_description failed.\n", + dashinfo_obj->string); + return 1; + } + set_str_element(ac_d, n, i->valuestring); + DEBUG_PRINT("accept_description[%hhu]: %s\n", n, get_str_element(ac_d, n)); + } + + dashinfo->accept_quality = + create_array(sizeof(int), cJSON_GetArraySize(accept_quality_obj)); + generic_array_t *ac_q = &dashinfo->accept_quality; + for (unsigned char n = 0; n < cJSON_GetArraySize(accept_quality_obj); n++) { + cJSON *i = cJSON_GetArrayItem(accept_quality_obj, n); + + if (!i) { + LOG("cJSON", "Read API resp_json.%s.accept_quality failed.\n", + dashinfo_obj->string); + return 1; + } + int *v = get_element(ac_q, n); + *v = i->valueint; + DEBUG_PRINT("accept_quality[%hhu]: %d\n", n, *v); + } + + cJSON *video_obj = cJSON_GetObjectItem(dash_streams_obj, "video"); + cJSON *audio_obj = cJSON_GetObjectItem(dash_streams_obj, "audio"); + + if (!video_obj || !audio_obj) { + LOG("cJSON", "Read API resp_json.%s.dash failed.\n", dashinfo_obj->string); + return 1; + } + + dashinfo->dash.video = + create_array(sizeof(Dash_stream), cJSON_GetArraySize(video_obj)); + dashinfo->dash.audio = + create_array(sizeof(Dash_stream), cJSON_GetArraySize(audio_obj)); + generic_array_t *target; + cJSON *dash_stream_obj; + for (dash_stream_obj = video_obj, target = &dashinfo->dash.video;;) { + int i = 0; + cJSON *e; + cJSON_ArrayForEach(e, dash_stream_obj) { + cJSON *id_obj = cJSON_GetObjectItem(e, "id"); + cJSON *baseUrl_obj = cJSON_GetObjectItem(e, "baseUrl"); + cJSON *bandwidth_obj = cJSON_GetObjectItem(e, "bandwidth"); + cJSON *mimeType_obj = cJSON_GetObjectItem(e, "mimeType"); + cJSON *codecid_obj = cJSON_GetObjectItem(e, "codecid"); + cJSON *codecs_obj = cJSON_GetObjectItem(e, "codecs"); + + if (!id_obj || !baseUrl_obj || !bandwidth_obj || !mimeType_obj || + !codecid_obj || !codecs_obj) { + LOG("cJSON", "Read API resp_json.%s.dash.%s[%d] failed.\n", + dashinfo_obj->string, dash_stream_obj->string, i); + return 1; + } + Dash_stream *ds = get_element(target, i); + ds->id = id_obj->valueint; + ds->baseUrl = baseUrl_obj->valuestring; + ds->bandwidth = bandwidth_obj->valueint; + ds->mimeType = mimeType_obj->valuestring; + ds->codecid = codecid_obj->valueint; + ds->codecs = codecs_obj->valuestring; + + DEBUG_PRINT("%s[%d].id: %d\n", dash_stream_obj->string, i, ds->id); + DEBUG_PRINT("%s[%d].baseUrl: %s\n", dash_stream_obj->string, i, + ds->baseUrl); + DEBUG_PRINT("%s[%d].bandwidth: %d\n", dash_stream_obj->string, i, + ds->bandwidth); + DEBUG_PRINT("%s[%d].mimeType: %s\n", dash_stream_obj->string, i, + ds->mimeType); + DEBUG_PRINT("%s[%d].codecid: %d\n", dash_stream_obj->string, i, + ds->codecid); + DEBUG_PRINT("%s[%d].codecs: %s\n", dash_stream_obj->string, i, + ds->codecs); + + i++; + } + + if (dash_stream_obj == video_obj) { + dash_stream_obj = audio_obj; + target = &dashinfo->dash.audio; + } else { + break; + } + } + + return 0; +} + +static int get_page_in_query(char *query, int *page) { + const char *pattern = "p=(\\d+)"; + str_array_t results = {0}; + int r = regex_match(query, (str_array_t){(char **)&pattern, 1}, &results); + if (!r) { + // for (unsigned short i = 0; i < results.n; i++) { + // DEBUG_PRINT("%s\n", results.str[i]); + // } + *page = results.n ? atoi(results.str[0]) : 1; // Download p1 by default + } + return r; +} + +static int generate_api(Bilibili_options *bilibili_options, const int quality) { + char params[UCHAR_MAX]; + snprintf(params, sizeof(params), + "avid=%d&cid=%d&bvid=%s&qn=%d&type=&otype=json&fourk=1&fnver=0&" + "fnval=2000", + bilibili_options->aid, bilibili_options->cid, bilibili_options->bvid, + quality); + bilibili_options->api = malloc(strlen(BILIBILI_API) + strlen(params) + 1); + strcpy(bilibili_options->api, BILIBILI_API); + strcat(bilibili_options->api, params); + return 0; +} + +static const char *mimeType2ext(const char *mimeType) { + static char mimeType_l[CHAR_MAX]; + strcpy(mimeType_l, mimeType); + const char *exts[2]; + size_t extsCount = 0; + + char *token = strtok(mimeType_l, "/"); + while (token != NULL && extsCount < 2) { + exts[extsCount++] = token; + token = strtok(NULL, "/"); + } + + if (extsCount == 2) { + return exts[1]; + } + + return "mp4"; // Cannot parse, use default +} + +static const char *id2quality_desc(int id) { + const char *desc; + switch (id) { + case 127: + desc = "超高清 8K"; + break; + case 120: + desc = "超清 4K"; + break; + case 112: + desc = "高清 1080P+"; + break; + case 80: + desc = "高清 1080P"; + break; + case 48: + desc = "高清 720P"; + break; + case 32: + desc = "清晰 480P"; + break; + case 16: + desc = "流畅 360P"; + break; + default: + desc = "Unknown resolution"; + break; + } + return desc; +} + +static void multipage_cleanup(Multipage *multipage_struct) { + for (unsigned short i = 0; i < multipage_struct->sections.n; i++) { + // free_and_nullify(multipage_struct->sections[i].episodes); + Multi_episode_data *section = get_element(&multipage_struct->sections, i); + free_array(§ion->episodes); + } + free_array(&multipage_struct->sections); + free_array(&multipage_struct->videoData.pages); + cJSON_Delete(multipage_struct->json); + multipage_struct->json = NULL; +} + +static void dash_cleanup(Dash *dash) { + cJSON_Delete(dash->json); + free_str_array(&dash->dashinfo.accept_description); + free_array(&dash->dashinfo.accept_quality); + free_array(&dash->dashinfo.dash.audio); + free_array(&dash->dashinfo.dash.video); +} + +static int download(Bilibili_options *bilibili_options) { + Dash dash = {0}; + char *resp; + get(bilibili_options->api, &resp); + if (get_dash(resp, &dash)) { + LOG("Bilibili", "Get dash failed."); + free_and_nullify(resp); + dash_cleanup(&dash); + return 1; + }; + + // Download the highest resolution + Dash_stream *video = get_element(&dash.dashinfo.dash.video, 0); + Dash_stream *audio = get_element(&dash.dashinfo.dash.audio, 0); + const char *quality_desc = id2quality_desc(video->id); + + { + char fn[USHRT_MAX]; + sprintf(fn, "%s[%s]-%s.%s", bilibili_options->title, quality_desc, "video", + mimeType2ext(video->mimeType)); + add_url(video->baseUrl, NULL, fn, "https://www.bilibili.com"); + } + + { + char fn[USHRT_MAX]; + sprintf(fn, "%s[%s]-%s.%s", bilibili_options->title, + quality_desc, "audio", mimeType2ext(audio->mimeType)); + add_url(audio->baseUrl, NULL, fn, "https://www.bilibili.com"); + } + + free_and_nullify(resp); + dash_cleanup(&dash); + return 0; +} + +void bilibili_extract(struct options *options) { + Multipage multipage_struct = {0}; + Bilibili_options bilibili_options = {options->URL}; + int p = 1; + char *api; + + if (get(options->URL, &options->pagedata)) { + append_log("[Bilibili] Download pagedata failed.\n"); + return; + } + bilibili_options.html = options->pagedata; + + if (get_multipagedata(options->pagedata, &multipage_struct, + &bilibili_options.is_page)) { + multipage_cleanup(&multipage_struct); + append_log("[Bilibili] Parse pagedata failed.\n"); + return; + }; + + if (get_page_in_query(options->query, &p) || p < 1 || + p > multipage_struct.videoData.pages.n) { + multipage_cleanup(&multipage_struct); + append_log("[Bilibili] Parse query failed.\n"); + return; + } + + Video_pages_data *page = + get_element(&multipage_struct.videoData.pages, p - 1); + + bilibili_options.aid = multipage_struct.aid; + bilibili_options.bvid = multipage_struct.bvid; + bilibili_options.cid = page->cid; + bilibili_options.page = p; + bilibili_options.title = multipage_struct.videoData.title; + + DEBUG_PRINT("aid: %d\n", bilibili_options.aid); + DEBUG_PRINT("bvid: %s\n", bilibili_options.bvid); + DEBUG_PRINT("cid: %d\n", bilibili_options.cid); + DEBUG_PRINT("is_page: %s\n", bilibili_options.is_page ? "yes" : "no"); + DEBUG_PRINT("page: %d\n", bilibili_options.page); + DEBUG_PRINT("title: %s\n", bilibili_options.title); + + if (generate_api(&bilibili_options, 127)) { + free_and_nullify(bilibili_options.api); + multipage_cleanup(&multipage_struct); + return; + } + DEBUG_PRINT("Generated API: %s\n", bilibili_options.api); + + if (download(&bilibili_options)) { + free_and_nullify(bilibili_options.api); + multipage_cleanup(&multipage_struct); + return; + } + + free_and_nullify(bilibili_options.api); + multipage_cleanup(&multipage_struct); +} diff --git a/src/extractors/bilibili.h b/src/extractors/bilibili.h new file mode 100644 index 0000000..5d609b0 --- /dev/null +++ b/src/extractors/bilibili.h @@ -0,0 +1,94 @@ +#ifndef BILIBILI_H_ +#define BILIBILI_H_ + +#include "../utils.h" +#include "extractor.h" +#include <stddef.h> + +#define BILIBILI_API "https://api.bilibili.com/x/player/playurl?" +#define BILIBILI_BANGUMI_API "https://api.bilibili.com/pgc/player/web/playurl?" +#define BILIBILI_TOKEN_API "https://api.bilibili.com/x/player/playurl/token?" + +typedef struct video_pages_data { + int cid; + char *part; + int page; +} Video_pages_data; + +typedef struct multipage_video_data { + char *title; + generic_array_t pages; +} Multipage_video_data; + +typedef struct episode { + int aid; + char *bvid; + int cid; + char *title; +} Episode; + +typedef struct multi_episode_data { + int season_id; + generic_array_t episodes; +} Multi_episode_data; + +typedef struct multipage { + int aid; + char *bvid; + generic_array_t sections; + Multipage_video_data videoData; + cJSON *json; +} Multipage; + +typedef struct bilibili_options { + char *url; + char *html; + char *api; + char *cookie; + bool is_bangumi; + bool is_page; + int aid; + int cid; + char *bvid; + int page; + char *title; +} Bilibili_options; + +typedef struct durl { + char *url; + size_t size; +} Durl; + +typedef struct dash_stream { + int id; + char *baseUrl; + int bandwidth; + char *mimeType; + int codecid; + char *codecs; +} Dash_stream; + +typedef struct dash_streams { + generic_array_t video; + generic_array_t audio; +} Dash_streams; + +typedef struct dash_info { + int quality; + str_array_t accept_description; + generic_array_t accept_quality; + Dash_streams dash; + char *format; + generic_array_t durl; +} Dash_info; + +typedef struct dash { + int code; + char *message; + Dash_info dashinfo; + cJSON *json; +} Dash; + +void bilibili_extract(struct options *); + +#endif diff --git a/src/extractors/extractor.c b/src/extractors/extractor.c new file mode 100644 index 0000000..91f34d2 --- /dev/null +++ b/src/extractors/extractor.c @@ -0,0 +1,24 @@ +#include <stdlib.h> + +#include "bilibili.h" +#include "extractor.h" + +Site_map site_map = {{{"www.bilibili.com", SITE_BILIBILI}}, 1}; + +void options_cleanup(Options *options) { + free_and_nullify(options->URL); + free_and_nullify(options->path); + free_and_nullify(options->query); + free_and_nullify(options->pagedata); +} + +int extract(void *v) { + Options *options = (Options *)v; + switch (options->site) { + case SITE_BILIBILI: + bilibili_extract(options); + break; + } + options_cleanup(options); + return 0; +} diff --git a/src/extractors/extractor.h b/src/extractors/extractor.h new file mode 100644 index 0000000..d3ebeec --- /dev/null +++ b/src/extractors/extractor.h @@ -0,0 +1,32 @@ +#ifndef EXTRACTOR_H_ +#define EXTRACTOR_H_ + +#include <cjson/cJSON.h> +#include <limits.h> +#include <stdbool.h> +#include <stddef.h> + +enum site { SITE_BILIBILI }; +typedef enum site site_t; + +typedef struct site_map { + struct { + char domain[SHRT_MAX]; + site_t site; + } pairs[1]; + unsigned char size; +} Site_map; + +typedef struct options { + site_t site; + char *URL; + char *path; + char *query; + char *pagedata; +} Options; + +void options_cleanup(Options*); + +int extract(void *); + +#endif |