path: root/src/extractors
diff options
author: Mole Shang <[email protected]> 2023-07-25 09:27:26 +0800
committer: Mole Shang <[email protected]> 2023-08-05 23:19:46 +0800
commit: ed8f6df90b0c39835198d5b7af4bbd391362f180 (patch)
tree: 907ba31bac854eb5dc8a2781825e24c049b10580 /src/extractors
hinata: initial commit
Diffstat (limited to 'src/extractors')
4 files changed, 625 insertions, 0 deletions
diff --git a/src/extractors/bilibili.c b/src/extractors/bilibili.c
new file mode 100644
index 0000000..874a605
--- /dev/null
+++ b/src/extractors/bilibili.c
@@ -0,0 +1,475 @@
+#include <cjson/cJSON.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#ifdef _WIN32
+#include "c11threads.h"
+#include <threads.h>
+#include "../logger.h"
+#include "../process_url.h"
+#include "../utils.h"
+#include "bilibili.h"
+#include "extractor.h"
+static int get_multipagedata(char *pagedata, Multipage *multipage_struct,
+ bool *is_page) { /*
+ * Find the JSON assigned to window.__INITIAL_STATE__ inside `pagedata`,
+ * parse it with cJSON and copy the fields used by this extractor into
+ * *multipage_struct: aid, bvid, sections[].episodes[] and videoData.
+ *
+ * pagedata:         text searched with the regex below.
+ * multipage_struct: receives the parsed tree (->json) plus the arrays
+ *                   created here with create_array().
+ * is_page:          set to 1 only when "sections" is an empty array; it is
+ *                   not written on any other path.
+ *
+ * Returns 0 on success. Returns the non-zero regex_match() result when the
+ * match call fails, and 1 when a required JSON key is missing. Nothing is
+ * released on the failure paths; bilibili_extract() calls
+ * multipage_cleanup() afterwards.
+ *
+ * Every char * stored here (bvid, title, part) is a cJSON valuestring, i.e.
+ * a pointer into multipage_struct->json rather than a copy.
+ */
+ const char *patterns_str[1] = {"window.__INITIAL_STATE__=(.+?);\\(function"};
+ const str_array_t patterns = {(char **)patterns_str, 1};
+ str_array_t results = create_str_array(0);
+ int r = regex_match(pagedata, patterns, &results);
+ if (!r) { // r == 0 is treated as success; any other value is returned unchanged
+ for (unsigned short i = 0; i < results.n; i++) {
+ // DEBUG_PRINT("%s\n", results.str[i]);
+ if (results.str[i]) {
+ multipage_struct->json = cJSON_Parse(get_str_element(&results, i)); // a later non-NULL result replaces json without deleting the earlier tree
+ }
+ }
+ free_str_array(&results);
+ cJSON *aid_obj = cJSON_GetObjectItem(multipage_struct->json, "aid");
+ cJSON *bvid_obj = cJSON_GetObjectItem(multipage_struct->json, "bvid");
+ cJSON *sections_obj =
+ cJSON_GetObjectItem(multipage_struct->json, "sections");
+ cJSON *videoData_obj =
+ cJSON_GetObjectItem(multipage_struct->json, "videoData");
+ if (aid_obj && bvid_obj && sections_obj && videoData_obj) { // all four top-level keys are required
+ multipage_struct->aid = aid_obj->valueint;
+ multipage_struct->bvid = bvid_obj->valuestring;
+ multipage_struct->sections = create_array(
+ sizeof(Multi_episode_data), cJSON_GetArraySize(sections_obj));
+ /* sections: one Multi_episode_data per entry of the "sections" array */
+ if (!cJSON_GetArraySize(sections_obj)) {
+ DEBUG_PRINT("This video does not have sections, meaning that it is a "
+ "multi-p video with only one av/bvid\n");
+ *is_page = 1;
+ }
+ cJSON *e;
+ int i = 0;
+ cJSON_ArrayForEach(e, sections_obj) {
+ Multi_episode_data *section =
+ get_element(&multipage_struct->sections, i);
+ cJSON *season_id_obj = cJSON_GetObjectItem(e, "season_id");
+ cJSON *episodes_obj = cJSON_GetObjectItem(e, "episodes");
+ if (season_id_obj && episodes_obj) {
+ section->season_id = season_id_obj->valueint;
+ DEBUG_PRINT("sections[%d] season_id: %d\n", i, section->season_id);
+ section->episodes =
+ create_array(sizeof(Episode), cJSON_GetArraySize(episodes_obj));
+ cJSON *e; // shadows the outer `e` for the duration of the episode loop
+ int j = 0;
+ cJSON_ArrayForEach(e, episodes_obj) {
+ cJSON *aid_obj = cJSON_GetObjectItem(e, "aid"); // these shadow the top-level aid_obj/bvid_obj
+ cJSON *bvid_obj = cJSON_GetObjectItem(e, "bvid");
+ cJSON *cid_obj = cJSON_GetObjectItem(e, "cid");
+ cJSON *title_obj = cJSON_GetObjectItem(e, "title");
+ if (aid_obj && bvid_obj && cid_obj && title_obj) {
+ Episode *episode = get_element(&section->episodes, j);
+ episode->aid = aid_obj->valueint;
+ episode->bvid = bvid_obj->valuestring;
+ episode->cid = cid_obj->valueint;
+ episode->title = title_obj->valuestring;
+ DEBUG_PRINT("sections[%d].episodes[%d] aid: %d\n", i, j,
+ episode->aid);
+ DEBUG_PRINT("sections[%d].episodes[%d] bvid: %s\n", i, j,
+ episode->bvid);
+ DEBUG_PRINT("sections[%d].episodes[%d] cid: %d\n", i, j,
+ episode->cid);
+ DEBUG_PRINT("sections[%d].episodes[%d] title: %s\n", i, j,
+ episode->title);
+ j++;
+ continue;
+ }
+ r = 1; // reached only when an episode lacks one of the four keys
+ LOG("cJSON", "Read JSON.sections[%d].episodes[%d] failed.\n", i, j);
+ return r;
+ }
+ i++;
+ continue;
+ }
+ r = 1; // reached only when a section lacks "season_id" or "episodes"
+ LOG("cJSON", "Read JSON.sections[%d] failed.\n", i);
+ return r;
+ }
+ /* videoData: title plus one Video_pages_data per entry of "pages" */
+ Multipage_video_data *videoData = &multipage_struct->videoData;
+ cJSON *title_obj = cJSON_GetObjectItem(videoData_obj, "title");
+ cJSON *pages_obj = cJSON_GetObjectItem(videoData_obj, "pages");
+ if (title_obj && pages_obj) {
+ videoData->title = title_obj->valuestring;
+ DEBUG_PRINT("videoData.title: %s\n", videoData->title);
+ videoData->pages = create_array(sizeof(Video_pages_data),
+ cJSON_GetArraySize(pages_obj));
+ int i = 0;
+ cJSON *e;
+ cJSON_ArrayForEach(e, pages_obj) {
+ cJSON *cid_obj = cJSON_GetObjectItem(e, "cid");
+ cJSON *part_obj = cJSON_GetObjectItem(e, "part");
+ cJSON *page_obj = cJSON_GetObjectItem(e, "page");
+ if (cid_obj && part_obj && page_obj) {
+ Video_pages_data *page = get_element(&videoData->pages, i);
+ page->cid = cid_obj->valueint;
+ page->part = part_obj->valuestring;
+ page->page = page_obj->valueint;
+ DEBUG_PRINT("videoData.pages[%d].cid: %d\n", i, page->cid);
+ DEBUG_PRINT("videoData.pages[%d].part: %s\n", i, page->part);
+ DEBUG_PRINT("videoData.pages[%d].page: %d\n", i, page->page);
+ i++;
+ continue;
+ }
+ LOG("cJSON", "Read JSON.videodata.pages[%d] failed.\n", i);
+ return 1;
+ }
+ } else {
+ LOG("cJSON", "Read JSON.videodata failed.\n");
+ return 1;
+ }
+ } else {
+ r = 1; // also taken when cJSON_Parse produced no tree (json is NULL)
+ LOG("cJSON", "Parse pagedata JSON failed.\n");
+ }
+ }
+ return r;
+/*
+ * Parse the API response text `api_resp` into *dash.
+ *
+ * The parsed tree is stored in dash->json. The char * members written here
+ * (message, format, baseUrl, mimeType, codecs) are cJSON valuestring
+ * pointers into that tree, not copies, so they stay valid only until
+ * dash->json is deleted. Stream information is read from the top-level
+ * "data" object, or from "result" when "data" is absent.
+ *
+ * The optional "durl" key is not read.
+ *
+ * Returns 0 on success and 1 when a required key is missing. On failure
+ * *dash may be partially filled and nothing is released here; download()
+ * calls dash_cleanup() afterwards.
+ */
+static int get_dash(const char *api_resp, Dash *dash) {
+  dash->json = cJSON_Parse(api_resp);
+  cJSON *code_obj = cJSON_GetObjectItem(dash->json, "code");
+  cJSON *message_obj = cJSON_GetObjectItem(dash->json, "message");
+  cJSON *dashinfo_obj = cJSON_GetObjectItem(dash->json, "data");
+  /* cJSON_IsInvalid(NULL) is false, so a missing key must be tested for
+   * explicitly or the "result" fallback never runs. */
+  if (!dashinfo_obj || cJSON_IsInvalid(dashinfo_obj)) {
+    dashinfo_obj = cJSON_GetObjectItem(dash->json, "result");
+  }
+  if (!code_obj || !message_obj || !dashinfo_obj) {
+    LOG("cJSON", "Parse API resp_json failed.\n");
+    return 1;
+  }
+  dash->code = code_obj->valueint;
+  dash->message = message_obj->valuestring;
+
+  /* dashinfo: "data" or "result" */
+  DEBUG_PRINT("Key of dashinfo: %s\n", dashinfo_obj->string);
+  Dash_info *dashinfo = &dash->dashinfo;
+  cJSON *quality_obj = cJSON_GetObjectItem(dashinfo_obj, "quality");
+  cJSON *accept_description_obj =
+      cJSON_GetObjectItem(dashinfo_obj, "accept_description");
+  cJSON *accept_quality_obj =
+      cJSON_GetObjectItem(dashinfo_obj, "accept_quality");
+  cJSON *dash_streams_obj = cJSON_GetObjectItem(dashinfo_obj, "dash");
+  cJSON *format_obj = cJSON_GetObjectItem(dashinfo_obj, "format");
+  if (!quality_obj || !accept_description_obj || !accept_quality_obj ||
+      !dash_streams_obj || !format_obj) {
+    LOG("cJSON", "Read API resp_json.%s failed.\n", dashinfo_obj->string);
+    return 1;
+  }
+  dashinfo->quality = quality_obj->valueint;
+  DEBUG_PRINT("quality: %d\n", dashinfo->quality);
+  dashinfo->format = format_obj->valuestring;
+  DEBUG_PRINT("format: %s\n", dashinfo->format);
+
+  /* Size each array from the JSON array it is filled from. */
+  dashinfo->accept_description =
+      create_str_array(cJSON_GetArraySize(accept_description_obj));
+  str_array_t *ac_d = &dashinfo->accept_description;
+  cJSON *item;
+  int idx = 0;
+  cJSON_ArrayForEach(item, accept_description_obj) {
+    if (!cJSON_IsString(item) || !item->valuestring) {
+      LOG("cJSON", "Read API resp_json.%s.accept_description failed.\n",
+          dashinfo_obj->string);
+      return 1;
+    }
+    set_str_element(ac_d, idx, item->valuestring);
+    DEBUG_PRINT("accept_description[%d]: %s\n", idx,
+                get_str_element(ac_d, idx));
+    idx++;
+  }
+
+  dashinfo->accept_quality =
+      create_array(sizeof(int), cJSON_GetArraySize(accept_quality_obj));
+  generic_array_t *ac_q = &dashinfo->accept_quality;
+  idx = 0;
+  cJSON_ArrayForEach(item, accept_quality_obj) {
+    int *v = get_element(ac_q, idx);
+    *v = item->valueint;
+    DEBUG_PRINT("accept_quality[%d]: %d\n", idx, *v);
+    idx++;
+  }
+
+  cJSON *video_obj = cJSON_GetObjectItem(dash_streams_obj, "video");
+  cJSON *audio_obj = cJSON_GetObjectItem(dash_streams_obj, "audio");
+  if (!video_obj || !audio_obj) {
+    LOG("cJSON", "Read API resp_json.%s.dash failed.\n", dashinfo_obj->string);
+    return 1;
+  }
+  dashinfo->dash.video =
+      create_array(sizeof(Dash_stream), cJSON_GetArraySize(video_obj));
+  dashinfo->dash.audio =
+      create_array(sizeof(Dash_stream), cJSON_GetArraySize(audio_obj));
+
+  /* The video and audio lists share one entry layout; read both the same
+   * way, video first. */
+  cJSON *stream_lists[2] = {video_obj, audio_obj};
+  generic_array_t *targets[2] = {&dashinfo->dash.video, &dashinfo->dash.audio};
+  for (int k = 0; k < 2; k++) {
+    cJSON *dash_stream_obj = stream_lists[k];
+    generic_array_t *target = targets[k];
+    int i = 0;
+    cJSON *e;
+    cJSON_ArrayForEach(e, dash_stream_obj) {
+      cJSON *id_obj = cJSON_GetObjectItem(e, "id");
+      cJSON *baseUrl_obj = cJSON_GetObjectItem(e, "baseUrl");
+      cJSON *bandwidth_obj = cJSON_GetObjectItem(e, "bandwidth");
+      cJSON *mimeType_obj = cJSON_GetObjectItem(e, "mimeType");
+      cJSON *codecid_obj = cJSON_GetObjectItem(e, "codecid");
+      cJSON *codecs_obj = cJSON_GetObjectItem(e, "codecs");
+      if (!id_obj || !baseUrl_obj || !bandwidth_obj || !mimeType_obj ||
+          !codecid_obj || !codecs_obj) {
+        LOG("cJSON", "Read API resp_json.%s.dash.%s[%d] failed.\n",
+            dashinfo_obj->string, dash_stream_obj->string, i);
+        return 1;
+      }
+      Dash_stream *ds = get_element(target, i);
+      ds->id = id_obj->valueint;
+      ds->baseUrl = baseUrl_obj->valuestring;
+      ds->bandwidth = bandwidth_obj->valueint;
+      ds->mimeType = mimeType_obj->valuestring;
+      ds->codecid = codecid_obj->valueint;
+      ds->codecs = codecs_obj->valuestring;
+      DEBUG_PRINT("%s[%d].id: %d\n", dash_stream_obj->string, i, ds->id);
+      DEBUG_PRINT("%s[%d].baseUrl: %s\n", dash_stream_obj->string, i,
+                  ds->baseUrl);
+      DEBUG_PRINT("%s[%d].bandwidth: %d\n", dash_stream_obj->string, i,
+                  ds->bandwidth);
+      DEBUG_PRINT("%s[%d].mimeType: %s\n", dash_stream_obj->string, i,
+                  ds->mimeType);
+      DEBUG_PRINT("%s[%d].codecid: %d\n", dash_stream_obj->string, i,
+                  ds->codecid);
+      DEBUG_PRINT("%s[%d].codecs: %s\n", dash_stream_obj->string, i,
+                  ds->codecs);
+      i++;
+    }
+  }
+  return 0;
+}
+/*
+ * Read the page number from a `p=<digits>` parameter in `query`.
+ *
+ * On success *page receives the parsed number, or 1 when the pattern did
+ * not match (p1 is downloaded by default). Values above INT_MAX are clamped
+ * to INT_MAX, which the caller's range check then rejects.
+ *
+ * Returns the regex_match() result: 0 on success, non-zero on failure, in
+ * which case *page is left untouched.
+ */
+static int get_page_in_query(char *query, int *page) {
+  const char *pattern = "p=(\\d+)";
+  str_array_t results = {0};
+  int r = regex_match(query, (str_array_t){(char **)&pattern, 1}, &results);
+  if (r) {
+    return r;
+  }
+  *page = 1; // Download p1 by default
+  if (results.n && results.str[0]) {
+    long parsed = strtol(results.str[0], NULL, 10);
+    *page = parsed > INT_MAX ? INT_MAX : (int)parsed;
+  }
+  /* Release the matches, as get_multipagedata() does after its own
+   * successful regex_match() call. */
+  free_str_array(&results);
+  return 0;
+}
+/*
+ * Build the request URL for the current video and store it in
+ * bilibili_options->api as a newly malloc'ed string: BILIBILI_API followed
+ * by a query string made from aid, cid, bvid and `quality` (sent as qn).
+ *
+ * Returns 0 on success. Returns 1 when the query string does not fit the
+ * local buffer or when allocation fails; ->api is then not a usable URL
+ * (NULL after a failed allocation, unchanged after an overlong query).
+ */
+static int generate_api(Bilibili_options *bilibili_options, const int quality) {
+  char params[UCHAR_MAX];
+  int written = snprintf(
+      params, sizeof(params),
+      "avid=%d&cid=%d&bvid=%s&qn=%d&type=&otype=json&fourk=1&fnver=0&"
+      "fnval=2000",
+      bilibili_options->aid, bilibili_options->cid, bilibili_options->bvid,
+      quality);
+  /* A truncated query string would request the wrong resource. */
+  if (written < 0 || (size_t)written >= sizeof(params)) {
+    LOG("Bilibili", "API query string is too long.\n");
+    return 1;
+  }
+  size_t api_size = strlen(BILIBILI_API) + (size_t)written + 1;
+  bilibili_options->api = malloc(api_size);
+  if (!bilibili_options->api) {
+    LOG("Bilibili", "Out of memory while building the API URL.\n");
+    return 1;
+  }
+  snprintf(bilibili_options->api, api_size, "%s%s", BILIBILI_API, params);
+  return 0;
+}
+/*
+ * Derive a file extension from a MIME type such as "video/mp4".
+ *
+ * The input is split on '/'; runs of '/' count as one separator and leading
+ * separators are ignored. The second component is returned ("a/b/c" gives
+ * "b"). When there is no second component, or mimeType is NULL, "mp4" is
+ * returned as the default.
+ *
+ * The result may point into a static buffer: it is overwritten by the next
+ * call and the function is not safe to call from several threads at once.
+ * Input longer than the buffer is truncated instead of overflowing it.
+ */
+static const char *mimeType2ext(const char *mimeType) {
+  static char mimeType_l[CHAR_MAX];
+  if (!mimeType) {
+    return "mp4"; // Cannot parse, use default
+  }
+  snprintf(mimeType_l, sizeof(mimeType_l), "%s", mimeType);
+
+  /* First component: skip leading separators, then run to the next one. */
+  char *cursor = mimeType_l + strspn(mimeType_l, "/");
+  cursor += strcspn(cursor, "/");
+  /* Second component starts after the separator run. */
+  cursor += strspn(cursor, "/");
+  if (*cursor == '\0') {
+    return "mp4"; // Cannot parse, use default
+  }
+  cursor[strcspn(cursor, "/")] = '\0';
+  return cursor;
+}
+/*
+ * Look up the display label for a numeric quality id.
+ *
+ * Returns a pointer to a string literal; ids that are not in the table
+ * yield "Unknown resolution".
+ */
+static const char *id2quality_desc(int id) {
+  static const struct {
+    int id;
+    const char *desc;
+  } labels[] = {
+      {127, "超高清 8K"},   {120, "超清 4K"},   {112, "高清 1080P+"},
+      {80, "高清 1080P"},   {48, "高清 720P"},  {32, "清晰 480P"},
+      {16, "流畅 360P"},
+  };
+  for (size_t k = 0; k < sizeof(labels) / sizeof(labels[0]); k++) {
+    if (labels[k].id == id) {
+      return labels[k].desc;
+    }
+  }
+  return "Unknown resolution";
+}
+/*
+ * Release what get_multipagedata() stored in *multipage_struct: the
+ * episode array of every section, the section array, the page array and
+ * finally the cJSON tree, whose pointer is reset to NULL.
+ */
+static void multipage_cleanup(Multipage *multipage_struct) {
+  generic_array_t *sections = &multipage_struct->sections;
+  unsigned short idx = 0;
+  while (idx < sections->n) {
+    Multi_episode_data *entry = get_element(sections, idx);
+    free_array(&entry->episodes);
+    idx++;
+  }
+  free_array(sections);
+  free_array(&multipage_struct->videoData.pages);
+
+  cJSON_Delete(multipage_struct->json);
+  multipage_struct->json = NULL;
+}
+/*
+ * Release what get_dash() stored in *dash: the cJSON tree, the
+ * accept_description and accept_quality arrays, and the video and audio
+ * stream arrays. The json pointer is reset to NULL, matching
+ * multipage_cleanup(), so a stale tree pointer cannot be reused.
+ */
+static void dash_cleanup(Dash *dash) {
+  cJSON_Delete(dash->json);
+  dash->json = NULL;
+  free_str_array(&dash->dashinfo.accept_description);
+  free_array(&dash->dashinfo.accept_quality);
+  free_array(&dash->dashinfo.dash.video);
+  free_array(&dash->dashinfo.dash.audio);
+}
+/*
+ * Fetch bilibili_options->api, parse the response with get_dash() and queue
+ * the first video stream and the first audio stream with add_url().
+ *
+ * Output names have the form "<title>[<quality label>]-video.<ext>" and
+ * "<title>[<quality label>]-audio.<ext>"; the quality label of the video
+ * stream is used for both.
+ *
+ * Returns 0 on success and 1 when the request fails, the response cannot be
+ * parsed, or either stream list is empty. The response buffer and the
+ * parsed Dash data are released on every path.
+ */
+static int download(Bilibili_options *bilibili_options) {
+  Dash dash = {0};
+  char *resp = NULL;
+  int rc = 1;
+
+  /* get() reports failure with a non-zero result, as checked in
+   * bilibili_extract(); without this check resp would be used unset. */
+  if (get(bilibili_options->api, &resp)) {
+    LOG("Bilibili", "Request API failed.\n");
+    goto cleanup;
+  }
+  if (get_dash(resp, &dash)) {
+    LOG("Bilibili", "Get dash failed.");
+    goto cleanup;
+  }
+  if (!dash.dashinfo.dash.video.n || !dash.dashinfo.dash.audio.n) {
+    LOG("Bilibili", "No video or audio stream in API response.\n");
+    goto cleanup;
+  }
+
+  {
+    // Download the highest resolution (original author's note: taken to be
+    // element 0 of each list)
+    Dash_stream *video = get_element(&dash.dashinfo.dash.video, 0);
+    Dash_stream *audio = get_element(&dash.dashinfo.dash.audio, 0);
+    const char *quality_desc = id2quality_desc(video->id);
+    char fn[USHRT_MAX];
+
+    snprintf(fn, sizeof(fn), "%s[%s]-%s.%s", bilibili_options->title,
+             quality_desc, "video", mimeType2ext(video->mimeType));
+    add_url(video->baseUrl, NULL, fn, "");
+
+    snprintf(fn, sizeof(fn), "%s[%s]-%s.%s", bilibili_options->title,
+             quality_desc, "audio", mimeType2ext(audio->mimeType));
+    add_url(audio->baseUrl, NULL, fn, "");
+  }
+  rc = 0;
+
+cleanup:
+  free_and_nullify(resp);
+  dash_cleanup(&dash);
+  return rc;
+}
+/*
+ * Entry point of the Bilibili extractor.
+ *
+ * Downloads options->URL into options->pagedata, parses the embedded page
+ * state, picks the page named by the `p=` parameter of options->query
+ * (page 1 when absent), builds the API URL for quality id 127 and hands it
+ * to download().
+ *
+ * Nothing is returned. Failures of the first three steps are reported
+ * through append_log(). Everything allocated here is released before
+ * returning; options->pagedata is left for the caller.
+ */
+void bilibili_extract(struct options *options) {
+  Multipage multipage_struct = {0};
+  Bilibili_options bilibili_options = {options->URL};
+  Video_pages_data *page;
+  int p = 1;
+
+  if (get(options->URL, &options->pagedata)) {
+    append_log("[Bilibili] Download pagedata failed.\n");
+    return;
+  }
+  bilibili_options.html = options->pagedata;
+
+  if (get_multipagedata(options->pagedata, &multipage_struct,
+                        &bilibili_options.is_page)) {
+    append_log("[Bilibili] Parse pagedata failed.\n");
+    goto cleanup_multipage;
+  }
+
+  /* p < 1 is tested first, so p is positive when compared with the count. */
+  if (get_page_in_query(options->query, &p) || p < 1 ||
+      p > multipage_struct.videoData.pages.n) {
+    append_log("[Bilibili] Parse query failed.\n");
+    goto cleanup_multipage;
+  }
+
+  page = get_element(&multipage_struct.videoData.pages, p - 1);
+  bilibili_options.aid = multipage_struct.aid;
+  bilibili_options.bvid = multipage_struct.bvid;
+  bilibili_options.cid = page->cid;
+  bilibili_options.page = p;
+  bilibili_options.title = multipage_struct.videoData.title;
+  DEBUG_PRINT("aid: %d\n", bilibili_options.aid);
+  DEBUG_PRINT("bvid: %s\n", bilibili_options.bvid);
+  DEBUG_PRINT("cid: %d\n", bilibili_options.cid);
+  DEBUG_PRINT("is_page: %s\n", bilibili_options.is_page ? "yes" : "no");
+  DEBUG_PRINT("page: %d\n", bilibili_options.page);
+  DEBUG_PRINT("title: %s\n", bilibili_options.title);
+
+  if (generate_api(&bilibili_options, 127)) {
+    goto cleanup_api;
+  }
+  DEBUG_PRINT("Generated API: %s\n", bilibili_options.api);
+
+  /* Success and failure of download() need the same cleanup. */
+  (void)download(&bilibili_options);
+
+cleanup_api:
+  free_and_nullify(bilibili_options.api);
+cleanup_multipage:
+  multipage_cleanup(&multipage_struct);
+}
diff --git a/src/extractors/bilibili.h b/src/extractors/bilibili.h
new file mode 100644
index 0000000..5d609b0
--- /dev/null
+++ b/src/extractors/bilibili.h
@@ -0,0 +1,94 @@
+#ifndef BILIBILI_H_
+#define BILIBILI_H_
+#include "../utils.h"
+#include "extractor.h"
+#include <stddef.h>
+#define BILIBILI_API ""
+typedef struct video_pages_data { // one entry of videoData.pages in the page JSON
+ int cid;
+ char *part; // set from the "part" key; points into the parsed cJSON tree
+ int page;
+} Video_pages_data;
+typedef struct multipage_video_data { // the "videoData" object of the page JSON
+ char *title;
+ generic_array_t pages; // elements are Video_pages_data
+} Multipage_video_data;
+typedef struct episode { // one entry of sections[].episodes in the page JSON
+ int aid;
+ char *bvid;
+ int cid;
+ char *title;
+} Episode;
+typedef struct multi_episode_data { // one entry of "sections" in the page JSON
+ int season_id;
+ generic_array_t episodes; // elements are Episode
+} Multi_episode_data;
+typedef struct multipage { // everything get_multipagedata() extracts from a page
+ int aid;
+ char *bvid;
+ generic_array_t sections; // elements are Multi_episode_data
+ Multipage_video_data videoData;
+ cJSON *json; // parsed tree that the char * members above point into
+} Multipage;
+typedef struct bilibili_options { // per-URL working state of bilibili_extract()
+ char *url; // initialised from options->URL
+ char *html; // set to options->pagedata
+ char *api; // malloc'ed by generate_api(), freed by bilibili_extract()
+ char *cookie; // not referenced by bilibili.c in this diff
+ bool is_bangumi; // not referenced by bilibili.c in this diff
+ bool is_page; // set by get_multipagedata() when "sections" is empty
+ int aid;
+ int cid;
+ char *bvid;
+ int page; // 1-based page number taken from the query string
+ char *title;
+} Bilibili_options;
+typedef struct durl { // not referenced by bilibili.c in this diff
+ char *url;
+ size_t size;
+} Durl;
+typedef struct dash_stream { // one entry of dash.video or dash.audio in the API response
+ int id; // passed to id2quality_desc() for the video stream
+ char *baseUrl; // URL handed to add_url()
+ int bandwidth;
+ char *mimeType; // passed to mimeType2ext() to pick the file extension
+ int codecid;
+ char *codecs;
+} Dash_stream;
+typedef struct dash_streams { // the "dash" object of the API response
+ generic_array_t video; // elements are Dash_stream
+ generic_array_t audio; // elements are Dash_stream
+} Dash_streams;
+typedef struct dash_info { // the "data" (or "result") object of the API response
+ int quality;
+ str_array_t accept_description;
+ generic_array_t accept_quality; // elements are int
+ Dash_streams dash;
+ char *format;
+ generic_array_t durl; // not populated by get_dash() in this diff
+} Dash_info;
+typedef struct dash { // top level of the API response
+ int code;
+ char *message;
+ Dash_info dashinfo;
+ cJSON *json; // parsed tree that the char * members point into
+} Dash;
+void bilibili_extract(struct options *);
diff --git a/src/extractors/extractor.c b/src/extractors/extractor.c
new file mode 100644
index 0000000..91f34d2
--- /dev/null
+++ b/src/extractors/extractor.c
@@ -0,0 +1,24 @@
+#include <stdlib.h>
+#include "bilibili.h"
+#include "extractor.h"
+Site_map site_map = {{{"", SITE_BILIBILI}}, 1};
+void options_cleanup(Options *options) { // Pass each of the four char * members of *options to free_and_nullify(); the Options object itself is not released here.
+ free_and_nullify(options->URL);
+ free_and_nullify(options->path);
+ free_and_nullify(options->query);
+ free_and_nullify(options->pagedata); // filled by get() in bilibili_extract()
+/*
+ * Run the extractor selected by options->site, then release the strings
+ * held by the options with options_cleanup().
+ *
+ * v: pointer to an Options object, passed as void *.
+ *    NOTE(review): the int (*)(void *) signature looks intended for use as
+ *    a thread start function - confirm against the caller.
+ *
+ * Always returns 0.
+ */
+int extract(void *v) {
+  Options *options = (Options *)v;
+  switch (options->site) {
+  case SITE_BILIBILI:
+    /* Without a case label this call is unreachable inside the switch. */
+    bilibili_extract(options);
+    break;
+  default:
+    break;
+  }
+  options_cleanup(options);
+  return 0;
+}
diff --git a/src/extractors/extractor.h b/src/extractors/extractor.h
new file mode 100644
index 0000000..d3ebeec
--- /dev/null
+++ b/src/extractors/extractor.h
@@ -0,0 +1,32 @@
+#ifndef EXTRACTOR_H_
+#define EXTRACTOR_H_
+#include <cjson/cJSON.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <stddef.h>
+enum site { SITE_BILIBILI }; // identifiers of the supported sites; only one exists in this diff
+typedef enum site site_t;
+typedef struct site_map { // table pairing a domain string with a site id
+ struct {
+ char domain[SHRT_MAX]; // fixed-size buffer of SHRT_MAX bytes per entry
+ site_t site;
+ } pairs[1];
+ unsigned char size; // initialised to 1 by the site_map definition in extractor.c
+} Site_map;
+typedef struct options { // per-URL job description handed to extract()
+ site_t site; // selects the extractor in extract()
+ char *URL; // downloaded by bilibili_extract()
+ char *path;
+ char *query; // searched for a p=<n> page parameter
+ char *pagedata; // filled with the downloaded page by get()
+} Options;
+void options_cleanup(Options*);
+int extract(void *);