path: root/src/process_url.c
author     Mole Shang <[email protected]>  2023-07-25 09:27:26 +0800
committer  Mole Shang <[email protected]>  2023-08-05 23:19:46 +0800
commit     ed8f6df90b0c39835198d5b7af4bbd391362f180 (patch)
tree       907ba31bac854eb5dc8a2781825e24c049b10580 /src/process_url.c
download   hinata-ed8f6df90b0c39835198d5b7af4bbd391362f180.tar.gz
           hinata-ed8f6df90b0c39835198d5b7af4bbd391362f180.tar.bz2
           hinata-ed8f6df90b0c39835198d5b7af4bbd391362f180.zip
hinata: initial commit
Diffstat (limited to 'src/process_url.c')
-rw-r--r--  src/process_url.c  526
1 file changed, 526 insertions, 0 deletions
diff --git a/src/process_url.c b/src/process_url.c
new file mode 100644
index 0000000..4bfce8d
--- /dev/null
+++ b/src/process_url.c
@@ -0,0 +1,526 @@
+#include <curl/curl.h>
+#include <curl/easy.h>
+#include <curl/header.h>
+#include <curl/system.h>
+#include <curl/urlapi.h>
+#include <limits.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#ifdef _WIN32
+#include "c11threads.h"
+#else
+#include <threads.h>
+#endif
+
+#include "nuklear.h"
+
+#include "constants.h"
+#include "extractors/extractor.h"
+#include "logger.h"
+#include "process_url.h"
+
+/* NOTICE: the global curl_conf pointer is only valid while a download is in
+ * progress; at all other times it MUST be reset to NULL. */
+static curl_conf_t *curl_conf;
+extern Site_map site_map;
+Options options;
+static queue_t dl_queue;
+
+thrd_t tid[MAX_THREAD];
+mtx_t mtx;
+cnd_t cnd;
+bool corrupted;
+static const char *outdir_g, *referer_g;
+static CURLU *h;
+
+/* NOTE: Use logerr(X) (defined as a generic macro) to log errors. */
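+/* Since logerr_b, logerr_h and logerr_u below each take a different libcurl
+ * error type, a C11 _Generic selection can dispatch on the argument type.
+ * The real definition lives in logger.h; this sketch is an assumption of
+ * its shape, not a copy of it (it also assumes the three enum types are
+ * distinct to the compiler):
+ *
+ *   #define logerr(r) _Generic((r),            \
+ *       CURLcode:  logerr_b,                   \
+ *       CURLHcode: logerr_h,                   \
+ *       CURLUcode: logerr_u)(r)
+ */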
+static bool logerr_b(CURLcode r) {
+ if (r && !corrupted) {
+ LOG("libcurl", "Error %d: %s\n", r, ERRTOSTRING(r));
+ corrupted = true;
+ }
+ return r;
+}
+
+static bool logerr_h(CURLHcode r) {
+ if (r) {
+ const char *err_str;
+ switch (r) {
+ case CURLHE_BADINDEX:
+ err_str = "header exists but not with this index";
+ break;
+  case CURLHE_MISSING:
+    // A missing header is tolerated: log at debug level only
+    err_str = "no such header exists";
+    DEBUG_PRINT("Header Error %d: %s\n", r, err_str);
+    return r;
+ case CURLHE_NOHEADERS:
+ err_str = "no headers at all exist (yet)";
+ break;
+ case CURLHE_NOREQUEST:
+ err_str = "no request with this number was used";
+ break;
+ case CURLHE_OUT_OF_MEMORY:
+ err_str = "out of memory while processing";
+ break;
+ case CURLHE_BAD_ARGUMENT:
+ err_str = "a function argument was not okay";
+ break;
+  case CURLHE_NOT_BUILT_IN:
+    err_str = "the header API was disabled in the build";
+    break;
+ default:
+ err_str = "unknown error";
+ break;
+ }
+ LOG("libcurl", "Header Error %d: %s\n", r, err_str);
+ corrupted = true;
+ }
+ return r;
+}
+
+static bool logerr_u(CURLUcode r) {
+ switch (r) {
+ case CURLUE_NO_QUERY:
+ // Accept no query
+ DEBUG_PRINT("The URL has no query.\n");
+ break;
+ case 0:
+ break;
+ default:
+ LOG("libcurl", "Parse Error %d: Invalid URL\n", r);
+ break;
+ }
+ return r;
+}
+
+static void curl_easy_setcommonopts(CURL *curl) {
+ curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
+ curl_easy_setopt(curl, CURLOPT_AUTOREFERER, 1L);
+ curl_easy_setopt(
+ curl, CURLOPT_USERAGENT,
+ "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/116.0");
+ curl_easy_setopt(curl, CURLOPT_REFERER, referer_g);
+  /* enable all supported built-in compressions,
+   * since several sites enable gzip encoding */
+ curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "");
+}
+
+static int progress_callback(void *clientp, curl_off_t dltotal,
+ curl_off_t dlnow, curl_off_t ultotal,
+ curl_off_t ulnow) {
+ thrd_info_t *ti = (thrd_info_t *)clientp;
+ ti->curl_c->dlnow_per_thrd[ti->no] = dlnow;
+ if (ti->curl_c->total_thrd == 1) {
+ ti->curl_c->dltotal = dltotal;
+ }
+ // Return non-zero to abort download
+ return corrupted;
+}
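+/* Note on the callback above: libcurl only invokes it because pull_part()
+ * clears CURLOPT_NOPROGRESS, and a non-zero return makes the transfer fail
+ * with CURLE_ABORTED_BY_CALLBACK, which is how a corrupted/cancelled state
+ * tears down every worker thread. The per-thread byte counts recorded here
+ * are summed later in poll_status(). */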
+
+static size_t write2str(void *ptr, size_t size, size_t nmemb, str_data_t *s) {
+  size_t new_len = s->len + size * nmemb;
+  char *grown = realloc(s->string, new_len + 1);
+  if (grown == NULL) {
+    // A short return count tells libcurl to fail the transfer
+    // (CURLE_WRITE_ERROR) instead of writing through a stale buffer.
+    return 0;
+  }
+  s->string = grown;
+  memcpy(s->string + s->len, ptr, size * nmemb);
+  s->string[new_len] = '\0';
+  s->len = new_len;
+
+  return size * nmemb;
+}
+
+static int parse_url(const char *URL, const char *outdir, char *fn) {
+ CURLUcode ue = logerr(curl_url_set(h, CURLUPART_URL, URL, 0));
+ if (ue && ue != CURLUE_NO_QUERY) {
+ return 1;
+ }
+  char *domain = NULL, *path = NULL, *query = NULL;
+
+  if (ue == CURLUE_NO_QUERY) {
+    query = NULL;
+  } else {
+    ue = logerr(curl_url_get(h, CURLUPART_QUERY, &query, 0));
+  }
+  ue = logerr(curl_url_get(h, CURLUPART_HOST, &domain, 0));
+  if (ue) {
+    curl_free(query); // avoid leaking an already-fetched query part
+    return 1;
+  }
+  ue = logerr(curl_url_get(h, CURLUPART_PATH, &path, 0));
+  if (ue) {
+    curl_free(domain);
+    curl_free(query);
+    return 1;
+  }
+
+ DEBUG_PRINT("Domain: %s\n", domain);
+ DEBUG_PRINT("Path: %s\n", path);
+  DEBUG_PRINT("Query: %s\n", query ? query : "(none)");
+
+ for (unsigned short i = 0; i < site_map.size; i++) {
+ if (!strcmp(domain, site_map.pairs[i].domain)) {
+ append_log("Got site: %s\n", domain);
+ thrd_t t;
+ options.site = site_map.pairs[i].site;
+ options.URL = malloc(strlen(domain) + strlen(path) + 10);
+ sprintf(options.URL, "https://%s%s", domain, path);
+ options.path = malloc(strlen(path) + 1);
+ strcpy(options.path, path);
+ if (query) {
+ options.query = malloc(strlen(query) + 1);
+ strcpy(options.query, query);
+ } else {
+ options.query = calloc(1, sizeof(char));
+ }
+
+ append_log("pagedata URL: %s\n", options.URL);
+
+ thrd_create(&t, extract, &options);
+ thrd_detach(t);
+
+      curl_free(domain);
+      curl_free(path);
+      curl_free(query);
+      free_and_nullify(fn); // a caller-provided filename is unused on this path
+      return 0;
+    }
+ }
+
+  curl_conf_t *curl_c = malloc(sizeof(curl_conf_t));
+  curl_c->outfn[0] = '\0'; // so the infer-failure check below sees an empty string
+  curl_c->URL = malloc(strlen(URL) + 1);
+  strcpy(curl_c->URL, URL);
+
+ /* filename */
+
+ if (fn == NULL) {
+ const char *patterns_str[1] = {"(?:.+\\/)([^#/?]+)"};
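+    /* Illustrative example (not from the code): for a path such as
+     * "/videos/clip.mp4", the non-capturing group eats "/videos/" and the
+     * capture group yields "clip.mp4"; '#', '/' and '?' are excluded so
+     * fragments and query strings never leak into the filename. */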
+ str_array_t results = create_str_array(0);
+ const str_array_t patterns = {(char **)patterns_str, 1};
+ regex_match(path, patterns, &results);
+ for (unsigned short i = 0; i < results.n; i++) {
+ if (results.str[i]) {
+ DEBUG_PRINT("[%d] %s\n", i, results.str[i]);
+ sprintf(curl_c->outfn, "%s%s%s", outdir,
+ outdir[strlen(outdir) - 1] == SPLITTER_CHAR ? "" : SPLITTER_STR,
+ results.str[i]);
+ }
+ }
+ free_str_array(&results);
+    if (curl_c->outfn[0] == '\0') {
+      LOG("libcurl",
+          "Infer filename failed, please specify a valid filename.\n");
+      free_and_nullify(curl_c->URL);
+      free(curl_c);
+      curl_free(domain);
+      curl_free(path);
+      curl_free(query);
+      return 1;
+    }
+ } else {
+ sprintf(curl_c->outfn, "%s%s%s", outdir,
+ outdir[strlen(outdir) - 1] == SPLITTER_CHAR ? "" : SPLITTER_STR,
+ fn);
+ free_and_nullify(fn);
+ }
+ DEBUG_PRINT("File will be saved as: %s\n", curl_c->outfn);
+ DEBUG_PRINT("Got regular URL: %s\n", curl_c->URL);
+
+ enqueue(&dl_queue, (void *)curl_c);
+
+ curl_free(domain);
+ curl_free(path);
+ curl_free(query);
+
+ return 0;
+}
+
+static bool get_info(const char *URL, curl_off_t *psize) {
+ CURL *curl;
+ long resp_code;
+ bool support_range = false;
+ struct curl_header *pch;
+ curl = curl_easy_init();
+ curl_easy_setopt(curl, CURLOPT_URL, URL);
+ curl_easy_setcommonopts(curl);
+ curl_easy_setopt(curl, CURLOPT_NOBODY, 1L);
+ curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, NULL);
+ CURLcode r = curl_easy_perform(curl);
+ if (logerr(r)) {
+ curl_easy_cleanup(curl);
+ return support_range;
+ }
+  r = curl_easy_getinfo(curl, CURLINFO_CONTENT_LENGTH_DOWNLOAD_T, psize);
+ if (logerr(r)) {
+ curl_easy_cleanup(curl);
+ return support_range;
+ }
+ CURLHcode rh =
+ curl_easy_header(curl, "Accept-Ranges", 0, CURLH_HEADER, -1, &pch);
+ if (logerr(rh) || strcmp(pch->value, "bytes")) {
+ curl_easy_cleanup(curl);
+ return support_range;
+ }
+ char *ct = NULL;
+ r = curl_easy_getinfo(curl, CURLINFO_CONTENT_TYPE, &ct);
+ if (logerr(r)) {
+ curl_easy_cleanup(curl);
+ return support_range;
+ }
+
+ support_range = true;
+ curl_easy_cleanup(curl);
+ return support_range;
+}
+
+static int pull_part(void *a) {
+ CURLcode res;
+ thrd_info_t *ti = (thrd_info_t *)a;
+ curl_conf_t *curl_c = ti->curl_c;
+ unsigned char n = ti->no;
+  // Here we need to manually control str_array_t;
+  // reserve room for '.', up to 3 digits of n, and the NUL terminator
+  curl_c->partfn.str[n] = malloc(strlen(curl_c->outfn) + 6);
+  sprintf(curl_c->partfn.str[n], "%s.%d", curl_c->outfn, n);
+ DEBUG_PRINT("[THRD %hhu] partfn: %s, range: %s\n", n,
+ get_str_element(&curl_c->partfn, n), ti->range);
+  {
+    curl_c->fplist[n] = fopen(get_str_element(&curl_c->partfn, n), "wb+");
+    if (curl_c->fplist[n] == NULL) {
+      LOG("libcurl", "Cannot open %s for writing.\n",
+          get_str_element(&curl_c->partfn, n));
+      corrupted = true;
+      res = CURLE_WRITE_ERROR;
+    } else {
+      CURL *curl = curl_easy_init();
+      curl_easy_setopt(curl, CURLOPT_URL, curl_c->URL);
+      curl_easy_setcommonopts(curl);
+      /* give up on transfers that stay below 30 bytes/s for 60 seconds */
+      curl_easy_setopt(curl, CURLOPT_LOW_SPEED_TIME, 60L);
+      curl_easy_setopt(curl, CURLOPT_LOW_SPEED_LIMIT, 30L);
+      curl_easy_setopt(curl, CURLOPT_WRITEDATA, curl_c->fplist[n]);
+      curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
+      if (ti->curl_c->total_thrd != 1) {
+        curl_easy_setopt(curl, CURLOPT_RANGE, ti->range);
+      }
+      curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, progress_callback);
+      curl_easy_setopt(curl, CURLOPT_XFERINFODATA, ti);
+      res = curl_easy_perform(curl);
+      rewind(curl_c->fplist[n]);
+      append_log("[THRD %hhu] File downloaded.\n", n);
+      curl_easy_cleanup(curl);
+      logerr(res);
+    }
+  }
+ mtx_lock(&mtx);
+ curl_c->success_thrd += 1;
+ cnd_signal(&cnd); // Unblocks the waiting cleanup thread. If no threads are
+ // blocked, does nothing and returns thrd_success.
+
+ mtx_unlock(&mtx);
+ return (int)res;
+}
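+/* Shutdown handshake (as wired above): each worker increments success_thrd
+ * under mtx and signals cnd; curl_cleanup() and poll_status() compare
+ * success_thrd against total_thrd to learn when every part is finished. */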
+
+static int merge_and_cleanup(curl_conf_t *curl_c) {
+ if (corrupted) {
+ append_log("Cancelling...\n");
+ } else {
+ append_log("Merging files...\n");
+ }
+
+  FILE *fop;
+  fop = fopen(curl_c->outfn, "wb");
+  if (fop == NULL) {
+    // User quit before the download started, so there is nothing to merge
+    return 1;
+  }
+ for (unsigned short i = 0; i < curl_c->total_thrd; i++) {
+ if (!corrupted) {
+ char buffer[1024];
+ size_t bytesRead = 0;
+ while ((bytesRead = fread(buffer, 1, sizeof(buffer), curl_c->fplist[i])) >
+ 0) {
+ fwrite(buffer, 1, bytesRead, fop);
+ }
+ }
+    if (curl_c->fplist[i]) {
+      fclose(curl_c->fplist[i]);
+    }
+ if (remove(get_str_element(&curl_c->partfn, i)) != 0) {
+ append_log("Error deleting partial file %s\n",
+ get_str_element(&curl_c->partfn, i));
+ }
+ }
+ fclose(fop);
+
+ if (corrupted) {
+ // Also delete dst file
+ if (remove(curl_c->outfn) != 0) {
+ append_log("Error deleting file %s\n", curl_c->outfn);
+ }
+ }
+ // Reset stat
+ corrupted = false;
+ curl_c->success_thrd = 0;
+ curl_c->total_thrd = 0;
+ free_and_nullify(curl_c->URL);
+
+ return 0;
+}
+
+static int download(curl_conf_t *curl_c) {
+  /* Reset thrd info. */
+  curl_c->success_thrd = 0;
+
+ CURL *curl;
+ curl_off_t cl = 0L, begin = 0L, end;
+
+ static thrd_info_t thrd_info[MAX_THREAD] = {0};
+
+ bool support_range = get_info(curl_c->URL, &cl);
+  DEBUG_PRINT("Size: %" CURL_FORMAT_CURL_OFF_T " bytes.\n", cl);
+ if (support_range && cl > 0L) {
+ curl_c->dltotal = cl;
+ curl_c->total_thrd = (unsigned char)CEIL_DIV(cl, MAX_THREAD_SIZE);
+ if (curl_c->total_thrd > MAX_THREAD) {
+ curl_c->total_thrd = MAX_THREAD;
+ }
+ LOG("libcurl", "Server supports range header, setting threads to %hhu\n",
+ curl_c->total_thrd);
+ } else {
+ LOG("libcurl", "Server doesn't claim range header "
+ "support, falling back to single thread.\n");
+ curl_c->total_thrd = 1;
+ }
+ curl_off_t size_per_thrd = (cl / curl_c->total_thrd);
+
+ curl_c->partfn = create_str_array(curl_c->total_thrd);
+
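+  /* Worked example (illustrative numbers): cl = 100 bytes and
+   * total_thrd = 3 give size_per_thrd = 33, so the loop below requests
+   * ranges "0-32" and "33-65", and the last thread takes the remainder
+   * with "66-99". */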
+ for (unsigned char i = 0; i < curl_c->total_thrd; i++) {
+ curl_off_t chunk_size;
+ thrd_info[i].no = i;
+ if (i + 1 == curl_c->total_thrd)
+ chunk_size = cl - (curl_c->total_thrd - 1) * size_per_thrd;
+ else
+ chunk_size = size_per_thrd;
+ end = begin + chunk_size - 1;
+ if (curl_c->total_thrd != 1) {
+ sprintf(thrd_info[i].range,
+ "%" CURL_FORMAT_CURL_OFF_T "-%" CURL_FORMAT_CURL_OFF_T, begin,
+ end);
+ }
+ thrd_info[i].curl_c = curl_c;
+ int error = thrd_create(&tid[i], pull_part, &thrd_info[i]);
+    if (error)
+      append_log("Couldn't create thread %d, error %d\n", i, error);
+ begin = end + 1;
+ }
+ return 0;
+}
+
+void curl_init(curl_conf_t *curl) {
+  (void)curl; /* currently unused */
+ curl_global_init(CURL_GLOBAL_ALL);
+ h = curl_url();
+ dl_queue = create_queue();
+ mtx_init(&mtx, mtx_plain);
+ cnd_init(&cnd);
+}
+
+void curl_cleanup(status_t *stat) {
+  /* We only need to wait on
+   * the currently active download threads. */
+  if (curl_conf) {
+
+    corrupted = true; // In case libcurl is still downloading
+    /* Now wait for all threads to cancel... */
+    mtx_lock(&mtx);
+    while (curl_conf->success_thrd != curl_conf->total_thrd) {
+      cnd_wait(&cnd, &mtx);
+    }
+    mtx_unlock(&mtx);
+    if (!stat->is_done) {
+      merge_and_cleanup(curl_conf);
+    }
+  }
+  /* mtx and cnd are initialized unconditionally in curl_init(),
+   * so destroy them unconditionally as well. */
+  mtx_destroy(&mtx);
+  cnd_destroy(&cnd);
+ free_queue(&dl_queue);
+ curl_url_cleanup(h);
+ curl_global_cleanup();
+}
+
+void poll_status(status_t *stat) {
+ if (!is_empty_queue(&dl_queue) && stat->is_done) {
+    /* stat->is_done signals that the previous task
+     * (extractor or download) has finished. */
+    curl_conf = (curl_conf_t *)dequeue(&dl_queue);
+    if (download(curl_conf)) {
+      // Something went wrong when creating the download task
+      DEBUG_PRINT("Creating download task failed.\n");
+    }
+ stat->is_done = false;
+ }
+ if (curl_conf) {
+ curl_conf->dlnow = 0L;
+ for (unsigned char i = 0; i < curl_conf->total_thrd; i++) {
+ curl_conf->dlnow += curl_conf->dlnow_per_thrd[i];
+ }
+ stat->cur = curl_conf->dlnow;
+ stat->total = curl_conf->dltotal;
+ DEBUG_PRINT("success_thrd: %hhu, total_thrd: %hhu, is_done: %s\n",
+ curl_conf->success_thrd, curl_conf->total_thrd,
+ stat->is_done ? "yes" : "no");
+ mtx_lock(&mtx);
+ if (curl_conf->success_thrd == curl_conf->total_thrd &&
+ (curl_conf->total_thrd && !stat->is_done)) {
+ stat->is_done = true;
+ for (unsigned short i = 0; i < curl_conf->total_thrd; i++) {
+ int r;
+ thrd_join(tid[i], &r);
+ }
+ merge_and_cleanup(curl_conf);
+ append_log("Download %s finished.\n", curl_conf->outfn);
+ curl_conf = NULL;
+ }
+ mtx_unlock(&mtx);
+ }
+}
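+/* poll_status() is non-blocking by design: it dequeues at most one pending
+ * download, then refreshes stat with the aggregated per-thread progress.
+ * (Assumption: it is called repeatedly, e.g. once per UI frame.) */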
+
+int get(const char *URL, char **pdstr) {
+ CURL *curl = curl_easy_init();
+ str_data_t pagedata = {0};
+ pagedata.string = malloc(1);
+ pagedata.string[0] = '\0';
+ curl_easy_setopt(curl, CURLOPT_URL, URL);
+ curl_easy_setcommonopts(curl);
+ curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write2str);
+ curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&pagedata);
+ CURLcode res = logerr(curl_easy_perform(curl));
+  *pdstr = malloc(pagedata.len + 1);
+  strcpy(*pdstr, pagedata.string);
+  free(pagedata.string); // the page buffer is no longer needed once copied out
+  curl_easy_cleanup(curl);
+ return res;
+}
+
+/* Add a URL to dl_queue.
+ * - If outdir is NULL or an empty string, reuse the cached outdir_g
+ * - If fn is NULL or an empty string, infer the filename from the URL
+ *   (on failure, log an error and abort the task)
+ * - If referer is NULL or an empty string, use NULL */
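+/* A hypothetical call (URL and paths are illustrative only):
+ *
+ *   add_url("https://example.com/files/video.mp4", "/tmp/downloads",
+ *           NULL, NULL);
+ *
+ * would cache "/tmp/downloads" in outdir_g, clear the referer, and let
+ * parse_url() infer "video.mp4" as the output filename. */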
+void add_url(const char *URL, const char *outdir, const char *fn,
+ const char *referer) {
+ if (outdir && outdir[0] != '\0') {
+ outdir_g = outdir;
+ }
+ referer_g = referer;
+ if (referer && referer[0] == '\0') {
+ referer_g = NULL;
+ }
+ DEBUG_PRINT("referer_g: %s\n", referer_g);
+
+ char *filename;
+ if (fn == NULL || fn[0] == '\0') {
+ filename = NULL;
+ } else {
+ filename = malloc(strlen(fn) + 1);
+ strcpy(filename, fn);
+ }
+
+ // Pass our cache (outdir_g) to parse_url()
+  if (parse_url(URL, outdir_g, filename)) {
+    DEBUG_PRINT("parse_url() failed with error.\n");
+    return; // Parse failed, quit the task directly
+  }
+}