author | Mole Shang <[email protected]> | 2023-07-25 09:27:26 +0800
committer | Mole Shang <[email protected]> | 2023-08-05 23:19:46 +0800
commit | ed8f6df90b0c39835198d5b7af4bbd391362f180 (patch)
tree | 907ba31bac854eb5dc8a2781825e24c049b10580 /src/process_url.c
hinata: initial commit
Diffstat (limited to 'src/process_url.c')
-rw-r--r-- | src/process_url.c | 526
1 file changed, 526 insertions, 0 deletions
diff --git a/src/process_url.c b/src/process_url.c
new file mode 100644
index 0000000..4bfce8d
--- /dev/null
+++ b/src/process_url.c
@@ -0,0 +1,526 @@

#include <curl/curl.h>
#include <curl/easy.h>
#include <curl/header.h>
#include <curl/system.h>
#include <curl/urlapi.h>
#include <limits.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef _WIN32
#include "c11threads.h"
#else
#include <threads.h>
#endif

#include "nuklear.h"

#include "constants.h"
#include "extractors/extractor.h"
#include "logger.h"
#include "process_url.h"

/* NOTICE: the global curl_conf pointer only stays valid during downloading;
 * otherwise, ALWAYS point it to NULL. */
static curl_conf_t *curl_conf;
extern Site_map site_map;
Options options;
static queue_t dl_queue;

thrd_t tid[MAX_THREAD];
mtx_t mtx;
cnd_t cnd;
bool corrupted;
static const char *outdir_g, *referer_g;
static CURLU *h;

/* NOTE: Use logerr(X) (defined as a generic macro) to log errors. */
static bool logerr_b(CURLcode r) {
  if (r && !corrupted) {
    LOG("libcurl", "Error %d: %s\n", r, ERRTOSTRING(r));
    corrupted = true;
  }
  return r;
}

static bool logerr_h(CURLHcode r) {
  if (r) {
    const char *err_str;
    switch (r) {
    case CURLHE_BADINDEX:
      err_str = "header exists but not with this index";
      break;
    case CURLHE_MISSING:
      // Allow missing headers
      err_str = "no such header exists";
      DEBUG_PRINT("Header Error %d: %s\n", r, err_str);
      return r;
    case CURLHE_NOHEADERS:
      err_str = "no headers at all exist (yet)";
      break;
    case CURLHE_NOREQUEST:
      err_str = "no request with this number was used";
      break;
    case CURLHE_OUT_OF_MEMORY:
      err_str = "out of memory while processing";
      break;
    case CURLHE_BAD_ARGUMENT:
      err_str = "a function argument was not okay";
      break;
    case CURLHE_NOT_BUILT_IN:
      err_str = "the API was disabled in the build";
      break;
    default:
      err_str = "unknown error";
      break;
    }
    LOG("libcurl", "Header Error %d: %s\n", r, err_str);
    corrupted = true;
  }
  return r;
}

static bool logerr_u(CURLUcode r) {
  switch (r) {
  case CURLUE_NO_QUERY:
    // Accept URLs without a query part
    DEBUG_PRINT("The URL has no query.\n");
    break;
  case CURLUE_OK:
    break;
  default:
    LOG("libcurl", "Parse Error %d: Invalid URL\n", r);
    break;
  }
  return r;
}

static void curl_easy_setcommonopts(CURL *curl) {
  curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
  curl_easy_setopt(curl, CURLOPT_AUTOREFERER, 1L);
  curl_easy_setopt(
      curl, CURLOPT_USERAGENT,
      "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/116.0");
  curl_easy_setopt(curl, CURLOPT_REFERER, referer_g);
  /* Enable all supported built-in compressions,
   * since several sites enable gzip encoding. */
  curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "");
}

static int progress_callback(void *clientp, curl_off_t dltotal,
                             curl_off_t dlnow, curl_off_t ultotal,
                             curl_off_t ulnow) {
  thrd_info_t *ti = (thrd_info_t *)clientp;
  ti->curl_c->dlnow_per_thrd[ti->no] = dlnow;
  if (ti->curl_c->total_thrd == 1) {
    ti->curl_c->dltotal = dltotal;
  }
  // Returning non-zero aborts the transfer
  return corrupted;
}

static size_t write2str(void *ptr, size_t size, size_t nmemb, str_data_t *s) {
  size_t new_len = s->len + size * nmemb;
  s->string = realloc(s->string, new_len + 1);
  memcpy(s->string + s->len, ptr, size * nmemb);
  s->string[new_len] = '\0';
  s->len = new_len;

  return size * nmemb;
}
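The `logerr(X)` macro referenced in the NOTE is not part of this diff (it presumably lives in process_url.h). A minimal sketch, assuming it is a C11 `_Generic` selection over the three typed helpers above; this is a hypothetical reconstruction, not the committed definition:

```c
/* Hypothetical sketch only — dispatches on the error-code type. */
#define logerr(r) _Generic((r),        \
    CURLcode: logerr_b,                \
    CURLHcode: logerr_h,               \
    CURLUcode: logerr_u)(r)
```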
static int parse_url(const char *URL, const char *outdir, char *fn) {
  CURLUcode ue = logerr(curl_url_set(h, CURLUPART_URL, URL, 0));
  if (ue && ue != CURLUE_NO_QUERY) {
    return 1;
  }
  // NULL-initialize so curl_free() on the cleanup paths is always safe
  char *domain = NULL, *path = NULL, *query = NULL;

  if (ue == CURLUE_NO_QUERY) {
    query = NULL;
  } else {
    // On CURLUE_NO_QUERY, query simply stays NULL
    ue = logerr(curl_url_get(h, CURLUPART_QUERY, &query, 0));
  }
  ue = curl_url_get(h, CURLUPART_HOST, &domain, 0);
  if (ue) {
    curl_free(query);
    return 1;
  }
  ue = logerr(curl_url_get(h, CURLUPART_PATH, &path, 0));
  if (ue) {
    curl_free(domain);
    curl_free(query);
    return 1;
  }

  DEBUG_PRINT("Domain: %s\n", domain);
  DEBUG_PRINT("Path: %s\n", path);
  DEBUG_PRINT("Query: %s\n", query ? query : "(null)");

  for (unsigned short i = 0; i < site_map.size; i++) {
    if (!strcmp(domain, site_map.pairs[i].domain)) {
      append_log("Got site: %s\n", domain);
      thrd_t t;
      options.site = site_map.pairs[i].site;
      options.URL = malloc(strlen(domain) + strlen(path) + 10);
      sprintf(options.URL, "https://%s%s", domain, path);
      options.path = malloc(strlen(path) + 1);
      strcpy(options.path, path);
      if (query) {
        options.query = malloc(strlen(query) + 1);
        strcpy(options.query, query);
      } else {
        options.query = calloc(1, sizeof(char));
      }

      append_log("pagedata URL: %s\n", options.URL);

      thrd_create(&t, extract, &options);
      thrd_detach(t);

      curl_free(domain);
      curl_free(path);
      curl_free(query);
      return 0;
    }
  }

  curl_conf_t *curl_c = malloc(sizeof(curl_conf_t));
  curl_c->URL = malloc(strlen(URL) + 1);
  strcpy(curl_c->URL, URL);

  /* filename */

  if (fn == NULL) {
    const char *patterns_str[1] = {"(?:.+\\/)([^#/?]+)"};
    str_array_t results = create_str_array(0);
    const str_array_t patterns = {(char **)patterns_str, 1};
    curl_c->outfn[0] = '\0'; // Must be initialized before the check below
    regex_match(path, patterns, &results);
    for (unsigned short i = 0; i < results.n; i++) {
      if (results.str[i]) {
        DEBUG_PRINT("[%d] %s\n", i, results.str[i]);
        sprintf(curl_c->outfn, "%s%s%s", outdir,
                outdir[strlen(outdir) - 1] == SPLITTER_CHAR ? "" : SPLITTER_STR,
                results.str[i]);
      }
    }
    free_str_array(&results);
    if (curl_c->outfn[0] == '\0') {
      LOG("libcurl",
          "Inferring the filename failed, please specify a valid filename.\n");
      free(curl_c->URL);
      free(curl_c);
      curl_free(domain);
      curl_free(path);
      curl_free(query);
      return 1;
    }
  } else {
    sprintf(curl_c->outfn, "%s%s%s", outdir,
            outdir[strlen(outdir) - 1] == SPLITTER_CHAR ? "" : SPLITTER_STR,
            fn);
    free_and_nullify(fn);
  }
  DEBUG_PRINT("File will be saved as: %s\n", curl_c->outfn);
  DEBUG_PRINT("Got regular URL: %s\n", curl_c->URL);

  enqueue(&dl_queue, (void *)curl_c);

  curl_free(domain);
  curl_free(path);
  curl_free(query);

  return 0;
}
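For orientation, a self-contained sketch of the CURLU split that parse_url() performs, using only libcurl's public URL API (the URL is hypothetical, not from the project); build with `cc demo.c -lcurl`:

```c
#include <curl/curl.h>
#include <stdio.h>

int main(void) {
  CURLU *u = curl_url();
  curl_url_set(u, CURLUPART_URL, "https://example.com/videos/ep01.mp4?t=42", 0);
  char *host, *path, *query;
  curl_url_get(u, CURLUPART_HOST, &host, 0); /* "example.com"      */
  curl_url_get(u, CURLUPART_PATH, &path, 0); /* "/videos/ep01.mp4" */
  /* CURLUE_NO_QUERY is returned when the URL has no query part */
  if (curl_url_get(u, CURLUPART_QUERY, &query, 0) == CURLUE_OK) {
    printf("query: %s\n", query); /* "t=42" */
    curl_free(query);
  }
  printf("host: %s path: %s\n", host, path);
  curl_free(host);
  curl_free(path);
  curl_url_cleanup(u);
  return 0;
}
```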
"" : SPLITTER_STR, + fn); + free_and_nullify(fn); + } + DEBUG_PRINT("File will be saved as: %s\n", curl_c->outfn); + DEBUG_PRINT("Got regular URL: %s\n", curl_c->URL); + + enqueue(&dl_queue, (void *)curl_c); + + curl_free(domain); + curl_free(path); + curl_free(query); + + return 0; +} + +static bool get_info(const char *URL, long *psize) { + CURL *curl; + long resp_code; + bool support_range = false; + struct curl_header *pch; + curl = curl_easy_init(); + curl_easy_setopt(curl, CURLOPT_URL, URL); + curl_easy_setcommonopts(curl); + curl_easy_setopt(curl, CURLOPT_NOBODY, 1L); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, NULL); + CURLcode r = curl_easy_perform(curl); + if (logerr(r)) { + curl_easy_cleanup(curl); + return support_range; + } + r = curl_easy_getinfo(curl, CURLINFO_CONTENT_LENGTH_DOWNLOAD_T, + (curl_off_t *)psize); + if (logerr(r)) { + curl_easy_cleanup(curl); + return support_range; + } + CURLHcode rh = + curl_easy_header(curl, "Accept-Ranges", 0, CURLH_HEADER, -1, &pch); + if (logerr(rh) || strcmp(pch->value, "bytes")) { + curl_easy_cleanup(curl); + return support_range; + } + char *ct = NULL; + r = curl_easy_getinfo(curl, CURLINFO_CONTENT_TYPE, &ct); + if (logerr(r)) { + curl_easy_cleanup(curl); + return support_range; + } + + support_range = true; + curl_easy_cleanup(curl); + return support_range; +} + +static int pull_part(void *a) { + CURLcode res; + thrd_info_t *ti = (thrd_info_t *)a; + curl_conf_t *curl_c = ti->curl_c; + unsigned char n = ti->no; + // Here we need to manually control str_array_t + curl_c->partfn.str[n] = malloc(strlen(curl_c->outfn) + 4); + sprintf(curl_c->partfn.str[n], "%s.%d", curl_c->outfn, n); + DEBUG_PRINT("[THRD %hhu] partfn: %s, range: %s\n", n, + get_str_element(&curl_c->partfn, n), ti->range); + { + curl_c->fplist[n] = fopen(get_str_element(&curl_c->partfn, n), "wb+"); + CURL *curl; + + curl = curl_easy_init(); + curl_easy_setopt(curl, CURLOPT_URL, curl_c->URL); + curl_easy_setcommonopts(curl); + curl_easy_setopt(curl, CURLOPT_LOW_SPEED_TIME, 60L); + curl_easy_setopt(curl, CURLOPT_LOW_SPEED_LIMIT, 30L); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, curl_c->fplist[n]); + curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L); + if (ti->curl_c->total_thrd != 1) { + curl_easy_setopt(curl, CURLOPT_RANGE, ti->range); + } + curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, progress_callback); + curl_easy_setopt(curl, CURLOPT_XFERINFODATA, ti); + res = curl_easy_perform(curl); + rewind(curl_c->fplist[n]); + append_log("[THRD %hhu] File downloaded.\n", n); + curl_easy_cleanup(curl); + logerr(res); + } + mtx_lock(&mtx); + curl_c->success_thrd += 1; + cnd_signal(&cnd); // Unblocks the waiting cleanup thread. If no threads are + // blocked, does nothing and returns thrd_success. 
static int merge_and_cleanup(curl_conf_t *curl_c) {
  if (corrupted) {
    append_log("Cancelling...\n");
  } else {
    append_log("Merging files...\n");
  }

  FILE *fop;
  fop = fopen(curl_c->outfn, "wb");
  if (fop == NULL) {
    // The user quit before downloading started, return directly
    return 1;
  }
  for (unsigned short i = 0; i < curl_c->total_thrd; i++) {
    if (!corrupted) {
      char buffer[1024];
      size_t bytesRead = 0;
      while ((bytesRead = fread(buffer, 1, sizeof(buffer), curl_c->fplist[i])) >
             0) {
        fwrite(buffer, 1, bytesRead, fop);
      }
    }
    fclose(curl_c->fplist[i]);
    if (remove(get_str_element(&curl_c->partfn, i)) != 0) {
      append_log("Error deleting partial file %s\n",
                 get_str_element(&curl_c->partfn, i));
    }
  }
  fclose(fop);

  if (corrupted) {
    // Also delete the destination file
    if (remove(curl_c->outfn) != 0) {
      append_log("Error deleting file %s\n", curl_c->outfn);
    }
  }
  // Reset state
  corrupted = false;
  curl_c->success_thrd = 0;
  curl_c->total_thrd = 0;
  free_and_nullify(curl_c->URL);

  return 0;
}

static int download(curl_conf_t *curl_c) {
  /* Reset thread info. */
  curl_c->success_thrd = 0;

  curl_off_t cl = 0L, begin = 0L, end;

  static thrd_info_t thrd_info[MAX_THREAD] = {0};

  bool support_range = get_info(curl_c->URL, &cl);
  DEBUG_PRINT("Size: %" CURL_FORMAT_CURL_OFF_T " bytes.\n", cl);
  if (support_range && cl > 0L) {
    curl_c->dltotal = cl;
    // Clamp before narrowing, so a huge size cannot wrap the unsigned char
    curl_off_t n_thrd = CEIL_DIV(cl, MAX_THREAD_SIZE);
    if (n_thrd > MAX_THREAD) {
      n_thrd = MAX_THREAD;
    }
    curl_c->total_thrd = (unsigned char)n_thrd;
    LOG("libcurl", "Server supports the Range header, setting threads to %hhu\n",
        curl_c->total_thrd);
  } else {
    LOG("libcurl", "Server doesn't claim Range header "
                   "support, falling back to a single thread.\n");
    curl_c->total_thrd = 1;
  }
  curl_off_t size_per_thrd = cl / curl_c->total_thrd;

  curl_c->partfn = create_str_array(curl_c->total_thrd);

  for (unsigned char i = 0; i < curl_c->total_thrd; i++) {
    curl_off_t chunk_size;
    thrd_info[i].no = i;
    if (i + 1 == curl_c->total_thrd)
      // The last thread absorbs the division remainder
      chunk_size = cl - (curl_c->total_thrd - 1) * size_per_thrd;
    else
      chunk_size = size_per_thrd;
    end = begin + chunk_size - 1;
    if (curl_c->total_thrd != 1) {
      sprintf(thrd_info[i].range,
              "%" CURL_FORMAT_CURL_OFF_T "-%" CURL_FORMAT_CURL_OFF_T, begin,
              end);
    }
    thrd_info[i].curl_c = curl_c;
    int error = thrd_create(&tid[i], pull_part, &thrd_info[i]);
    if (error != thrd_success)
      append_log("Couldn't run thread number %d, error %d\n", i, error);
    begin = end + 1;
  }
  return 0;
}

void curl_init(curl_conf_t *curl) {
  curl_global_init(CURL_GLOBAL_ALL);
  h = curl_url();
  dl_queue = create_queue();
  mtx_init(&mtx, mtx_plain);
  cnd_init(&cnd);
}

void curl_cleanup(status_t *stat) {
  /* We only need to clean up
   * the currently active thread. */
  if (curl_conf) {
    corrupted = true; // In case libcurl is still downloading
    /* Now wait for all threads to cancel... */
    mtx_lock(&mtx);
    while (curl_conf->success_thrd != curl_conf->total_thrd) {
      cnd_wait(&cnd, &mtx);
    }
    mtx_unlock(&mtx);
    if (!stat->is_done) {
      merge_and_cleanup(curl_conf);
    }
  }
  // mtx/cnd are initialized unconditionally in curl_init(),
  // so destroy them unconditionally as well
  mtx_destroy(&mtx);
  cnd_destroy(&cnd);
  free_queue(&dl_queue);
  curl_url_cleanup(h);
  curl_global_cleanup();
}
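The mtx/cnd pairing above — each pull_part() worker increments success_thrd under the mutex and signals, while curl_cleanup() waits on the predicate in a loop — is the standard C11 condition-variable handshake. A minimal self-contained sketch with illustrative names and a made-up worker count:

```c
#include <stdio.h>
#include <threads.h> /* or "c11threads.h" on Windows, as above */

static mtx_t m;
static cnd_t c;
static int done, total = 4;

static int worker(void *arg) {
  (void)arg; /* ... download one part ... */
  mtx_lock(&m);
  done += 1;
  cnd_signal(&c); /* wake the waiter so it re-checks the predicate */
  mtx_unlock(&m);
  return 0;
}

int main(void) {
  thrd_t t[4];
  mtx_init(&m, mtx_plain);
  cnd_init(&c);
  for (int i = 0; i < total; i++)
    thrd_create(&t[i], worker, NULL);
  mtx_lock(&m);
  while (done != total) /* loop guards against spurious wakeups */
    cnd_wait(&c, &m);
  mtx_unlock(&m);
  for (int i = 0; i < total; i++)
    thrd_join(t[i], NULL);
  puts("all parts finished");
  mtx_destroy(&m);
  cnd_destroy(&c);
  return 0;
}
```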
void poll_status(status_t *stat) {
  if (!is_empty_queue(&dl_queue) && stat->is_done) {
    /* stat->is_done signals that the previous task (including any
     * extractor stage) has finished, so the next URL can be dequeued. */
    curl_conf = (curl_conf_t *)dequeue(&dl_queue);
    if (download(curl_conf)) {
      // Something went wrong when creating the download task
      DEBUG_PRINT("Creating download task failed.\n");
    }
    stat->is_done = false;
  }
  if (curl_conf) {
    curl_conf->dlnow = 0L;
    for (unsigned char i = 0; i < curl_conf->total_thrd; i++) {
      curl_conf->dlnow += curl_conf->dlnow_per_thrd[i];
    }
    stat->cur = curl_conf->dlnow;
    stat->total = curl_conf->dltotal;
    DEBUG_PRINT("success_thrd: %hhu, total_thrd: %hhu, is_done: %s\n",
                curl_conf->success_thrd, curl_conf->total_thrd,
                stat->is_done ? "yes" : "no");
    mtx_lock(&mtx);
    if (curl_conf->success_thrd == curl_conf->total_thrd &&
        (curl_conf->total_thrd && !stat->is_done)) {
      stat->is_done = true;
      for (unsigned short i = 0; i < curl_conf->total_thrd; i++) {
        int r;
        thrd_join(tid[i], &r);
      }
      merge_and_cleanup(curl_conf);
      append_log("Download %s finished.\n", curl_conf->outfn);
      curl_conf = NULL;
    }
    mtx_unlock(&mtx);
  }
}

int get(const char *URL, char **pdstr) {
  CURL *curl = curl_easy_init();
  str_data_t pagedata = {0};
  pagedata.string = malloc(1);
  pagedata.string[0] = '\0';
  curl_easy_setopt(curl, CURLOPT_URL, URL);
  curl_easy_setcommonopts(curl);
  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write2str);
  curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&pagedata);
  CURLcode res = logerr(curl_easy_perform(curl));
  *pdstr = malloc(pagedata.len + 1);
  strcpy(*pdstr, pagedata.string);
  free(pagedata.string); // write2str's growth buffer is no longer needed
  curl_easy_cleanup(curl);
  return res;
}

/* Add a URL to dl_queue.
 * - If outdir is NULL or an empty string, reuse the cached outdir_g
 * - If fn is NULL or an empty string, infer the filename from the URL
 *   (quitting if inference fails)
 * - If referer is NULL or an empty string, use NULL */
void add_url(const char *URL, const char *outdir, const char *fn,
             const char *referer) {
  if (outdir && outdir[0] != '\0') {
    outdir_g = outdir;
  }
  referer_g = referer;
  if (referer && referer[0] == '\0') {
    referer_g = NULL;
  }
  DEBUG_PRINT("referer_g: %s\n", referer_g);

  char *filename;
  if (fn == NULL || fn[0] == '\0') {
    filename = NULL;
  } else {
    filename = malloc(strlen(fn) + 1);
    strcpy(filename, fn);
  }

  // Pass our cache (outdir_g) to parse_url()
  if (parse_url(URL, outdir_g, filename)) {
    DEBUG_PRINT("parse_url() failed with an error.\n");
    return; // Parse failed, quit the task directly
  }
}
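Finally, a hypothetical caller, to show the intended call sequence of this module's public API. The real driver is hinata's Nuklear UI, which is not part of this file; the URL, the output directory, and the initial is_done state here are illustrative assumptions:

```c
#include "process_url.h"

void example_download(void) {
  curl_conf_t conf = {0};
  status_t stat = {0};
  stat.is_done = true; /* assumed start state so poll_status() dequeues */

  curl_init(&conf);
  /* No explicit filename: parse_url() infers "video.mp4" from the path
   * (assuming example.com is not one of the site_map extractor hosts). */
  add_url("https://example.com/files/video.mp4", "/tmp", NULL, NULL);
  do {
    poll_status(&stat); /* in hinata this runs once per UI frame;      */
                        /* stat.cur / stat.total feed the progress bar */
  } while (!stat.is_done);
  curl_cleanup(&stat);
}
```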