path: root/src/process_url.c
author     Mole Shang <[email protected]>  2023-07-25 09:27:26 +0800
committer  Mole Shang <[email protected]>  2023-08-05 23:19:46 +0800
commit     ed8f6df90b0c39835198d5b7af4bbd391362f180 (patch)
tree       907ba31bac854eb5dc8a2781825e24c049b10580 /src/process_url.c
download   hinata-ed8f6df90b0c39835198d5b7af4bbd391362f180.tar.gz
           hinata-ed8f6df90b0c39835198d5b7af4bbd391362f180.tar.bz2
           hinata-ed8f6df90b0c39835198d5b7af4bbd391362f180.zip
hinata: initial commit
Diffstat (limited to 'src/process_url.c')
-rw-r--r--  src/process_url.c  526
1 file changed, 526 insertions, 0 deletions
diff --git a/src/process_url.c b/src/process_url.c
new file mode 100644
index 0000000..4bfce8d
--- /dev/null
+++ b/src/process_url.c
@@ -0,0 +1,526 @@
+#include <curl/curl.h>
+#include <curl/easy.h>
+#include <curl/header.h>
+#include <curl/system.h>
+#include <curl/urlapi.h>
+#include <limits.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#ifdef _WIN32
+#include "c11threads.h"
+#else
+#include <threads.h>
+#endif
+
+#include "nuklear.h"
+
+#include "constants.h"
+#include "extractors/extractor.h"
+#include "logger.h"
+#include "process_url.h"
+
+/* NOTICE: the global curl_conf pointer is only valid while a download is in
+ * progress; at all other times it MUST be reset to NULL. */
+static curl_conf_t *curl_conf;
+extern Site_map site_map;
+Options options;
+static queue_t dl_queue;
+
+thrd_t tid[MAX_THREAD];
+mtx_t mtx;
+cnd_t cnd;
+bool corrupted;
+static const char *outdir_g, *referer_g;
+static CURLU *h;
+
+/* NOTE: Use logerr(X) (defined as a generic macro) to log errors. */
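+/* Since logerr_b, logerr_h and logerr_u below each take a different libcurl
+ * error type, a C11 _Generic selection can dispatch on the argument type.
+ * The real definition lives in logger.h; this sketch is an assumption of
+ * its shape, not a copy of it (it also assumes the three enum types are
+ * distinct to the compiler):
+ *
+ *   #define logerr(r) _Generic((r),            \
+ *       CURLcode:  logerr_b,                   \
+ *       CURLHcode: logerr_h,                   \
+ *       CURLUcode: logerr_u)(r)
+ */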
+static bool logerr_b(CURLcode r) {
+ if (r && !corrupted) {
+ LOG("libcurl", "Error %d: %s\n", r, ERRTOSTRING(r));
+ corrupted = true;
+ }
+ return r;
+}
+
+static bool logerr_h(CURLHcode r) {
+ if (r) {
+ const char *err_str;
+ switch (r) {
+ case CURLHE_BADINDEX:
+ err_str = "header exists but not with this index";
+ break;
+  case CURLHE_MISSING:
+    // A missing header is tolerated: log at debug level only
+    err_str = "no such header exists";
+    DEBUG_PRINT("Header Error %d: %s\n", r, err_str);
+    return r;
+ case CURLHE_NOHEADERS:
+ err_str = "no headers at all exist (yet)";
+ break;
+ case CURLHE_NOREQUEST:
+ err_str = "no request with this number was used";
+ break;
+ case CURLHE_OUT_OF_MEMORY:
+ err_str = "out of memory while processing";
+ break;
+ case CURLHE_BAD_ARGUMENT:
+ err_str = "a function argument was not okay";
+ break;
+  case CURLHE_NOT_BUILT_IN:
+    err_str = "the header API was disabled in the build";
+    break;
+ default:
+ err_str = "unknown error";
+ break;
+ }
+ LOG("libcurl", "Header Error %d: %s\n", r, err_str);
+ corrupted = true;
+ }
+ return r;
+}
+
+static bool logerr_u(CURLUcode r) {
+ switch (r) {
+ case CURLUE_NO_QUERY:
+ // Accept no query
+ DEBUG_PRINT("The URL has no query.\n");
+ break;
+ case 0:
+ break;
+ default:
+ LOG("libcurl", "Parse Error %d: Invalid URL\n", r);
+ break;
+ }
+ return r;
+}
+
+static void curl_easy_setcommonopts(CURL *curl) {
+ curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
+ curl_easy_setopt(curl, CURLOPT_AUTOREFERER, 1L);
+ curl_easy_setopt(
+ curl, CURLOPT_USERAGENT,
+ "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/116.0");
+ curl_easy_setopt(curl, CURLOPT_REFERER, referer_g);
+  /* enable all supported built-in compressions,
+   * since several sites enable gzip encoding */
+ curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "");
+}
+
+static int progress_callback(void *clientp, curl_off_t dltotal,
+ curl_off_t dlnow, curl_off_t ultotal,
+ curl_off_t ulnow) {
+ thrd_info_t *ti = (thrd_info_t *)clientp;
+ ti->curl_c->dlnow_per_thrd[ti->no] = dlnow;
+ if (ti->curl_c->total_thrd == 1) {
+ ti->curl_c->dltotal = dltotal;
+ }
+ // Return non-zero to abort download
+ return corrupted;
+}
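+/* Note on the callback above: libcurl only invokes it because pull_part()
+ * clears CURLOPT_NOPROGRESS, and a non-zero return makes the transfer fail
+ * with CURLE_ABORTED_BY_CALLBACK, which is how a corrupted/cancelled state
+ * tears down every worker thread. The per-thread byte counts recorded here
+ * are summed later in poll_status(). */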
+
+static size_t write2str(void *ptr, size_t size, size_t nmemb, str_data_t *s) {
+  size_t new_len = s->len + size * nmemb;
+  char *grown = realloc(s->string, new_len + 1);
+  if (grown == NULL) {
+    // A short return count tells libcurl to fail the transfer
+    // (CURLE_WRITE_ERROR) instead of writing through a stale buffer.
+    return 0;
+  }
+  s->string = grown;
+  memcpy(s->string + s->len, ptr, size * nmemb);
+  s->string[new_len] = '\0';
+  s->len = new_len;
+
+  return size * nmemb;
+}
+
+static int parse_url(const char *URL, const char *outdir, char *fn) {
+ CURLUcode ue = logerr(curl_url_set(h, CURLUPART_URL, URL, 0));
+ if (ue && ue != CURLUE_NO_QUERY) {
+ return 1;
+ }
+  char *domain = NULL, *path = NULL, *query = NULL;
+
+  if (ue == CURLUE_NO_QUERY) {
+    query = NULL;
+  } else {
+    ue = logerr(curl_url_get(h, CURLUPART_QUERY, &query, 0));
+  }
+  ue = logerr(curl_url_get(h, CURLUPART_HOST, &domain, 0));
+  if (ue) {
+    curl_free(query); // avoid leaking an already-fetched query part
+    return 1;
+  }
+  ue = logerr(curl_url_get(h, CURLUPART_PATH, &path, 0));
+  if (ue) {
+    curl_free(domain);
+    curl_free(query);
+    return 1;
+  }
+
+ DEBUG_PRINT("Domain: %s\n", domain);
+ DEBUG_PRINT("Path: %s\n", path);
+  DEBUG_PRINT("Query: %s\n", query ? query : "(none)");
+
+ for (unsigned short i = 0; i < site_map.size; i++) {
+ if (!strcmp(domain, site_map.pairs[i].domain)) {
+ append_log("Got site: %s\n", domain);
+ thrd_t t;
+ options.site = site_map.pairs[i].site;
+ options.URL = malloc(strlen(domain) + strlen(path) + 10);
+ sprintf(options.URL, "https://%s%s", domain, path);
+ options.path = malloc(strlen(path) + 1);
+ strcpy(options.path, path);
+ if (query) {
+ options.query = malloc(strlen(query) + 1);
+ strcpy(options.query, query);
+ } else {
+ options.query = calloc(1, sizeof(char));
+ }
+
+ append_log("pagedata URL: %s\n", options.URL);
+
+ thrd_create(&t, extract, &options);
+ thrd_detach(t);
+
+      curl_free(domain);
+      curl_free(path);
+      curl_free(query);
+      free_and_nullify(fn); // a caller-provided filename is unused on this path
+      return 0;
+    }
+ }
+
+  curl_conf_t *curl_c = malloc(sizeof(curl_conf_t));
+  curl_c->outfn[0] = '\0'; // so the infer-failure check below sees an empty string
+  curl_c->URL = malloc(strlen(URL) + 1);
+  strcpy(curl_c->URL, URL);
+
+ /* filename */
+
+ if (fn == NULL) {
+ const char *patterns_str[1] = {"(?:.+\\/)([^#/?]+)"};
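+    /* Illustrative example (not from the code): for a path such as
+     * "/videos/clip.mp4", the non-capturing group eats "/videos/" and the
+     * capture group yields "clip.mp4"; '#', '/' and '?' are excluded so
+     * fragments and query strings never leak into the filename. */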
+ str_array_t results = create_str_array(0);
+ const str_array_t patterns = {(char **)patterns_str, 1};
+ regex_match(path, patterns, &results);
+ for (unsigned short i = 0; i < results.n; i++) {
+ if (results.str[i]) {
+ DEBUG_PRINT("[%d] %s\n", i, results.str[i]);
+ sprintf(curl_c->outfn, "%s%s%s", outdir,
+ outdir[strlen(outdir) - 1] == SPLITTER_CHAR ? "" : SPLITTER_STR,
+ results.str[i]);
+ }
+ }
+ free_str_array(&results);
+    if (curl_c->outfn[0] == '\0') {
+      LOG("libcurl",
+          "Infer filename failed, please specify a valid filename.\n");
+      free_and_nullify(curl_c->URL);
+      free(curl_c);
+      curl_free(domain);
+      curl_free(path);
+      curl_free(query);
+      return 1;
+    }
+ } else {
+ sprintf(curl_c->outfn, "%s%s%s", outdir,
+ outdir[strlen(outdir) - 1] == SPLITTER_CHAR ? "" : SPLITTER_STR,
+ fn);
+ free_and_nullify(fn);
+ }
+ DEBUG_PRINT("File will be saved as: %s\n", curl_c->outfn);
+ DEBUG_PRINT("Got regular URL: %s\n", curl_c->URL);
+
+ enqueue(&dl_queue, (void *)curl_c);
+
+ curl_free(domain);
+ curl_free(path);
+ curl_free(query);
+
+ return 0;
+}
+
+static bool get_info(const char *URL, curl_off_t *psize) {
+ CURL *curl;
+ long resp_code;
+ bool support_range = false;
+ struct curl_header *pch;
+ curl = curl_easy_init();
+ curl_easy_setopt(curl, CURLOPT_URL, URL);
+ curl_easy_setcommonopts(curl);
+ curl_easy_setopt(curl, CURLOPT_NOBODY, 1L);
+ curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, NULL);
+ CURLcode r = curl_easy_perform(curl);
+ if (logerr(r)) {
+ curl_easy_cleanup(curl);
+ return support_range;
+ }
+  r = curl_easy_getinfo(curl, CURLINFO_CONTENT_LENGTH_DOWNLOAD_T, psize);
+ if (logerr(r)) {
+ curl_easy_cleanup(curl);
+ return support_range;
+ }
+ CURLHcode rh =
+ curl_easy_header(curl, "Accept-Ranges", 0, CURLH_HEADER, -1, &pch);
+ if (logerr(rh) || strcmp(pch->value, "bytes")) {
+ curl_easy_cleanup(curl);
+ return support_range;
+ }
+ char *ct = NULL;
+ r = curl_easy_getinfo(curl, CURLINFO_CONTENT_TYPE, &ct);
+ if (logerr(r)) {
+ curl_easy_cleanup(curl);
+ return support_range;
+ }
+
+ support_range = true;
+ curl_easy_cleanup(curl);
+ return support_range;
+}
+
+static int pull_part(void *a) {
+ CURLcode res;
+ thrd_info_t *ti = (thrd_info_t *)a;
+ curl_conf_t *curl_c = ti->curl_c;
+ unsigned char n = ti->no;
+  // Here we need to manually control str_array_t;
+  // reserve room for '.', up to 3 digits of n, and the NUL terminator
+  curl_c->partfn.str[n] = malloc(strlen(curl_c->outfn) + 6);
+  sprintf(curl_c->partfn.str[n], "%s.%d", curl_c->outfn, n);
+ DEBUG_PRINT("[THRD %hhu] partfn: %s, range: %s\n", n,
+ get_str_element(&curl_c->partfn, n), ti->range);
+  {
+    curl_c->fplist[n] = fopen(get_str_element(&curl_c->partfn, n), "wb+");
+    if (curl_c->fplist[n] == NULL) {
+      LOG("libcurl", "Cannot open %s for writing.\n",
+          get_str_element(&curl_c->partfn, n));
+      corrupted = true;
+      res = CURLE_WRITE_ERROR;
+    } else {
+      CURL *curl = curl_easy_init();
+      curl_easy_setopt(curl, CURLOPT_URL, curl_c->URL);
+      curl_easy_setcommonopts(curl);
+      /* give up on transfers that stay below 30 bytes/s for 60 seconds */
+      curl_easy_setopt(curl, CURLOPT_LOW_SPEED_TIME, 60L);
+      curl_easy_setopt(curl, CURLOPT_LOW_SPEED_LIMIT, 30L);
+      curl_easy_setopt(curl, CURLOPT_WRITEDATA, curl_c->fplist[n]);
+      curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
+      if (ti->curl_c->total_thrd != 1) {
+        curl_easy_setopt(curl, CURLOPT_RANGE, ti->range);
+      }
+      curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, progress_callback);
+      curl_easy_setopt(curl, CURLOPT_XFERINFODATA, ti);
+      res = curl_easy_perform(curl);
+      rewind(curl_c->fplist[n]);
+      append_log("[THRD %hhu] File downloaded.\n", n);
+      curl_easy_cleanup(curl);
+      logerr(res);
+    }
+  }
+ mtx_lock(&mtx);
+ curl_c->success_thrd += 1;
+ cnd_signal(&cnd); // Unblocks the waiting cleanup thread. If no threads are
+ // blocked, does nothing and returns thrd_success.
+
+ mtx_unlock(&mtx);
+ return (int)res;
+}
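+/* Shutdown handshake (as wired above): each worker increments success_thrd
+ * under mtx and signals cnd; curl_cleanup() and poll_status() compare
+ * success_thrd against total_thrd to learn when every part is finished. */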
+
+static int merge_and_cleanup(curl_conf_t *curl_c) {
+ if (corrupted) {
+ append_log("Cancelling...\n");
+ } else {
+ append_log("Merging files...\n");
+ }
+
+  FILE *fop;
+  fop = fopen(curl_c->outfn, "wb");
+  if (fop == NULL) {
+    // User quit before the download started, so there is nothing to merge
+    return 1;
+  }
+ for (unsigned short i = 0; i < curl_c->total_thrd; i++) {
+ if (!corrupted) {
+ char buffer[1024];
+ size_t bytesRead = 0;
+ while ((bytesRead = fread(buffer, 1, sizeof(buffer), curl_c->fplist[i])) >
+ 0) {
+ fwrite(buffer, 1, bytesRead, fop);
+ }
+ }
+    if (curl_c->fplist[i]) {
+      fclose(curl_c->fplist[i]);
+    }
+ if (remove(get_str_element(&curl_c->partfn, i)) != 0) {
+ append_log("Error deleting partial file %s\n",
+ get_str_element(&curl_c->partfn, i));
+ }
+ }
+ fclose(fop);
+
+ if (corrupted) {
+ // Also delete dst file
+ if (remove(curl_c->outfn) != 0) {
+ append_log("Error deleting file %s\n", curl_c->outfn);
+ }
+ }
+ // Reset stat
+ corrupted = false;
+ curl_c->success_thrd = 0;
+ curl_c->total_thrd = 0;
+ free_and_nullify(curl_c->URL);
+
+ return 0;
+}
+
+static int download(curl_conf_t *curl_c) {
+  /* Reset thrd info. */
+  curl_c->success_thrd = 0;
+
+ CURL *curl;
+ curl_off_t cl = 0L, begin = 0L, end;
+
+ static thrd_info_t thrd_info[MAX_THREAD] = {0};
+
+ bool support_range = get_info(curl_c->URL, &cl);
+  DEBUG_PRINT("Size: %" CURL_FORMAT_CURL_OFF_T " bytes.\n", cl);
+ if (support_range && cl > 0L) {
+ curl_c->dltotal = cl;
+ curl_c->total_thrd = (unsigned char)CEIL_DIV(cl, MAX_THREAD_SIZE);
+ if (curl_c->total_thrd > MAX_THREAD) {
+ curl_c->total_thrd = MAX_THREAD;
+ }
+ LOG("libcurl", "Server supports range header, setting threads to %hhu\n",
+ curl_c->total_thrd);
+ } else {
+ LOG("libcurl", "Server doesn't claim range header "
+ "support, falling back to single thread.\n");
+ curl_c->total_thrd = 1;
+ }
+ curl_off_t size_per_thrd = (cl / curl_c->total_thrd);
+
+ curl_c->partfn = create_str_array(curl_c->total_thrd);
+
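+  /* Worked example (illustrative numbers): cl = 100 bytes and
+   * total_thrd = 3 give size_per_thrd = 33, so the loop below requests
+   * ranges "0-32" and "33-65", and the last thread takes the remainder
+   * with "66-99". */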
+ for (unsigned char i = 0; i < curl_c->total_thrd; i++) {
+ curl_off_t chunk_size;
+ thrd_info[i].no = i;
+ if (i + 1 == curl_c->total_thrd)
+ chunk_size = cl - (curl_c->total_thrd - 1) * size_per_thrd;
+ else
+ chunk_size = size_per_thrd;
+ end = begin + chunk_size - 1;
+ if (curl_c->total_thrd != 1) {
+ sprintf(thrd_info[i].range,
+ "%" CURL_FORMAT_CURL_OFF_T "-%" CURL_FORMAT_CURL_OFF_T, begin,
+ end);
+ }
+ thrd_info[i].curl_c = curl_c;
+ int error = thrd_create(&tid[i], pull_part, &thrd_info[i]);
+    if (error)
+      append_log("Couldn't create thread %d, error %d\n", i, error);
+ begin = end + 1;
+ }
+ return 0;
+}
+
+void curl_init(curl_conf_t *curl) {
+  (void)curl; /* currently unused */
+ curl_global_init(CURL_GLOBAL_ALL);
+ h = curl_url();
+ dl_queue = create_queue();
+ mtx_init(&mtx, mtx_plain);
+ cnd_init(&cnd);
+}
+
+void curl_cleanup(status_t *stat) {
+  /* We only need to wait on
+   * the currently active download threads. */
+  if (curl_conf) {
+
+    corrupted = true; // In case libcurl is still downloading
+    /* Now wait for all threads to cancel... */
+    mtx_lock(&mtx);
+    while (curl_conf->success_thrd != curl_conf->total_thrd) {
+      cnd_wait(&cnd, &mtx);
+    }
+    mtx_unlock(&mtx);
+    if (!stat->is_done) {
+      merge_and_cleanup(curl_conf);
+    }
+  }
+  /* mtx and cnd are initialized unconditionally in curl_init(),
+   * so destroy them unconditionally as well. */
+  mtx_destroy(&mtx);
+  cnd_destroy(&cnd);
+ free_queue(&dl_queue);
+ curl_url_cleanup(h);
+ curl_global_cleanup();
+}
+
+void poll_status(status_t *stat) {
+ if (!is_empty_queue(&dl_queue) && stat->is_done) {
+    /* stat->is_done signals that the previous task
+     * (extractor or download) has finished. */
+    curl_conf = (curl_conf_t *)dequeue(&dl_queue);
+    if (download(curl_conf)) {
+      // Something went wrong when creating the download task
+      DEBUG_PRINT("Creating download task failed.\n");
+    }
+ stat->is_done = false;
+ }
+ if (curl_conf) {
+ curl_conf->dlnow = 0L;
+ for (unsigned char i = 0; i < curl_conf->total_thrd; i++) {
+ curl_conf->dlnow += curl_conf->dlnow_per_thrd[i];
+ }
+ stat->cur = curl_conf->dlnow;
+ stat->total = curl_conf->dltotal;
+ DEBUG_PRINT("success_thrd: %hhu, total_thrd: %hhu, is_done: %s\n",
+ curl_conf->success_thrd, curl_conf->total_thrd,
+ stat->is_done ? "yes" : "no");
+ mtx_lock(&mtx);
+ if (curl_conf->success_thrd == curl_conf->total_thrd &&
+ (curl_conf->total_thrd && !stat->is_done)) {
+ stat->is_done = true;
+ for (unsigned short i = 0; i < curl_conf->total_thrd; i++) {
+ int r;
+ thrd_join(tid[i], &r);
+ }
+ merge_and_cleanup(curl_conf);
+ append_log("Download %s finished.\n", curl_conf->outfn);
+ curl_conf = NULL;
+ }
+ mtx_unlock(&mtx);
+ }
+}
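+/* poll_status() is non-blocking by design: it dequeues at most one pending
+ * download, then refreshes stat with the aggregated per-thread progress.
+ * (Assumption: it is called repeatedly, e.g. once per UI frame.) */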
+
+int get(const char *URL, char **pdstr) {
+ CURL *curl = curl_easy_init();
+ str_data_t pagedata = {0};
+ pagedata.string = malloc(1);
+ pagedata.string[0] = '\0';
+ curl_easy_setopt(curl, CURLOPT_URL, URL);
+ curl_easy_setcommonopts(curl);
+ curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write2str);
+ curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&pagedata);
+ CURLcode res = logerr(curl_easy_perform(curl));
+  *pdstr = malloc(pagedata.len + 1);
+  strcpy(*pdstr, pagedata.string);
+  free(pagedata.string); // the page buffer is no longer needed once copied out
+  curl_easy_cleanup(curl);
+ return res;
+}
+
+/* Add a URL to dl_queue.
+ * - If outdir is NULL or an empty string, reuse the cached outdir_g
+ * - If fn is NULL or an empty string, infer the filename from the URL
+ *   (on failure, log an error and abort the task)
+ * - If referer is NULL or an empty string, use NULL */
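+/* A hypothetical call (URL and paths are illustrative only):
+ *
+ *   add_url("https://example.com/files/video.mp4", "/tmp/downloads",
+ *           NULL, NULL);
+ *
+ * would cache "/tmp/downloads" in outdir_g, clear the referer, and let
+ * parse_url() infer "video.mp4" as the output filename. */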
+void add_url(const char *URL, const char *outdir, const char *fn,
+ const char *referer) {
+ if (outdir && outdir[0] != '\0') {
+ outdir_g = outdir;
+ }
+ referer_g = referer;
+ if (referer && referer[0] == '\0') {
+ referer_g = NULL;
+ }
+ DEBUG_PRINT("referer_g: %s\n", referer_g);
+
+ char *filename;
+ if (fn == NULL || fn[0] == '\0') {
+ filename = NULL;
+ } else {
+ filename = malloc(strlen(fn) + 1);
+ strcpy(filename, fn);
+ }
+
+ // Pass our cache (outdir_g) to parse_url()
+  if (parse_url(URL, outdir_g, filename)) {
+    DEBUG_PRINT("parse_url() failed with error.\n");
+    return; // Parse failed, quit the task directly
+  }
+}