/*
 * Patch overview and review notes. This text sits ahead of the first
 * "diff --git" header, so it is not part of any hunk.
 *
 * Purpose: record a per-engine scrape status and show failed engines to the
 * user as warnings on the results page.
 *
 * src/Scraping/Scraping.h
 *   - Adds enum ScrapeStatus (PENDING, OK, EMPTY, FETCH_ERROR, PARSE_MISMATCH,
 *     BLOCKED) and two ScrapeJob fields: long http_status, ScrapeStatus status.
 *
 * src/Scraping/Scraping.c
 *   - classify_job_response() becomes the one place that parses a response
 *     and sets job->status and job->results_count. Checks run in this order:
 *     empty response -> FETCH_ERROR; Startpage captcha markers -> BLOCKED;
 *     htmlReadMemory() returns NULL -> FETCH_ERROR; parser found results -> OK;
 *     http_status >= 400 -> FETCH_ERROR; engine-specific result markers
 *     present -> PARSE_MISMATCH; anything else -> EMPTY.
 *   - check_cache_for_job() classifies the cached body and returns 0 when it
 *     classifies as BLOCKED.
 *   - parse_and_cache_response() calls cache_set() only when the status is OK
 *     or EMPTY.
 *   - process_response() stores CURLINFO_RESPONSE_CODE in job->http_status
 *     before classification; a curl failure sets FETCH_ERROR.
 *   - setup_job() resets results_count, http_status and status, and sets
 *     FETCH_ERROR on each of the -1 returns shown here.
 *   - should_retry() returns 1 when any job is FETCH_ERROR or BLOCKED.
 *   - scrape_engines_parallel() gains an empty statement after the "retry:"
 *     label, which is directly followed by a declaration.
 *   - scrape_engine() initialises the two new ScrapeJob fields.
 *
 * src/Routes/Search.c
 *   - add_warning_to_collection() appends one {engine name, message} row of
 *     two strdup'd strings to the collection and records 2 in inner_counts.
 *     It returns current_count + 1 on success and current_count unchanged on
 *     any allocation failure.
 *   - warning_message_for_job() returns message text for FETCH_ERROR,
 *     PARSE_MISMATCH and BLOCKED, and NULL for every other status.
 *   - results_handler() initialises the new job fields, publishes the rows as
 *     "engine_warnings" through context_set_array_of_arrays(), then frees the
 *     matrix. It also adds free(all_results[i]) loops in two places and, in
 *     the first of them, free_infobox() for successful infoboxes on page 1.
 *
 * static/main.css and templates/results.html
 *   - Adds rules for .engine-warning-list, .engine-warning,
 *     .engine-warning-title and .engine-warning-copy, plus a second
 *     .engine-warning rule with smaller padding.
 *   - The template prints warning[0] and warning[1] for each entry of
 *     engine_warnings when that key exists.
 *
 * NOTE(review):
 *   - This copy of the patch looks damaged. Hunks are collapsed onto long
 *     lines, the #include lines carry no header names, two of the three
 *     needles in response_is_startpage_captcha() are the same string, and the
 *     template hunk contains no markup. Presumably everything between angle
 *     brackets was stripped; restore from the original commit before applying.
 *   - results_handler() frees warning_matrix right after
 *     context_set_array_of_arrays(); this presumably relies on that call
 *     copying its input. Confirm against the template context code.
 *   - response_contains() uses strstr(), while classify_job_response() only
 *     receives a pointer and a length. Confirm that both cached_data and
 *     response.memory are NUL-terminated.
 *   - libxml2 declares the size parameter of htmlReadMemory() as int, and
 *     response_size is size_t. Confirm responses cannot exceed INT_MAX.
 *   - A NULL result from htmlReadMemory() on a non-empty body is reported as
 *     FETCH_ERROR, which should_retry() treats as retryable. Confirm that is
 *     intended.
 */
diff --git a/src/Routes/Search.c b/src/Routes/Search.c index b9851d7..61465f1 100644 --- a/src/Routes/Search.c +++ b/src/Routes/Search.c @@ -155,6 +155,67 @@ static int add_infobox_to_collection(InfoBox *infobox, char ****collection, return current_count + 1; } +static int add_warning_to_collection(const char *engine_name, + const char *warning_message, + char ****collection, int **inner_counts, + int current_count) { + char ***new_collection = + (char ***)malloc(sizeof(char **) * (current_count + 1)); + int *new_inner_counts = + (int *)malloc(sizeof(int) * (current_count + 1)); + + if (!new_collection || !new_inner_counts) { + free(new_collection); + free(new_inner_counts); + return current_count; + } + + if (*collection && current_count > 0) { + memcpy(new_collection, *collection, sizeof(char **) * current_count); + } + if (*inner_counts && current_count > 0) { + memcpy(new_inner_counts, *inner_counts, sizeof(int) * current_count); + } + + free(*collection); + free(*inner_counts); + + *collection = new_collection; + *inner_counts = new_inner_counts; + + (*collection)[current_count] = (char **)malloc(sizeof(char *) * 2); + if (!(*collection)[current_count]) + return current_count; + + (*collection)[current_count][0] = strdup(engine_name ? engine_name : ""); + (*collection)[current_count][1] = + strdup(warning_message ? 
warning_message : ""); + + if (!(*collection)[current_count][0] || !(*collection)[current_count][1]) { + free((*collection)[current_count][0]); + free((*collection)[current_count][1]); + free((*collection)[current_count]); + return current_count; + } + + (*inner_counts)[current_count] = 2; + return current_count + 1; +} + +static const char *warning_message_for_job(const ScrapeJob *job) { + switch (job->status) { + case SCRAPE_STATUS_FETCH_ERROR: + return "request failed before OmniSearch could read search results."; + case SCRAPE_STATUS_PARSE_MISMATCH: + return "returned search results in a format OmniSearch could not parse."; + case SCRAPE_STATUS_BLOCKED: + return "returned a captcha or another blocking page instead of search " + "results."; + default: + return NULL; + } +} + int results_handler(UrlParams *params) { TemplateContext ctx = new_context(); char *raw_query = ""; @@ -224,6 +285,8 @@ int results_handler(UrlParams *params) { jobs[i].response.memory = NULL; jobs[i].response.size = 0; jobs[i].response.capacity = 0; + jobs[i].http_status = 0; + jobs[i].status = SCRAPE_STATUS_PENDING; } scrape_engines_parallel(jobs, ENGINE_COUNT); @@ -260,6 +323,44 @@ int results_handler(UrlParams *params) { free(infobox_inner_counts); } + int warning_count = 0; + for (int i = 0; i < ENGINE_COUNT; i++) { + if (warning_message_for_job(&jobs[i])) + warning_count++; + } + + if (warning_count > 0) { + char ***warning_matrix = NULL; + int *warning_inner_counts = NULL; + int warning_index = 0; + + for (int i = 0; i < ENGINE_COUNT; i++) { + const char *warning_message = warning_message_for_job(&jobs[i]); + if (!warning_message) + continue; + + warning_index = add_warning_to_collection( + jobs[i].engine->name, warning_message, &warning_matrix, + &warning_inner_counts, warning_index); + } + + if (warning_index > 0) { + context_set_array_of_arrays(&ctx, "engine_warnings", warning_matrix, + warning_index, warning_inner_counts); + } + + if (warning_matrix) { + for (int i = 0; i < 
warning_index; i++) { + free(warning_matrix[i][0]); + free(warning_matrix[i][1]); + free(warning_matrix[i]); + } + free(warning_matrix); + } + if (warning_inner_counts) + free(warning_inner_counts); + } + int total_results = 0; for (int i = 0; i < ENGINE_COUNT; i++) { total_results += jobs[i].results_count; @@ -281,6 +382,15 @@ int results_handler(UrlParams *params) { send_response(html); free(html); } + for (int i = 0; i < ENGINE_COUNT; i++) + free(all_results[i]); + if (page == 1) { + for (int i = 0; i < HANDLER_COUNT; i++) { + if (infobox_data[i].success) { + free_infobox(&infobox_data[i].result); + } + } + } free_context(&ctx); return 0; } @@ -368,6 +478,10 @@ int results_handler(UrlParams *params) { send_response(html); free(html); } + + for (int i = 0; i < ENGINE_COUNT; i++) { + free(all_results[i]); + } } if (page == 1) { diff --git a/src/Scraping/Scraping.c b/src/Scraping/Scraping.c index 0709de4..ff8dec8 100644 --- a/src/Scraping/Scraping.c +++ b/src/Scraping/Scraping.c @@ -6,8 +6,98 @@ #include #include #include +#include #include +static int response_contains(const char *response, const char *needle) { + return response && needle && strstr(response, needle) != NULL; +} + +static int is_startpage_job(const ScrapeJob *job) { + return job && job->engine && strcmp(job->engine->name, "Startpage") == 0; +} + +static int response_is_startpage_captcha(const ScrapeJob *job, + const char *response) { + if (!is_startpage_job(job)) + return 0; + + return response_contains(response, "Startpage Captcha") || + response_contains(response, "Startpage Captcha") || + response_contains(response, "/static-pages-assets/page-data/captcha/"); +} + +static int response_looks_like_results_page(const ScrapeJob *job, + const char *response) { + if (!job || !job->engine || !response) + return 0; + + if (strcmp(job->engine->name, "DuckDuckGo Lite") == 0) { + return response_contains(response, "result-link") || + response_contains(response, "result-snippet"); + } + + if 
(strcmp(job->engine->name, "Startpage") == 0) { + return response_contains(response, "Startpage Search Results") || + response_contains(response, "class=\"w-gl") || + response_contains(response, "data-testid=\"gl-title-link\""); + } + + if (strcmp(job->engine->name, "Yahoo") == 0) { + return response_contains(response, "algo-sr") || + response_contains(response, "compTitle") || + response_contains(response, "compText"); + } + + return 0; +} + +static void classify_job_response(ScrapeJob *job, const char *response, + size_t response_size) { + job->results_count = 0; + + if (!response || response_size == 0) { + job->status = SCRAPE_STATUS_FETCH_ERROR; + return; + } + + if (response_is_startpage_captcha(job, response)) { + job->status = SCRAPE_STATUS_BLOCKED; + return; + } + + xmlDocPtr doc = htmlReadMemory(response, response_size, NULL, NULL, + HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | + HTML_PARSE_NOWARNING); + + if (!doc) { + job->status = SCRAPE_STATUS_FETCH_ERROR; + return; + } + + job->results_count = + job->engine->parser(job->engine->name, doc, job->out_results, + job->max_results); + xmlFreeDoc(doc); + + if (job->results_count > 0) { + job->status = SCRAPE_STATUS_OK; + return; + } + + if (job->http_status >= 400) { + job->status = SCRAPE_STATUS_FETCH_ERROR; + return; + } + + if (response_looks_like_results_page(job, response)) { + job->status = SCRAPE_STATUS_PARSE_MISMATCH; + return; + } + + job->status = SCRAPE_STATUS_EMPTY; +} + int check_cache_for_job(ScrapeJob *job) { if (get_cache_ttl_search() <= 0) return 0; @@ -22,14 +112,14 @@ int check_cache_for_job(ScrapeJob *job) { if (cache_get(key, (time_t)get_cache_ttl_search(), &cached_data, &cached_size) == 0 && cached_data && cached_size > 0) { - xmlDocPtr doc = htmlReadMemory(cached_data, cached_size, NULL, NULL, - HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | - HTML_PARSE_NOWARNING); - if (doc) { - job->results_count = job->engine->parser( - job->engine->name, doc, job->out_results, job->max_results); - 
xmlFreeDoc(doc); + classify_job_response(job, cached_data, cached_size); + + if (job->status == SCRAPE_STATUS_BLOCKED) { + free(cached_data); + free(key); + return 0; } + free(cached_data); free(key); @@ -46,24 +136,17 @@ int check_cache_for_job(ScrapeJob *job) { void parse_and_cache_response(ScrapeJob *job) { if (job->response.size == 0) { job->results_count = 0; + job->status = SCRAPE_STATUS_FETCH_ERROR; return; } - char *key = cache_compute_key(job->query, job->page, job->engine->name); - if (key && get_cache_ttl_search() > 0) - cache_set(key, job->response.memory, job->response.size); - free(key); + classify_job_response(job, job->response.memory, job->response.size); - xmlDocPtr doc = htmlReadMemory( - job->response.memory, job->response.size, NULL, NULL, - HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING); - - if (doc) { - job->results_count = job->engine->parser( - job->engine->name, doc, job->out_results, job->max_results); - xmlFreeDoc(doc); - } else { - job->results_count = 0; + if (job->status == SCRAPE_STATUS_OK || job->status == SCRAPE_STATUS_EMPTY) { + char *key = cache_compute_key(job->query, job->page, job->engine->name); + if (key && get_cache_ttl_search() > 0) + cache_set(key, job->response.memory, job->response.size); + free(key); } } @@ -78,10 +161,14 @@ void cleanup_job_handle(ScrapeJob *job, CURL *handle) { } void process_response(ScrapeJob *job, CURL *handle, CURLMsg *msg) { + curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &job->http_status); + if (msg->data.result == CURLE_OK) parse_and_cache_response(job); - else + else { job->results_count = 0; + job->status = SCRAPE_STATUS_FETCH_ERROR; + } cleanup_job_handle(job, handle); } @@ -92,14 +179,20 @@ int setup_job(ScrapeJob *job, CURLM *multi_handle) { if (job->response.memory) free(job->response.memory); + job->results_count = 0; + job->http_status = 0; + job->status = SCRAPE_STATUS_PENDING; + if (check_cache_for_job(job)) { job->results_count = job->results_count > 0 ? 
job->results_count : 0; return 0; } char *encoded_query = curl_easy_escape(NULL, job->query, 0); - if (!encoded_query) + if (!encoded_query) { + job->status = SCRAPE_STATUS_FETCH_ERROR; return -1; + } char *full_url = build_search_url(job->engine->base_url, job->engine->page_param, @@ -107,12 +200,15 @@ int setup_job(ScrapeJob *job, CURLM *multi_handle) { encoded_query, job->page); free(encoded_query); - if (!full_url) + if (!full_url) { + job->status = SCRAPE_STATUS_FETCH_ERROR; return -1; + } job->handle = curl_easy_init(); if (!job->handle) { free(full_url); + job->status = SCRAPE_STATUS_FETCH_ERROR; return -1; } @@ -160,7 +256,8 @@ int should_retry(ScrapeJob *jobs, int num_jobs) { return 0; for (int i = 0; i < num_jobs; i++) { - if (jobs[i].results_count == 0 && jobs[i].response.size == 0) + if (jobs[i].status == SCRAPE_STATUS_FETCH_ERROR || + jobs[i].status == SCRAPE_STATUS_BLOCKED) return 1; } return 0; @@ -170,6 +267,7 @@ int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) { int retries = 0; retry: + ; CURLM *multi_handle = curl_multi_init(); if (!multi_handle) return -1; @@ -213,7 +311,9 @@ int scrape_engine(const SearchEngine *engine, const char *query, .out_results = out_results, .max_results = max_results, .results_count = 0, - .page = 1}; + .page = 1, + .http_status = 0, + .status = SCRAPE_STATUS_PENDING}; scrape_engines_parallel(&job, 1); return job.results_count; diff --git a/src/Scraping/Scraping.h b/src/Scraping/Scraping.h index 1439118..014285f 100644 --- a/src/Scraping/Scraping.h +++ b/src/Scraping/Scraping.h @@ -32,6 +32,15 @@ typedef struct { size_t capacity; } MemoryBuffer; +typedef enum { + SCRAPE_STATUS_PENDING, + SCRAPE_STATUS_OK, + SCRAPE_STATUS_EMPTY, + SCRAPE_STATUS_FETCH_ERROR, + SCRAPE_STATUS_PARSE_MISMATCH, + SCRAPE_STATUS_BLOCKED, +} ScrapeStatus; + typedef struct { const SearchEngine *engine; char *query; @@ -41,6 +50,8 @@ typedef struct { CURL *handle; MemoryBuffer response; int results_count; + long http_status; + 
ScrapeStatus status; } ScrapeJob; extern const SearchEngine ENGINE_REGISTRY[]; diff --git a/static/main.css b/static/main.css index a458541..3c71c92 100644 --- a/static/main.css +++ b/static/main.css @@ -275,6 +275,30 @@ h1 span { .results-container { grid-column:2; } +.engine-warning-list { + display:flex; + flex-direction:column; + gap:12px; + margin-bottom:24px; +} +.engine-warning { + background:var(--bg-card); + border:1px solid var(--border); + border-left:4px solid var(--accent); + border-radius:12px; + padding:14px 16px; +} +.engine-warning-title { + display:block; + font-size:0.95rem; + font-weight:700; + margin-bottom:4px; +} +.engine-warning-copy { + color:var(--text-muted); + line-height:1.5; + margin:0; +} .result { margin-bottom:32px; } @@ -454,6 +478,9 @@ h1 span { .result { margin-bottom:24px; } + .engine-warning { + padding:12px 14px; + } .result a { font-size:1.1rem; word-break:break-word; diff --git a/templates/results.html b/templates/results.html index 1bda9e4..851ea1f 100644 --- a/templates/results.html +++ b/templates/results.html @@ -38,6 +38,21 @@
+ {{if exists engine_warnings}} +
+ {{for warning in engine_warnings}} +
+ + {{warning[0]}} + 

+ {{warning[1]}} +

+
+ {{endfor}} +
+ {{endif}} + {{for result in results}}