Surface blocked search engine responses

This commit is contained in:
Else
2026-03-18 15:13:18 +01:00
committed by frosty
parent efb9f737fa
commit 44b6a9b760
5 changed files with 293 additions and 26 deletions

View File

@@ -155,6 +155,67 @@ static int add_infobox_to_collection(InfoBox *infobox, char ****collection,
return current_count + 1; return current_count + 1;
} }
/* Duplicate s (or "" when s is NULL) into a malloc'd buffer.
 * Portable strdup replacement; returns NULL on allocation failure. */
static char *dup_string_or_empty(const char *s) {
  const char *src = s ? s : "";
  size_t len = strlen(src) + 1;
  char *copy = (char *)malloc(len);
  if (copy)
    memcpy(copy, src, len);
  return copy;
}
/*
 * Append one (engine name, warning message) row to the warning matrix
 * handed to the template layer.
 *
 * *collection is an array of 2-element string rows and *inner_counts the
 * per-row element counts; both are grown by one slot.  Returns the new
 * row count on success, or current_count unchanged on any allocation
 * failure, in which case the existing rows remain valid and owned by
 * the caller.
 */
static int add_warning_to_collection(const char *engine_name,
                                     const char *warning_message,
                                     char ****collection, int **inner_counts,
                                     int current_count) {
  /* realloc preserves the old block on failure, so no manual
   * malloc/memcpy/free dance is needed to grow either array. */
  char ***grown =
      (char ***)realloc(*collection, sizeof(char **) * (current_count + 1));
  if (!grown)
    return current_count;
  *collection = grown;
  int *grown_counts =
      (int *)realloc(*inner_counts, sizeof(int) * (current_count + 1));
  if (!grown_counts)
    return current_count;
  *inner_counts = grown_counts;
  char **row = (char **)malloc(sizeof(char *) * 2);
  if (!row)
    return current_count;
  row[0] = dup_string_or_empty(engine_name);
  row[1] = dup_string_or_empty(warning_message);
  if (!row[0] || !row[1]) {
    /* free(NULL) is a no-op, so either half may have failed. */
    free(row[0]);
    free(row[1]);
    free(row);
    return current_count;
  }
  /* Only publish the row once it is fully populated. */
  (*collection)[current_count] = row;
  (*inner_counts)[current_count] = 2;
  return current_count + 1;
}
/* Map a job's terminal status to the user-facing warning sentence shown
 * under the engine's name.  Returns NULL for statuses that should not
 * surface a warning (OK, EMPTY, PENDING). */
static const char *warning_message_for_job(const ScrapeJob *job) {
  if (job->status == SCRAPE_STATUS_FETCH_ERROR)
    return "request failed before OmniSearch could read search results.";
  if (job->status == SCRAPE_STATUS_PARSE_MISMATCH)
    return "returned search results in a format OmniSearch could not parse.";
  if (job->status == SCRAPE_STATUS_BLOCKED)
    return "returned a captcha or another blocking page instead of search "
           "results.";
  return NULL;
}
int results_handler(UrlParams *params) { int results_handler(UrlParams *params) {
TemplateContext ctx = new_context(); TemplateContext ctx = new_context();
char *raw_query = ""; char *raw_query = "";
@@ -224,6 +285,8 @@ int results_handler(UrlParams *params) {
jobs[i].response.memory = NULL; jobs[i].response.memory = NULL;
jobs[i].response.size = 0; jobs[i].response.size = 0;
jobs[i].response.capacity = 0; jobs[i].response.capacity = 0;
jobs[i].http_status = 0;
jobs[i].status = SCRAPE_STATUS_PENDING;
} }
scrape_engines_parallel(jobs, ENGINE_COUNT); scrape_engines_parallel(jobs, ENGINE_COUNT);
@@ -260,6 +323,44 @@ int results_handler(UrlParams *params) {
free(infobox_inner_counts); free(infobox_inner_counts);
} }
int warning_count = 0;
for (int i = 0; i < ENGINE_COUNT; i++) {
if (warning_message_for_job(&jobs[i]))
warning_count++;
}
if (warning_count > 0) {
char ***warning_matrix = NULL;
int *warning_inner_counts = NULL;
int warning_index = 0;
for (int i = 0; i < ENGINE_COUNT; i++) {
const char *warning_message = warning_message_for_job(&jobs[i]);
if (!warning_message)
continue;
warning_index = add_warning_to_collection(
jobs[i].engine->name, warning_message, &warning_matrix,
&warning_inner_counts, warning_index);
}
if (warning_index > 0) {
context_set_array_of_arrays(&ctx, "engine_warnings", warning_matrix,
warning_index, warning_inner_counts);
}
if (warning_matrix) {
for (int i = 0; i < warning_index; i++) {
free(warning_matrix[i][0]);
free(warning_matrix[i][1]);
free(warning_matrix[i]);
}
free(warning_matrix);
}
if (warning_inner_counts)
free(warning_inner_counts);
}
int total_results = 0; int total_results = 0;
for (int i = 0; i < ENGINE_COUNT; i++) { for (int i = 0; i < ENGINE_COUNT; i++) {
total_results += jobs[i].results_count; total_results += jobs[i].results_count;
@@ -281,6 +382,15 @@ int results_handler(UrlParams *params) {
send_response(html); send_response(html);
free(html); free(html);
} }
for (int i = 0; i < ENGINE_COUNT; i++)
free(all_results[i]);
if (page == 1) {
for (int i = 0; i < HANDLER_COUNT; i++) {
if (infobox_data[i].success) {
free_infobox(&infobox_data[i].result);
}
}
}
free_context(&ctx); free_context(&ctx);
return 0; return 0;
} }
@@ -368,6 +478,10 @@ int results_handler(UrlParams *params) {
send_response(html); send_response(html);
free(html); free(html);
} }
for (int i = 0; i < ENGINE_COUNT; i++) {
free(all_results[i]);
}
} }
if (page == 1) { if (page == 1) {

View File

@@ -6,8 +6,98 @@
#include <libxml/HTMLparser.h> #include <libxml/HTMLparser.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h>
#include <time.h> #include <time.h>
/* True when both arguments are non-NULL and needle occurs somewhere in
 * response; safe to call with either pointer NULL. */
static int response_contains(const char *response, const char *needle) {
  if (response == NULL || needle == NULL)
    return 0;
  return strstr(response, needle) != NULL;
}
/* True when this job targets the Startpage engine. */
static int is_startpage_job(const ScrapeJob *job) {
  if (!job || !job->engine)
    return 0;
  return strcmp(job->engine->name, "Startpage") == 0;
}
/* Detect Startpage's captcha interstitial by any of its known page
 * markers.  Always false for engines other than Startpage. */
static int response_is_startpage_captcha(const ScrapeJob *job,
                                         const char *response) {
  static const char *const captcha_markers[] = {
      "<title>Startpage Captcha</title>",
      "Startpage Captcha",
      "/static-pages-assets/page-data/captcha/",
  };
  if (!is_startpage_job(job))
    return 0;
  for (size_t i = 0; i < sizeof captcha_markers / sizeof captcha_markers[0];
       i++) {
    if (response_contains(response, captcha_markers[i]))
      return 1;
  }
  return 0;
}
/* Heuristic: does the raw HTML carry the structural markers of a
 * search-results page for this engine?  Used to distinguish "parser
 * failed on a real results page" from a genuinely empty response.
 * Engines without a marker table always yield 0. */
static int response_looks_like_results_page(const ScrapeJob *job,
                                            const char *response) {
  struct engine_markers {
    const char *engine;
    const char *markers[3]; /* unused trailing slots are NULL */
  };
  static const struct engine_markers tables[] = {
      {"DuckDuckGo Lite", {"result-link", "result-snippet", NULL}},
      {"Startpage",
       {"<title>Startpage Search Results</title>", "class=\"w-gl",
        "data-testid=\"gl-title-link\""}},
      {"Yahoo", {"algo-sr", "compTitle", "compText"}},
  };
  if (!job || !job->engine || !response)
    return 0;
  for (size_t i = 0; i < sizeof tables / sizeof tables[0]; i++) {
    if (strcmp(job->engine->name, tables[i].engine) != 0)
      continue;
    for (size_t m = 0; m < 3 && tables[i].markers[m]; m++) {
      if (response_contains(response, tables[i].markers[m]))
        return 1;
    }
    return 0;
  }
  return 0;
}
/*
 * Classify the outcome of one scrape from the raw HTML body, setting
 * job->status and job->results_count.
 *
 * Order matters: a missing/empty body is a fetch error; a Startpage
 * captcha page is BLOCKED before any parsing is attempted; otherwise the
 * engine parser runs and a zero-result outcome is refined using the HTTP
 * status and the results-page heuristic.
 */
static void classify_job_response(ScrapeJob *job, const char *response,
                                  size_t response_size) {
  job->results_count = 0;
  if (response == NULL || response_size == 0) {
    job->status = SCRAPE_STATUS_FETCH_ERROR;
    return;
  }
  if (response_is_startpage_captcha(job, response)) {
    job->status = SCRAPE_STATUS_BLOCKED;
    return;
  }
  const int parse_flags =
      HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING;
  xmlDocPtr doc = htmlReadMemory(response, response_size, NULL, NULL,
                                 parse_flags);
  if (doc == NULL) {
    job->status = SCRAPE_STATUS_FETCH_ERROR;
    return;
  }
  job->results_count = job->engine->parser(job->engine->name, doc,
                                           job->out_results, job->max_results);
  xmlFreeDoc(doc);
  if (job->results_count > 0)
    job->status = SCRAPE_STATUS_OK;
  else if (job->http_status >= 400)
    job->status = SCRAPE_STATUS_FETCH_ERROR;
  else if (response_looks_like_results_page(job, response))
    job->status = SCRAPE_STATUS_PARSE_MISMATCH;
  else
    job->status = SCRAPE_STATUS_EMPTY;
}
int check_cache_for_job(ScrapeJob *job) { int check_cache_for_job(ScrapeJob *job) {
if (get_cache_ttl_search() <= 0) if (get_cache_ttl_search() <= 0)
return 0; return 0;
@@ -22,14 +112,14 @@ int check_cache_for_job(ScrapeJob *job) {
if (cache_get(key, (time_t)get_cache_ttl_search(), &cached_data, if (cache_get(key, (time_t)get_cache_ttl_search(), &cached_data,
&cached_size) == 0 && &cached_size) == 0 &&
cached_data && cached_size > 0) { cached_data && cached_size > 0) {
xmlDocPtr doc = htmlReadMemory(cached_data, cached_size, NULL, NULL, classify_job_response(job, cached_data, cached_size);
HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |
HTML_PARSE_NOWARNING); if (job->status == SCRAPE_STATUS_BLOCKED) {
if (doc) { free(cached_data);
job->results_count = job->engine->parser( free(key);
job->engine->name, doc, job->out_results, job->max_results); return 0;
xmlFreeDoc(doc);
} }
free(cached_data); free(cached_data);
free(key); free(key);
@@ -46,24 +136,17 @@ int check_cache_for_job(ScrapeJob *job) {
void parse_and_cache_response(ScrapeJob *job) { void parse_and_cache_response(ScrapeJob *job) {
if (job->response.size == 0) { if (job->response.size == 0) {
job->results_count = 0; job->results_count = 0;
job->status = SCRAPE_STATUS_FETCH_ERROR;
return; return;
} }
char *key = cache_compute_key(job->query, job->page, job->engine->name); classify_job_response(job, job->response.memory, job->response.size);
if (key && get_cache_ttl_search() > 0)
cache_set(key, job->response.memory, job->response.size);
free(key);
xmlDocPtr doc = htmlReadMemory( if (job->status == SCRAPE_STATUS_OK || job->status == SCRAPE_STATUS_EMPTY) {
job->response.memory, job->response.size, NULL, NULL, char *key = cache_compute_key(job->query, job->page, job->engine->name);
HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING); if (key && get_cache_ttl_search() > 0)
cache_set(key, job->response.memory, job->response.size);
if (doc) { free(key);
job->results_count = job->engine->parser(
job->engine->name, doc, job->out_results, job->max_results);
xmlFreeDoc(doc);
} else {
job->results_count = 0;
} }
} }
@@ -78,10 +161,14 @@ void cleanup_job_handle(ScrapeJob *job, CURL *handle) {
} }
void process_response(ScrapeJob *job, CURL *handle, CURLMsg *msg) { void process_response(ScrapeJob *job, CURL *handle, CURLMsg *msg) {
curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &job->http_status);
if (msg->data.result == CURLE_OK) if (msg->data.result == CURLE_OK)
parse_and_cache_response(job); parse_and_cache_response(job);
else else {
job->results_count = 0; job->results_count = 0;
job->status = SCRAPE_STATUS_FETCH_ERROR;
}
cleanup_job_handle(job, handle); cleanup_job_handle(job, handle);
} }
@@ -92,14 +179,20 @@ int setup_job(ScrapeJob *job, CURLM *multi_handle) {
if (job->response.memory) if (job->response.memory)
free(job->response.memory); free(job->response.memory);
job->results_count = 0;
job->http_status = 0;
job->status = SCRAPE_STATUS_PENDING;
if (check_cache_for_job(job)) { if (check_cache_for_job(job)) {
job->results_count = job->results_count > 0 ? job->results_count : 0; job->results_count = job->results_count > 0 ? job->results_count : 0;
return 0; return 0;
} }
char *encoded_query = curl_easy_escape(NULL, job->query, 0); char *encoded_query = curl_easy_escape(NULL, job->query, 0);
if (!encoded_query) if (!encoded_query) {
job->status = SCRAPE_STATUS_FETCH_ERROR;
return -1; return -1;
}
char *full_url = char *full_url =
build_search_url(job->engine->base_url, job->engine->page_param, build_search_url(job->engine->base_url, job->engine->page_param,
@@ -107,12 +200,15 @@ int setup_job(ScrapeJob *job, CURLM *multi_handle) {
encoded_query, job->page); encoded_query, job->page);
free(encoded_query); free(encoded_query);
if (!full_url) if (!full_url) {
job->status = SCRAPE_STATUS_FETCH_ERROR;
return -1; return -1;
}
job->handle = curl_easy_init(); job->handle = curl_easy_init();
if (!job->handle) { if (!job->handle) {
free(full_url); free(full_url);
job->status = SCRAPE_STATUS_FETCH_ERROR;
return -1; return -1;
} }
@@ -160,7 +256,8 @@ int should_retry(ScrapeJob *jobs, int num_jobs) {
return 0; return 0;
for (int i = 0; i < num_jobs; i++) { for (int i = 0; i < num_jobs; i++) {
if (jobs[i].results_count == 0 && jobs[i].response.size == 0) if (jobs[i].status == SCRAPE_STATUS_FETCH_ERROR ||
jobs[i].status == SCRAPE_STATUS_BLOCKED)
return 1; return 1;
} }
return 0; return 0;
@@ -170,6 +267,7 @@ int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) {
int retries = 0; int retries = 0;
retry: retry:
;
CURLM *multi_handle = curl_multi_init(); CURLM *multi_handle = curl_multi_init();
if (!multi_handle) if (!multi_handle)
return -1; return -1;
@@ -213,7 +311,9 @@ int scrape_engine(const SearchEngine *engine, const char *query,
.out_results = out_results, .out_results = out_results,
.max_results = max_results, .max_results = max_results,
.results_count = 0, .results_count = 0,
.page = 1}; .page = 1,
.http_status = 0,
.status = SCRAPE_STATUS_PENDING};
scrape_engines_parallel(&job, 1); scrape_engines_parallel(&job, 1);
return job.results_count; return job.results_count;

View File

@@ -32,6 +32,15 @@ typedef struct {
size_t capacity; size_t capacity;
} MemoryBuffer; } MemoryBuffer;
/* Outcome of one engine scrape job, used to decide whether a warning
 * banner is surfaced for the engine. */
typedef enum {
SCRAPE_STATUS_PENDING,        /* job initialized, not yet classified */
SCRAPE_STATUS_OK,             /* parser extracted at least one result */
SCRAPE_STATUS_EMPTY,          /* parsed cleanly but yielded no results */
SCRAPE_STATUS_FETCH_ERROR,    /* transfer failed, empty body, or HTTP >= 400 */
SCRAPE_STATUS_PARSE_MISMATCH, /* page looks like results but parser got none */
SCRAPE_STATUS_BLOCKED,        /* captcha / blocking page served instead */
} ScrapeStatus;
typedef struct { typedef struct {
const SearchEngine *engine; const SearchEngine *engine;
char *query; char *query;
@@ -41,6 +50,8 @@ typedef struct {
CURL *handle; CURL *handle;
MemoryBuffer response; MemoryBuffer response;
int results_count; int results_count;
long http_status;
ScrapeStatus status;
} ScrapeJob; } ScrapeJob;
extern const SearchEngine ENGINE_REGISTRY[]; extern const SearchEngine ENGINE_REGISTRY[];

View File

@@ -275,6 +275,30 @@ h1 span {
.results-container { .results-container {
grid-column:2; grid-column:2;
} }
/* Vertical stack of per-engine warning cards rendered above the
 * results list when any engine failed, was blocked, or mis-parsed. */
.engine-warning-list {
display:flex;
flex-direction:column;
gap:12px;
margin-bottom:24px;
}
/* One warning card; accent-colored left border marks it as a notice. */
.engine-warning {
background:var(--bg-card);
border:1px solid var(--border);
border-left:4px solid var(--accent);
border-radius:12px;
padding:14px 16px;
}
/* Engine name heading inside the card. */
.engine-warning-title {
display:block;
font-size:0.95rem;
font-weight:700;
margin-bottom:4px;
}
/* Human-readable explanation of what went wrong. */
.engine-warning-copy {
color:var(--text-muted);
line-height:1.5;
margin:0;
}
.result { .result {
margin-bottom:32px; margin-bottom:32px;
} }
@@ -454,6 +478,9 @@ h1 span {
.result { .result {
margin-bottom:24px; margin-bottom:24px;
} }
/* Tighter card padding — presumably inside a narrow-viewport media
 * query (its opening line is outside this view; confirm in full file). */
.engine-warning {
padding:12px 14px;
}
.result a { .result a {
font-size:1.1rem; font-size:1.1rem;
word-break:break-word; word-break:break-word;

View File

@@ -38,6 +38,21 @@
<aside class="sidebar-spacer"> <aside class="sidebar-spacer">
</aside> </aside>
<main class="results-container"> <main class="results-container">
{{if exists engine_warnings}}
<section class="engine-warning-list">
{{for warning in engine_warnings}}
<article class="engine-warning">
<strong class="engine-warning-title">
{{warning[0]}}
</strong>
<p class="engine-warning-copy">
{{warning[1]}}
</p>
</article>
{{endfor}}
</section>
{{endif}}
{{for result in results}} {{for result in results}}
<div class="result"> <div class="result">
<span class="url"> <span class="url">