mirror of
https://git.bwaaa.monster/omnisearch
synced 2026-03-25 17:19:02 +02:00
Surface blocked search engine responses
This commit is contained in:
@@ -155,6 +155,67 @@ static int add_infobox_to_collection(InfoBox *infobox, char ****collection,
|
||||
return current_count + 1;
|
||||
}
|
||||
|
||||
/*
 * Append one {engine name, warning text} row to a growable matrix of strings.
 *
 * engine_name, warning_message: copied with strdup(); NULL is stored as "".
 * collection:    in/out pointer to the array of rows (may point to NULL).
 * inner_counts:  in/out pointer to the parallel array of per-row lengths
 *                (may point to NULL).
 * current_count: number of rows already stored in both arrays.
 *
 * Returns current_count + 1 on success; the previous arrays are freed and
 * *collection / *inner_counts point at the enlarged copies, which the caller
 * owns (each row, both strings in it, and both arrays must be freed).
 *
 * On any allocation failure, or on a NULL out-pointer / negative count,
 * returns current_count and leaves *collection and *inner_counts exactly as
 * they were: every allocation is made before the caller's arrays are touched,
 * so a failed append never leaves a half-built or dangling row behind.
 */
static int add_warning_to_collection(const char *engine_name,
                                     const char *warning_message,
                                     char ****collection, int **inner_counts,
                                     int current_count) {
  enum { WARNING_FIELDS = 2 };

  if (!collection || !inner_counts || current_count < 0)
    return current_count;

  size_t old_len = (size_t)current_count;
  size_t new_len = old_len + 1;

  /* Acquire everything up front; commit only if all of it succeeded. */
  char **row = malloc(sizeof *row * WARNING_FIELDS);
  char ***grown_rows = malloc(sizeof *grown_rows * new_len);
  int *grown_counts = malloc(sizeof *grown_counts * new_len);
  char *name_copy = strdup(engine_name ? engine_name : "");
  char *message_copy = strdup(warning_message ? warning_message : "");

  if (!row || !grown_rows || !grown_counts || !name_copy || !message_copy) {
    free(message_copy);
    free(name_copy);
    free(grown_counts);
    free(grown_rows);
    free(row);
    return current_count;
  }

  /* Carry the existing rows over (shallow copy: row pointers are reused). */
  if (old_len > 0) {
    if (*collection)
      memcpy(grown_rows, *collection, sizeof *grown_rows * old_len);
    if (*inner_counts)
      memcpy(grown_counts, *inner_counts, sizeof *grown_counts * old_len);
  }

  row[0] = name_copy;
  row[1] = message_copy;
  grown_rows[old_len] = row;
  grown_counts[old_len] = WARNING_FIELDS;

  /* Only the old outer arrays are released; their rows now live in the copy. */
  free(*collection);
  free(*inner_counts);
  *collection = grown_rows;
  *inner_counts = grown_counts;

  return current_count + 1;
}
|
||||
|
||||
static const char *warning_message_for_job(const ScrapeJob *job) {
|
||||
switch (job->status) {
|
||||
case SCRAPE_STATUS_FETCH_ERROR:
|
||||
return "request failed before OmniSearch could read search results.";
|
||||
case SCRAPE_STATUS_PARSE_MISMATCH:
|
||||
return "returned search results in a format OmniSearch could not parse.";
|
||||
case SCRAPE_STATUS_BLOCKED:
|
||||
return "returned a captcha or another blocking page instead of search "
|
||||
"results.";
|
||||
default:
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
int results_handler(UrlParams *params) {
|
||||
TemplateContext ctx = new_context();
|
||||
char *raw_query = "";
|
||||
@@ -224,6 +285,8 @@ int results_handler(UrlParams *params) {
|
||||
jobs[i].response.memory = NULL;
|
||||
jobs[i].response.size = 0;
|
||||
jobs[i].response.capacity = 0;
|
||||
jobs[i].http_status = 0;
|
||||
jobs[i].status = SCRAPE_STATUS_PENDING;
|
||||
}
|
||||
|
||||
scrape_engines_parallel(jobs, ENGINE_COUNT);
|
||||
@@ -260,6 +323,44 @@ int results_handler(UrlParams *params) {
|
||||
free(infobox_inner_counts);
|
||||
}
|
||||
|
||||
int warning_count = 0;
|
||||
for (int i = 0; i < ENGINE_COUNT; i++) {
|
||||
if (warning_message_for_job(&jobs[i]))
|
||||
warning_count++;
|
||||
}
|
||||
|
||||
if (warning_count > 0) {
|
||||
char ***warning_matrix = NULL;
|
||||
int *warning_inner_counts = NULL;
|
||||
int warning_index = 0;
|
||||
|
||||
for (int i = 0; i < ENGINE_COUNT; i++) {
|
||||
const char *warning_message = warning_message_for_job(&jobs[i]);
|
||||
if (!warning_message)
|
||||
continue;
|
||||
|
||||
warning_index = add_warning_to_collection(
|
||||
jobs[i].engine->name, warning_message, &warning_matrix,
|
||||
&warning_inner_counts, warning_index);
|
||||
}
|
||||
|
||||
if (warning_index > 0) {
|
||||
context_set_array_of_arrays(&ctx, "engine_warnings", warning_matrix,
|
||||
warning_index, warning_inner_counts);
|
||||
}
|
||||
|
||||
if (warning_matrix) {
|
||||
for (int i = 0; i < warning_index; i++) {
|
||||
free(warning_matrix[i][0]);
|
||||
free(warning_matrix[i][1]);
|
||||
free(warning_matrix[i]);
|
||||
}
|
||||
free(warning_matrix);
|
||||
}
|
||||
if (warning_inner_counts)
|
||||
free(warning_inner_counts);
|
||||
}
|
||||
|
||||
int total_results = 0;
|
||||
for (int i = 0; i < ENGINE_COUNT; i++) {
|
||||
total_results += jobs[i].results_count;
|
||||
@@ -281,6 +382,15 @@ int results_handler(UrlParams *params) {
|
||||
send_response(html);
|
||||
free(html);
|
||||
}
|
||||
for (int i = 0; i < ENGINE_COUNT; i++)
|
||||
free(all_results[i]);
|
||||
if (page == 1) {
|
||||
for (int i = 0; i < HANDLER_COUNT; i++) {
|
||||
if (infobox_data[i].success) {
|
||||
free_infobox(&infobox_data[i].result);
|
||||
}
|
||||
}
|
||||
}
|
||||
free_context(&ctx);
|
||||
return 0;
|
||||
}
|
||||
@@ -368,6 +478,10 @@ int results_handler(UrlParams *params) {
|
||||
send_response(html);
|
||||
free(html);
|
||||
}
|
||||
|
||||
for (int i = 0; i < ENGINE_COUNT; i++) {
|
||||
free(all_results[i]);
|
||||
}
|
||||
}
|
||||
|
||||
if (page == 1) {
|
||||
|
||||
@@ -6,8 +6,98 @@
|
||||
#include <libxml/HTMLparser.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
|
||||
/*
 * Substring test that tolerates NULL.
 *
 * Returns 1 when `needle` occurs somewhere in `response`, 0 otherwise
 * (including when either argument is NULL).
 */
static int response_contains(const char *response, const char *needle) {
  if (response == NULL || needle == NULL) {
    return 0;
  }

  return strstr(response, needle) ? 1 : 0;
}
|
||||
|
||||
static int is_startpage_job(const ScrapeJob *job) {
|
||||
return job && job->engine && strcmp(job->engine->name, "Startpage") == 0;
|
||||
}
|
||||
|
||||
/*
 * Detect a Startpage captcha page.
 *
 * Returns 1 when the job targets Startpage and the response body contains
 * any of the captcha markers below; returns 0 for every other engine and for
 * bodies without a marker.
 */
static int response_is_startpage_captcha(const ScrapeJob *job,
                                         const char *response) {
  /*
   * NOTE(review): the bare "Startpage Captcha" marker already matches any
   * body that contains the <title> marker, and it would also match a body
   * that merely mentions that phrase -- confirm this breadth is intended.
   */
  static const char *const captcha_markers[] = {
      "<title>Startpage Captcha</title>",
      "Startpage Captcha",
      "/static-pages-assets/page-data/captcha/",
  };
  const size_t marker_count =
      sizeof captcha_markers / sizeof captcha_markers[0];

  if (!is_startpage_job(job)) {
    return 0;
  }

  for (size_t idx = 0; idx < marker_count; idx++) {
    if (response_contains(response, captcha_markers[idx])) {
      return 1;
    }
  }

  return 0;
}
|
||||
|
||||
static int response_looks_like_results_page(const ScrapeJob *job,
|
||||
const char *response) {
|
||||
if (!job || !job->engine || !response)
|
||||
return 0;
|
||||
|
||||
if (strcmp(job->engine->name, "DuckDuckGo Lite") == 0) {
|
||||
return response_contains(response, "result-link") ||
|
||||
response_contains(response, "result-snippet");
|
||||
}
|
||||
|
||||
if (strcmp(job->engine->name, "Startpage") == 0) {
|
||||
return response_contains(response, "<title>Startpage Search Results</title>") ||
|
||||
response_contains(response, "class=\"w-gl") ||
|
||||
response_contains(response, "data-testid=\"gl-title-link\"");
|
||||
}
|
||||
|
||||
if (strcmp(job->engine->name, "Yahoo") == 0) {
|
||||
return response_contains(response, "algo-sr") ||
|
||||
response_contains(response, "compTitle") ||
|
||||
response_contains(response, "compText");
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Parse a raw response body for `job` and record the outcome.
 *
 * Writes job->results_count (reset to 0 first, then the engine parser's
 * return value) and job->status. Checks are applied in this order, first
 * match wins:
 *   1. NULL or zero-length body                     -> FETCH_ERROR
 *   2. Startpage captcha page                       -> BLOCKED
 *   3. htmlReadMemory() yields no document          -> FETCH_ERROR
 *   4. engine parser produced at least one result   -> OK
 *   5. job->http_status >= 400                      -> FETCH_ERROR
 *   6. body still looks like a results page         -> PARSE_MISMATCH
 *   7. anything else                                -> EMPTY
 */
static void classify_job_response(ScrapeJob *job, const char *response,
                                  size_t response_size) {
  const int parse_flags =
      HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING;

  job->results_count = 0;

  if (response == NULL || response_size == 0) {
    job->status = SCRAPE_STATUS_FETCH_ERROR;
    return;
  }

  /* Checked before parsing, so a captcha page never reaches the parser. */
  if (response_is_startpage_captcha(job, response)) {
    job->status = SCRAPE_STATUS_BLOCKED;
    return;
  }

  xmlDocPtr parsed =
      htmlReadMemory(response, response_size, NULL, NULL, parse_flags);
  if (parsed == NULL) {
    job->status = SCRAPE_STATUS_FETCH_ERROR;
    return;
  }

  job->results_count = job->engine->parser(job->engine->name, parsed,
                                           job->out_results, job->max_results);
  xmlFreeDoc(parsed);

  if (job->results_count > 0) {
    job->status = SCRAPE_STATUS_OK;
  } else if (job->http_status >= 400) {
    job->status = SCRAPE_STATUS_FETCH_ERROR;
  } else if (response_looks_like_results_page(job, response)) {
    /* Familiar markup but zero parsed results. */
    job->status = SCRAPE_STATUS_PARSE_MISMATCH;
  } else {
    job->status = SCRAPE_STATUS_EMPTY;
  }
}
|
||||
|
||||
int check_cache_for_job(ScrapeJob *job) {
|
||||
if (get_cache_ttl_search() <= 0)
|
||||
return 0;
|
||||
@@ -22,14 +112,14 @@ int check_cache_for_job(ScrapeJob *job) {
|
||||
if (cache_get(key, (time_t)get_cache_ttl_search(), &cached_data,
|
||||
&cached_size) == 0 &&
|
||||
cached_data && cached_size > 0) {
|
||||
xmlDocPtr doc = htmlReadMemory(cached_data, cached_size, NULL, NULL,
|
||||
HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |
|
||||
HTML_PARSE_NOWARNING);
|
||||
if (doc) {
|
||||
job->results_count = job->engine->parser(
|
||||
job->engine->name, doc, job->out_results, job->max_results);
|
||||
xmlFreeDoc(doc);
|
||||
classify_job_response(job, cached_data, cached_size);
|
||||
|
||||
if (job->status == SCRAPE_STATUS_BLOCKED) {
|
||||
free(cached_data);
|
||||
free(key);
|
||||
return 0;
|
||||
}
|
||||
|
||||
free(cached_data);
|
||||
free(key);
|
||||
|
||||
@@ -46,24 +136,17 @@ int check_cache_for_job(ScrapeJob *job) {
|
||||
void parse_and_cache_response(ScrapeJob *job) {
|
||||
if (job->response.size == 0) {
|
||||
job->results_count = 0;
|
||||
job->status = SCRAPE_STATUS_FETCH_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
char *key = cache_compute_key(job->query, job->page, job->engine->name);
|
||||
if (key && get_cache_ttl_search() > 0)
|
||||
cache_set(key, job->response.memory, job->response.size);
|
||||
free(key);
|
||||
classify_job_response(job, job->response.memory, job->response.size);
|
||||
|
||||
xmlDocPtr doc = htmlReadMemory(
|
||||
job->response.memory, job->response.size, NULL, NULL,
|
||||
HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
|
||||
|
||||
if (doc) {
|
||||
job->results_count = job->engine->parser(
|
||||
job->engine->name, doc, job->out_results, job->max_results);
|
||||
xmlFreeDoc(doc);
|
||||
} else {
|
||||
job->results_count = 0;
|
||||
if (job->status == SCRAPE_STATUS_OK || job->status == SCRAPE_STATUS_EMPTY) {
|
||||
char *key = cache_compute_key(job->query, job->page, job->engine->name);
|
||||
if (key && get_cache_ttl_search() > 0)
|
||||
cache_set(key, job->response.memory, job->response.size);
|
||||
free(key);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -78,10 +161,14 @@ void cleanup_job_handle(ScrapeJob *job, CURL *handle) {
|
||||
}
|
||||
|
||||
void process_response(ScrapeJob *job, CURL *handle, CURLMsg *msg) {
|
||||
curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &job->http_status);
|
||||
|
||||
if (msg->data.result == CURLE_OK)
|
||||
parse_and_cache_response(job);
|
||||
else
|
||||
else {
|
||||
job->results_count = 0;
|
||||
job->status = SCRAPE_STATUS_FETCH_ERROR;
|
||||
}
|
||||
|
||||
cleanup_job_handle(job, handle);
|
||||
}
|
||||
@@ -92,14 +179,20 @@ int setup_job(ScrapeJob *job, CURLM *multi_handle) {
|
||||
if (job->response.memory)
|
||||
free(job->response.memory);
|
||||
|
||||
job->results_count = 0;
|
||||
job->http_status = 0;
|
||||
job->status = SCRAPE_STATUS_PENDING;
|
||||
|
||||
if (check_cache_for_job(job)) {
|
||||
job->results_count = job->results_count > 0 ? job->results_count : 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
char *encoded_query = curl_easy_escape(NULL, job->query, 0);
|
||||
if (!encoded_query)
|
||||
if (!encoded_query) {
|
||||
job->status = SCRAPE_STATUS_FETCH_ERROR;
|
||||
return -1;
|
||||
}
|
||||
|
||||
char *full_url =
|
||||
build_search_url(job->engine->base_url, job->engine->page_param,
|
||||
@@ -107,12 +200,15 @@ int setup_job(ScrapeJob *job, CURLM *multi_handle) {
|
||||
encoded_query, job->page);
|
||||
free(encoded_query);
|
||||
|
||||
if (!full_url)
|
||||
if (!full_url) {
|
||||
job->status = SCRAPE_STATUS_FETCH_ERROR;
|
||||
return -1;
|
||||
}
|
||||
|
||||
job->handle = curl_easy_init();
|
||||
if (!job->handle) {
|
||||
free(full_url);
|
||||
job->status = SCRAPE_STATUS_FETCH_ERROR;
|
||||
return -1;
|
||||
}
|
||||
|
||||
@@ -160,7 +256,8 @@ int should_retry(ScrapeJob *jobs, int num_jobs) {
|
||||
return 0;
|
||||
|
||||
for (int i = 0; i < num_jobs; i++) {
|
||||
if (jobs[i].results_count == 0 && jobs[i].response.size == 0)
|
||||
if (jobs[i].status == SCRAPE_STATUS_FETCH_ERROR ||
|
||||
jobs[i].status == SCRAPE_STATUS_BLOCKED)
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
@@ -170,6 +267,7 @@ int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) {
|
||||
int retries = 0;
|
||||
|
||||
retry:
|
||||
;
|
||||
CURLM *multi_handle = curl_multi_init();
|
||||
if (!multi_handle)
|
||||
return -1;
|
||||
@@ -213,7 +311,9 @@ int scrape_engine(const SearchEngine *engine, const char *query,
|
||||
.out_results = out_results,
|
||||
.max_results = max_results,
|
||||
.results_count = 0,
|
||||
.page = 1};
|
||||
.page = 1,
|
||||
.http_status = 0,
|
||||
.status = SCRAPE_STATUS_PENDING};
|
||||
|
||||
scrape_engines_parallel(&job, 1);
|
||||
return job.results_count;
|
||||
|
||||
@@ -32,6 +32,15 @@ typedef struct {
|
||||
size_t capacity;
|
||||
} MemoryBuffer;
|
||||
|
||||
/* Outcome of a single engine scrape. */
typedef enum {
  /* Set when a job is initialised, before any outcome is recorded. */
  SCRAPE_STATUS_PENDING = 0,
  /* The engine parser returned at least one result. */
  SCRAPE_STATUS_OK,
  /* A body was parsed, but no results and no other condition applied. */
  SCRAPE_STATUS_EMPTY,
  /* The request failed, the body was empty or unparsable, or HTTP >= 400. */
  SCRAPE_STATUS_FETCH_ERROR,
  /* The body looks like a results page but the parser found nothing. */
  SCRAPE_STATUS_PARSE_MISMATCH,
  /* The engine served a captcha page instead of results. */
  SCRAPE_STATUS_BLOCKED,
} ScrapeStatus;
|
||||
|
||||
typedef struct {
|
||||
const SearchEngine *engine;
|
||||
char *query;
|
||||
@@ -41,6 +50,8 @@ typedef struct {
|
||||
CURL *handle;
|
||||
MemoryBuffer response;
|
||||
int results_count;
|
||||
long http_status;
|
||||
ScrapeStatus status;
|
||||
} ScrapeJob;
|
||||
|
||||
extern const SearchEngine ENGINE_REGISTRY[];
|
||||
|
||||
@@ -275,6 +275,30 @@ h1 span {
|
||||
.results-container {
|
||||
grid-column:2;
|
||||
}
|
||||
.engine-warning-list {
|
||||
display:flex;
|
||||
flex-direction:column;
|
||||
gap:12px;
|
||||
margin-bottom:24px;
|
||||
}
|
||||
.engine-warning {
|
||||
background:var(--bg-card);
|
||||
border:1px solid var(--border);
|
||||
border-left:4px solid var(--accent);
|
||||
border-radius:12px;
|
||||
padding:14px 16px;
|
||||
}
|
||||
.engine-warning-title {
|
||||
display:block;
|
||||
font-size:0.95rem;
|
||||
font-weight:700;
|
||||
margin-bottom:4px;
|
||||
}
|
||||
.engine-warning-copy {
|
||||
color:var(--text-muted);
|
||||
line-height:1.5;
|
||||
margin:0;
|
||||
}
|
||||
.result {
|
||||
margin-bottom:32px;
|
||||
}
|
||||
@@ -454,6 +478,9 @@ h1 span {
|
||||
.result {
|
||||
margin-bottom:24px;
|
||||
}
|
||||
.engine-warning {
|
||||
padding:12px 14px;
|
||||
}
|
||||
.result a {
|
||||
font-size:1.1rem;
|
||||
word-break:break-word;
|
||||
|
||||
@@ -38,6 +38,21 @@
|
||||
<aside class="sidebar-spacer">
|
||||
</aside>
|
||||
<main class="results-container">
|
||||
{{if exists engine_warnings}}
|
||||
<section class="engine-warning-list">
|
||||
{{for warning in engine_warnings}}
|
||||
<article class="engine-warning">
|
||||
<strong class="engine-warning-title">
|
||||
{{warning[0]}}
|
||||
</strong>
|
||||
<p class="engine-warning-copy">
|
||||
{{warning[1]}}
|
||||
</p>
|
||||
</article>
|
||||
{{endfor}}
|
||||
</section>
|
||||
{{endif}}
|
||||
|
||||
{{for result in results}}
|
||||
<div class="result">
|
||||
<span class="url">
|
||||
|
||||
Reference in New Issue
Block a user