mirror of
https://git.bwaaa.monster/omnisearch
synced 2026-03-25 17:19:02 +02:00
fix: refactored scraping components
This commit is contained in:
@@ -23,6 +23,13 @@
|
|||||||
#define INFOBOX_FIELD_COUNT 4
|
#define INFOBOX_FIELD_COUNT 4
|
||||||
#define MAX_RESULTS_PER_ENGINE 10
|
#define MAX_RESULTS_PER_ENGINE 10
|
||||||
|
|
||||||
|
#define CURL_TIMEOUT_SECS 15L
|
||||||
|
#define CURL_DNS_TIMEOUT_SECS 300L
|
||||||
|
|
||||||
|
#define BING_IMAGE_URL "https://www.bing.com/images/search"
|
||||||
|
#define IMAGE_RESULTS_PER_PAGE 32
|
||||||
|
#define IMAGE_RESULT_FIELDS 4
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
char host[256];
|
char host[256];
|
||||||
int port;
|
int port;
|
||||||
|
|||||||
@@ -1,15 +1,7 @@
|
|||||||
#include "Images.h"
|
#include "Images.h"
|
||||||
#include "../Scraping/Scraping.h"
|
#include "../Scraping/ImageScraping.h"
|
||||||
#include "../Utility/HttpClient.h"
|
|
||||||
#include "../Utility/Unescape.h"
|
#include "../Utility/Unescape.h"
|
||||||
#include "../Utility/XmlHelper.h"
|
#include "Config.h"
|
||||||
|
|
||||||
#include <curl/curl.h>
|
|
||||||
#include <libxml/HTMLparser.h>
|
|
||||||
#include <libxml/xpath.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <string.h>
|
|
||||||
|
|
||||||
int images_handler(UrlParams *params) {
|
int images_handler(UrlParams *params) {
|
||||||
TemplateContext ctx = new_context();
|
TemplateContext ctx = new_context();
|
||||||
@@ -28,12 +20,12 @@ int images_handler(UrlParams *params) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
context_set(&ctx, "query", raw_query);
|
|
||||||
|
|
||||||
char page_str[16], prev_str[16], next_str[16];
|
char page_str[16], prev_str[16], next_str[16];
|
||||||
snprintf(page_str, sizeof(page_str), "%d", page);
|
snprintf(page_str, sizeof(page_str), "%d", page);
|
||||||
snprintf(prev_str, sizeof(prev_str), "%d", page > 1 ? page - 1 : 0);
|
snprintf(prev_str, sizeof(prev_str), "%d", page > 1 ? page - 1 : 0);
|
||||||
snprintf(next_str, sizeof(next_str), "%d", page + 1);
|
snprintf(next_str, sizeof(next_str), "%d", page + 1);
|
||||||
|
|
||||||
|
context_set(&ctx, "query", raw_query);
|
||||||
context_set(&ctx, "page", page_str);
|
context_set(&ctx, "page", page_str);
|
||||||
context_set(&ctx, "prev_page", prev_str);
|
context_set(&ctx, "prev_page", prev_str);
|
||||||
context_set(&ctx, "next_page", next_str);
|
context_set(&ctx, "next_page", next_str);
|
||||||
@@ -49,208 +41,41 @@ int images_handler(UrlParams *params) {
|
|||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
CURL *tmp = curl_easy_init();
|
ImageResult *results = NULL;
|
||||||
if (!tmp) {
|
int result_count = 0;
|
||||||
send_response("<h1>Error initializing curl</h1>");
|
|
||||||
if (display_query)
|
|
||||||
free(display_query);
|
|
||||||
free_context(&ctx);
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
char *encoded_query = curl_easy_escape(tmp, raw_query, 0);
|
|
||||||
curl_easy_cleanup(tmp);
|
|
||||||
|
|
||||||
if (!encoded_query) {
|
if (scrape_images(raw_query, page, &results, &result_count) != 0 ||
|
||||||
send_response("<h1>Error encoding query</h1>");
|
!results) {
|
||||||
if (display_query)
|
|
||||||
free(display_query);
|
|
||||||
free_context(&ctx);
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
char url[1024];
|
|
||||||
int first = (page - 1) * 32 + 1;
|
|
||||||
snprintf(url, sizeof(url), "https://www.bing.com/images/search?q=%s&first=%d",
|
|
||||||
encoded_query, first);
|
|
||||||
|
|
||||||
HttpResponse resp = http_get(
|
|
||||||
url,
|
|
||||||
"Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko");
|
|
||||||
if (!resp.memory) {
|
|
||||||
send_response("<h1>Error fetching images</h1>");
|
send_response("<h1>Error fetching images</h1>");
|
||||||
free(encoded_query);
|
|
||||||
free(display_query);
|
free(display_query);
|
||||||
free_context(&ctx);
|
free_context(&ctx);
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
htmlDocPtr doc = htmlReadMemory(resp.memory, resp.size, NULL, NULL,
|
char ***image_matrix = malloc(sizeof(char **) * result_count);
|
||||||
HTML_PARSE_RECOVER | HTML_PARSE_NOERROR);
|
int *inner_counts = malloc(sizeof(int) * result_count);
|
||||||
if (!doc) {
|
|
||||||
http_response_free(&resp);
|
if (!image_matrix || !inner_counts) {
|
||||||
free(encoded_query);
|
if (image_matrix)
|
||||||
|
free(image_matrix);
|
||||||
|
if (inner_counts)
|
||||||
|
free(inner_counts);
|
||||||
|
free_image_results(results, result_count);
|
||||||
free(display_query);
|
free(display_query);
|
||||||
free_context(&ctx);
|
free_context(&ctx);
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
|
for (int i = 0; i < result_count; i++) {
|
||||||
|
image_matrix[i] = malloc(sizeof(char *) * IMAGE_RESULT_FIELDS);
|
||||||
if (!xpathCtx) {
|
image_matrix[i][0] = strdup(results[i].thumbnail_url);
|
||||||
xmlFreeDoc(doc);
|
image_matrix[i][1] = strdup(results[i].title);
|
||||||
http_response_free(&resp);
|
image_matrix[i][2] = strdup(results[i].page_url);
|
||||||
free(encoded_query);
|
image_matrix[i][3] = strdup(results[i].full_url);
|
||||||
free(display_query);
|
inner_counts[i] = IMAGE_RESULT_FIELDS;
|
||||||
free_context(&ctx);
|
|
||||||
return -1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
xmlXPathObjectPtr xpathObj =
|
context_set_array_of_arrays(&ctx, "images", image_matrix, result_count,
|
||||||
xmlXPathEvalExpression((const xmlChar *)"//div[@class='item']", xpathCtx);
|
|
||||||
|
|
||||||
int image_count = 0;
|
|
||||||
char ***image_matrix = NULL;
|
|
||||||
int *inner_counts = NULL;
|
|
||||||
|
|
||||||
if (xpathObj && xpathObj->nodesetval) {
|
|
||||||
int nodes = xpathObj->nodesetval->nodeNr;
|
|
||||||
|
|
||||||
int max_images = (nodes < 32) ? nodes : 32;
|
|
||||||
image_matrix = malloc(sizeof(char **) * max_images);
|
|
||||||
inner_counts = malloc(sizeof(int) * max_images);
|
|
||||||
if (!image_matrix || !inner_counts) {
|
|
||||||
if (image_matrix) free(image_matrix);
|
|
||||||
if (inner_counts) free(inner_counts);
|
|
||||||
image_matrix = NULL;
|
|
||||||
inner_counts = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 0; i < nodes; i++) {
|
|
||||||
if (image_count >= 32)
|
|
||||||
break;
|
|
||||||
|
|
||||||
xmlNodePtr node = xpathObj->nodesetval->nodeTab[i];
|
|
||||||
xmlNodePtr img_node = NULL;
|
|
||||||
xmlNodePtr tit_node = NULL;
|
|
||||||
xmlNodePtr des_node = NULL;
|
|
||||||
xmlNodePtr thumb_link = NULL;
|
|
||||||
|
|
||||||
for (xmlNodePtr child = node->children; child; child = child->next) {
|
|
||||||
if (child->type != XML_ELEMENT_NODE)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
if (xmlStrcmp(child->name, (const xmlChar *)"a") == 0) {
|
|
||||||
xmlChar *class = xmlGetProp(child, (const xmlChar *)"class");
|
|
||||||
if (class) {
|
|
||||||
if (xmlStrstr(class, (const xmlChar *)"thumb") != NULL) {
|
|
||||||
thumb_link = child;
|
|
||||||
for (xmlNodePtr thumb_child = child->children; thumb_child;
|
|
||||||
thumb_child = thumb_child->next) {
|
|
||||||
if (xmlStrcmp(thumb_child->name, (const xmlChar *)"div") == 0) {
|
|
||||||
xmlChar *div_class =
|
|
||||||
xmlGetProp(thumb_child, (const xmlChar *)"class");
|
|
||||||
if (div_class &&
|
|
||||||
xmlStrcmp(div_class, (const xmlChar *)"cico") == 0) {
|
|
||||||
for (xmlNodePtr cico_child = thumb_child->children;
|
|
||||||
cico_child; cico_child = cico_child->next) {
|
|
||||||
if (xmlStrcmp(cico_child->name, (const xmlChar *)"img") ==
|
|
||||||
0) {
|
|
||||||
img_node = cico_child;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (div_class)
|
|
||||||
xmlFree(div_class);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if (xmlStrstr(class, (const xmlChar *)"tit") != NULL) {
|
|
||||||
tit_node = child;
|
|
||||||
}
|
|
||||||
xmlFree(class);
|
|
||||||
}
|
|
||||||
} else if (xmlStrcmp(child->name, (const xmlChar *)"div") == 0) {
|
|
||||||
xmlChar *class = xmlGetProp(child, (const xmlChar *)"class");
|
|
||||||
if (class && xmlStrcmp(class, (const xmlChar *)"meta") == 0) {
|
|
||||||
for (xmlNodePtr meta_child = child->children; meta_child;
|
|
||||||
meta_child = meta_child->next) {
|
|
||||||
if (xmlStrcmp(meta_child->name, (const xmlChar *)"div") == 0) {
|
|
||||||
xmlChar *div_class =
|
|
||||||
xmlGetProp(meta_child, (const xmlChar *)"class");
|
|
||||||
if (div_class) {
|
|
||||||
if (xmlStrcmp(div_class, (const xmlChar *)"des") == 0) {
|
|
||||||
des_node = meta_child;
|
|
||||||
}
|
|
||||||
xmlFree(div_class);
|
|
||||||
}
|
|
||||||
} else if (xmlStrcmp(meta_child->name, (const xmlChar *)"a") ==
|
|
||||||
0) {
|
|
||||||
xmlChar *a_class =
|
|
||||||
xmlGetProp(meta_child, (const xmlChar *)"class");
|
|
||||||
if (a_class &&
|
|
||||||
xmlStrstr(a_class, (const xmlChar *)"tit") != NULL) {
|
|
||||||
tit_node = meta_child;
|
|
||||||
}
|
|
||||||
if (a_class)
|
|
||||||
xmlFree(a_class);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (class)
|
|
||||||
xmlFree(class);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
xmlChar *iurl =
|
|
||||||
img_node ? xmlGetProp(img_node, (const xmlChar *)"src") : NULL;
|
|
||||||
xmlChar *full_url =
|
|
||||||
thumb_link ? xmlGetProp(thumb_link, (const xmlChar *)"href") : NULL;
|
|
||||||
xmlChar *title = des_node
|
|
||||||
? xmlNodeGetContent(des_node)
|
|
||||||
: (tit_node ? xmlNodeGetContent(tit_node) : NULL);
|
|
||||||
xmlChar *rurl =
|
|
||||||
tit_node ? xmlGetProp(tit_node, (const xmlChar *)"href") : NULL;
|
|
||||||
|
|
||||||
if (iurl && strlen((char *)iurl) > 0) {
|
|
||||||
char *proxy_url = NULL;
|
|
||||||
CURL *esc_curl = curl_easy_init();
|
|
||||||
if (esc_curl) {
|
|
||||||
char *encoded = curl_easy_escape(esc_curl, (char *)iurl, 0);
|
|
||||||
if (encoded) {
|
|
||||||
size_t proxy_len = strlen("/proxy?url=") + strlen(encoded) + 1;
|
|
||||||
proxy_url = malloc(proxy_len);
|
|
||||||
if (proxy_url) {
|
|
||||||
snprintf(proxy_url, proxy_len, "/proxy?url=%s", encoded);
|
|
||||||
}
|
|
||||||
curl_free(encoded);
|
|
||||||
}
|
|
||||||
curl_easy_cleanup(esc_curl);
|
|
||||||
}
|
|
||||||
|
|
||||||
image_matrix[image_count] = malloc(sizeof(char *) * 4);
|
|
||||||
image_matrix[image_count][0] =
|
|
||||||
proxy_url ? strdup(proxy_url) : strdup((char *)iurl);
|
|
||||||
free(proxy_url);
|
|
||||||
image_matrix[image_count][1] = strdup(title ? (char *)title : "Image");
|
|
||||||
image_matrix[image_count][2] = strdup(rurl ? (char *)rurl : "#");
|
|
||||||
image_matrix[image_count][3] =
|
|
||||||
strdup(full_url ? (char *)full_url : "#");
|
|
||||||
inner_counts[image_count] = 4;
|
|
||||||
image_count++;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (iurl)
|
|
||||||
xmlFree(iurl);
|
|
||||||
if (title)
|
|
||||||
xmlFree(title);
|
|
||||||
if (rurl)
|
|
||||||
xmlFree(rurl);
|
|
||||||
if (full_url)
|
|
||||||
xmlFree(full_url);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
context_set_array_of_arrays(&ctx, "images", image_matrix, image_count,
|
|
||||||
inner_counts);
|
inner_counts);
|
||||||
|
|
||||||
char *rendered = render_template("images.html", &ctx);
|
char *rendered = render_template("images.html", &ctx);
|
||||||
@@ -261,27 +86,15 @@ int images_handler(UrlParams *params) {
|
|||||||
send_response("<h1>Error rendering image results</h1>");
|
send_response("<h1>Error rendering image results</h1>");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (image_matrix) {
|
for (int i = 0; i < result_count; i++) {
|
||||||
for (int i = 0; i < image_count; i++) {
|
for (int j = 0; j < IMAGE_RESULT_FIELDS; j++)
|
||||||
for (int j = 0; j < 4; j++) {
|
free(image_matrix[i][j]);
|
||||||
free(image_matrix[i][j]);
|
free(image_matrix[i]);
|
||||||
}
|
|
||||||
free(image_matrix[i]);
|
|
||||||
}
|
|
||||||
free(image_matrix);
|
|
||||||
}
|
|
||||||
if (inner_counts) {
|
|
||||||
free(inner_counts);
|
|
||||||
}
|
}
|
||||||
|
free(image_matrix);
|
||||||
|
free(inner_counts);
|
||||||
|
|
||||||
if (xpathObj)
|
free_image_results(results, result_count);
|
||||||
xmlXPathFreeObject(xpathObj);
|
|
||||||
if (xpathCtx)
|
|
||||||
xmlXPathFreeContext(xpathCtx);
|
|
||||||
if (doc)
|
|
||||||
xmlFreeDoc(doc);
|
|
||||||
http_response_free(&resp);
|
|
||||||
curl_free(encoded_query);
|
|
||||||
free(display_query);
|
free(display_query);
|
||||||
free_context(&ctx);
|
free_context(&ctx);
|
||||||
|
|
||||||
|
|||||||
239
src/Scraping/ImageScraping.c
Normal file
239
src/Scraping/ImageScraping.c
Normal file
@@ -0,0 +1,239 @@
|
|||||||
|
#include "ImageScraping.h"
|
||||||
|
#include "../Utility/HttpClient.h"
|
||||||
|
#include "Config.h"
|
||||||
|
#include <libxml/HTMLparser.h>
|
||||||
|
#include <libxml/xpath.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
static char *build_proxy_url(const char *image_url) {
|
||||||
|
if (!image_url)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
char *proxy_url = NULL;
|
||||||
|
CURL *curl = curl_easy_init();
|
||||||
|
if (curl) {
|
||||||
|
char *encoded = curl_easy_escape(curl, (char *)image_url, 0);
|
||||||
|
if (encoded) {
|
||||||
|
size_t len = strlen("/proxy?url=") + strlen(encoded) + 1;
|
||||||
|
proxy_url = malloc(len);
|
||||||
|
if (proxy_url)
|
||||||
|
snprintf(proxy_url, len, "/proxy?url=%s", encoded);
|
||||||
|
curl_free(encoded);
|
||||||
|
}
|
||||||
|
curl_easy_cleanup(curl);
|
||||||
|
}
|
||||||
|
|
||||||
|
return proxy_url;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int parse_image_node(xmlNodePtr node, ImageResult *result) {
|
||||||
|
xmlNodePtr img_node = NULL;
|
||||||
|
xmlNodePtr tit_node = NULL;
|
||||||
|
xmlNodePtr des_node = NULL;
|
||||||
|
xmlNodePtr thumb_link = NULL;
|
||||||
|
|
||||||
|
for (xmlNodePtr child = node->children; child; child = child->next) {
|
||||||
|
if (child->type != XML_ELEMENT_NODE)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if (xmlStrcmp(child->name, (const xmlChar *)"a") == 0) {
|
||||||
|
xmlChar *class = xmlGetProp(child, (const xmlChar *)"class");
|
||||||
|
if (class) {
|
||||||
|
if (xmlStrstr(class, (const xmlChar *)"thumb") != NULL) {
|
||||||
|
thumb_link = child;
|
||||||
|
for (xmlNodePtr thumb_child = child->children; thumb_child;
|
||||||
|
thumb_child = thumb_child->next) {
|
||||||
|
if (xmlStrcmp(thumb_child->name, (const xmlChar *)"div") == 0) {
|
||||||
|
xmlChar *div_class =
|
||||||
|
xmlGetProp(thumb_child, (const xmlChar *)"class");
|
||||||
|
if (div_class &&
|
||||||
|
xmlStrcmp(div_class, (const xmlChar *)"cico") == 0) {
|
||||||
|
for (xmlNodePtr cico_child = thumb_child->children; cico_child;
|
||||||
|
cico_child = cico_child->next) {
|
||||||
|
if (xmlStrcmp(cico_child->name, (const xmlChar *)"img") ==
|
||||||
|
0) {
|
||||||
|
img_node = cico_child;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (div_class)
|
||||||
|
xmlFree(div_class);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if (xmlStrstr(class, (const xmlChar *)"tit") != NULL) {
|
||||||
|
tit_node = child;
|
||||||
|
}
|
||||||
|
xmlFree(class);
|
||||||
|
}
|
||||||
|
} else if (xmlStrcmp(child->name, (const xmlChar *)"div") == 0) {
|
||||||
|
xmlChar *class = xmlGetProp(child, (const xmlChar *)"class");
|
||||||
|
if (class && xmlStrcmp(class, (const xmlChar *)"meta") == 0) {
|
||||||
|
for (xmlNodePtr meta_child = child->children; meta_child;
|
||||||
|
meta_child = meta_child->next) {
|
||||||
|
if (xmlStrcmp(meta_child->name, (const xmlChar *)"div") == 0) {
|
||||||
|
xmlChar *div_class =
|
||||||
|
xmlGetProp(meta_child, (const xmlChar *)"class");
|
||||||
|
if (div_class) {
|
||||||
|
if (xmlStrcmp(div_class, (const xmlChar *)"des") == 0) {
|
||||||
|
des_node = meta_child;
|
||||||
|
}
|
||||||
|
xmlFree(div_class);
|
||||||
|
}
|
||||||
|
} else if (xmlStrcmp(meta_child->name, (const xmlChar *)"a") == 0) {
|
||||||
|
xmlChar *a_class = xmlGetProp(meta_child, (const xmlChar *)"class");
|
||||||
|
if (a_class && xmlStrstr(a_class, (const xmlChar *)"tit") != NULL) {
|
||||||
|
tit_node = meta_child;
|
||||||
|
}
|
||||||
|
if (a_class)
|
||||||
|
xmlFree(a_class);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (class)
|
||||||
|
xmlFree(class);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
xmlChar *iurl =
|
||||||
|
img_node ? xmlGetProp(img_node, (const xmlChar *)"src") : NULL;
|
||||||
|
xmlChar *full_url =
|
||||||
|
thumb_link ? xmlGetProp(thumb_link, (const xmlChar *)"href") : NULL;
|
||||||
|
xmlChar *title = des_node ? xmlNodeGetContent(des_node)
|
||||||
|
: (tit_node ? xmlNodeGetContent(tit_node) : NULL);
|
||||||
|
xmlChar *rurl =
|
||||||
|
tit_node ? xmlGetProp(tit_node, (const xmlChar *)"href") : NULL;
|
||||||
|
|
||||||
|
if (!iurl || strlen((char *)iurl) == 0) {
|
||||||
|
if (iurl)
|
||||||
|
xmlFree(iurl);
|
||||||
|
if (title)
|
||||||
|
xmlFree(title);
|
||||||
|
if (rurl)
|
||||||
|
xmlFree(rurl);
|
||||||
|
if (full_url)
|
||||||
|
xmlFree(full_url);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
char *proxy_url = build_proxy_url((char *)iurl);
|
||||||
|
result->thumbnail_url = proxy_url ? strdup(proxy_url) : strdup((char *)iurl);
|
||||||
|
free(proxy_url);
|
||||||
|
result->title = strdup(title ? (char *)title : "Image");
|
||||||
|
result->page_url = strdup(rurl ? (char *)rurl : "#");
|
||||||
|
result->full_url = strdup(full_url ? (char *)full_url : "#");
|
||||||
|
|
||||||
|
if (iurl)
|
||||||
|
xmlFree(iurl);
|
||||||
|
if (title)
|
||||||
|
xmlFree(title);
|
||||||
|
if (rurl)
|
||||||
|
xmlFree(rurl);
|
||||||
|
if (full_url)
|
||||||
|
xmlFree(full_url);
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Fetch one page of Bing image results for `query` and parse it.
 *
 * query        search terms; must be a non-empty string
 * page         1-based page number; values below 1 are treated as 1
 * out_results  receives a heap array of results, or NULL
 * out_count    receives the number of valid entries in *out_results
 *
 * Returns 0 on success and -1 on failure (bad arguments, URL too long for
 * the buffer, network, parse or allocation errors). On failure *out_results
 * is NULL and *out_count is 0. On success *out_results may still be NULL if
 * the XPath query produced no node set; otherwise the caller owns the array
 * and must release it with free_image_results().
 */
int scrape_images(const char *query, int page, ImageResult **out_results,
                  int *out_count) {
  if (!out_results || !out_count)
    return -1;

  *out_results = NULL;
  *out_count = 0;

  if (!query || query[0] == '\0')
    return -1;
  if (page < 1)
    page = 1;

  /* A handle is passed because older libcurl versions require one. */
  CURL *escaper = curl_easy_init();
  if (!escaper)
    return -1;
  char *encoded_query = curl_easy_escape(escaper, query, 0);
  curl_easy_cleanup(escaper);
  if (!encoded_query)
    return -1;

  /* Widen before multiplying so a large page number cannot overflow int. */
  long long first = (long long)(page - 1) * IMAGE_RESULTS_PER_PAGE + 1;

  char url[BUFFER_SIZE_LARGE];
  int written = snprintf(url, sizeof(url), "%s?q=%s&first=%lld",
                         BING_IMAGE_URL, encoded_query, first);
  /* Memory from curl_easy_escape() must go back through curl_free(). */
  curl_free(encoded_query);
  if (written < 0 || (size_t)written >= sizeof(url))
    return -1; /* a truncated URL would send a corrupted query */

  HttpResponse resp = http_get(
      url,
      "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko");
  if (!resp.memory)
    return -1;

  int rc = -1;
  xmlXPathContextPtr xpathCtx = NULL;
  xmlXPathObjectPtr xpathObj = NULL;

  htmlDocPtr doc = htmlReadMemory(resp.memory, resp.size, NULL, NULL,
                                  HTML_PARSE_RECOVER | HTML_PARSE_NOERROR);
  if (!doc)
    goto cleanup;

  xpathCtx = xmlXPathNewContext(doc);
  if (!xpathCtx)
    goto cleanup;

  xpathObj =
      xmlXPathEvalExpression((const xmlChar *)"//div[@class='item']", xpathCtx);
  if (!xpathObj || !xpathObj->nodesetval) {
    rc = 0; /* nothing matched: success with no results */
    goto cleanup;
  }

  int nodes = xpathObj->nodesetval->nodeNr;
  int max_images =
      (nodes < IMAGE_RESULTS_PER_PAGE) ? nodes : IMAGE_RESULTS_PER_PAGE;

  /* Never request zero bytes: malloc(0)/calloc(0) may legally return NULL,
   * which would turn an empty result page into an error on some platforms. */
  ImageResult *results =
      calloc(max_images > 0 ? (size_t)max_images : 1, sizeof(*results));
  if (!results)
    goto cleanup;

  int count = 0;
  for (int i = 0; i < nodes && count < IMAGE_RESULTS_PER_PAGE; i++) {
    /* Only advance when the node produced a complete result. */
    if (parse_image_node(xpathObj->nodesetval->nodeTab[i], &results[count]))
      count++;
  }

  *out_results = results;
  *out_count = count;
  rc = 0;

cleanup:
  if (xpathObj)
    xmlXPathFreeObject(xpathObj);
  if (xpathCtx)
    xmlXPathFreeContext(xpathCtx);
  if (doc)
    xmlFreeDoc(doc);
  http_response_free(&resp);
  return rc;
}
|
||||||
|
|
||||||
|
/*
 * Release an array produced by scrape_images().
 *
 * Frees the four strings of each of the first `count` entries and then the
 * array itself. A NULL `results` pointer is ignored.
 */
void free_image_results(ImageResult *results, int count) {
  if (results == NULL)
    return;

  ImageResult *entry = results;
  for (int remaining = count; remaining > 0; --remaining, ++entry) {
    free(entry->thumbnail_url);
    free(entry->title);
    free(entry->page_url);
    free(entry->full_url);
  }

  free(results);
}
|
||||||
18
src/Scraping/ImageScraping.h
Normal file
18
src/Scraping/ImageScraping.h
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
#ifndef IMAGESCRAPING_H
|
||||||
|
#define IMAGESCRAPING_H
|
||||||
|
|
||||||
|
#include <curl/curl.h>
|
||||||
|
#include <libxml/HTMLparser.h>
|
||||||
|
|
||||||
|
/* One image hit returned by scrape_images(). Every member is a separately
 * heap-allocated string that free_image_results() releases. */
typedef struct {
|
||||||
|
/* "/proxy?url=<escaped src>" for the thumbnail, or the raw <img src> value
 * when the proxy URL could not be built. */
char *thumbnail_url;
|
||||||
|
/* Description text, else the title anchor's text, else "Image". */
char *title;
|
||||||
|
/* href of the title anchor, or "#" when there is none. */
char *page_url;
|
||||||
|
/* href of the thumbnail anchor, or "#" when there is none. */
char *full_url;
|
||||||
|
} ImageResult;
|
||||||
|
|
||||||
|
/* Fetch and parse one page of image results for `query`.
 * Returns 0 on success and -1 on failure. *out_results is reset to NULL and
 * *out_count to 0 before any work is done; on success they receive the
 * result array (which may be NULL when nothing matched) and its length. */
int scrape_images(const char *query, int page, ImageResult **out_results,
|
||||||
|
int *out_count);
|
||||||
|
/* Free an array obtained from scrape_images(); a NULL `results` is ignored. */
void free_image_results(ImageResult *results, int count);
|
||||||
|
|
||||||
|
#endif
|
||||||
@@ -1,395 +1,20 @@
|
|||||||
#include "Scraping.h"
|
#include "Scraping.h"
|
||||||
#include "../Cache/Cache.h"
|
#include "../Cache/Cache.h"
|
||||||
#include "../Proxy/Proxy.h"
|
#include "../Proxy/Proxy.h"
|
||||||
#include "../Utility/Unescape.h"
|
|
||||||
#include "../Utility/XmlHelper.h"
|
|
||||||
#include "Config.h"
|
#include "Config.h"
|
||||||
#include <curl/curl.h>
|
#include <curl/curl.h>
|
||||||
#include <libxml/HTMLparser.h>
|
#include <libxml/HTMLparser.h>
|
||||||
#include <libxml/xpath.h>
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
|
||||||
#include <time.h>
|
#include <time.h>
|
||||||
#include <unistd.h>
|
|
||||||
|
|
||||||
static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb,
|
int check_cache_for_job(ScrapeJob *job) {
|
||||||
void *userp) {
|
if (get_cache_ttl_search() <= 0)
|
||||||
size_t realsize = size * nmemb;
|
|
||||||
MemoryBuffer *mem = (MemoryBuffer *)userp;
|
|
||||||
|
|
||||||
if (mem->size + realsize + 1 > mem->capacity) {
|
|
||||||
size_t new_cap =
|
|
||||||
mem->capacity == 0 ? INITIAL_BUFFER_SIZE : mem->capacity * 2;
|
|
||||||
while (new_cap < mem->size + realsize + 1)
|
|
||||||
new_cap *= 2;
|
|
||||||
|
|
||||||
char *ptr = (char *)realloc(mem->memory, new_cap);
|
|
||||||
if (!ptr) {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
mem->memory = ptr;
|
|
||||||
mem->capacity = new_cap;
|
|
||||||
}
|
|
||||||
|
|
||||||
memcpy(&(mem->memory[mem->size]), contents, realsize);
|
|
||||||
mem->size += realsize;
|
|
||||||
mem->memory[mem->size] = 0;
|
|
||||||
|
|
||||||
return realsize;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Pick one of a fixed set of desktop browser User-Agent strings.
 *
 * Selection uses rand(), so the sequence depends on how (and whether) the
 * program seeded the generator. The returned pointer refers to static
 * storage and must not be freed.
 */
static const char *get_random_user_agent(void) {
  static const char *const agents[] = {
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, "
      "like Gecko) Chrome/120.0.0.0 Safari/537.36",
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
      "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
      /* The original literal carried a stray backtick after the Chrome
       * version, producing a malformed User-Agent header. */
      "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like "
      "Gecko) "
      "Chrome/120.0.0.0 Safari/537.36",
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 "
      "Firefox/121.0",
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 "
      "(KHTML, like Gecko) Version/17.2 Safari/605.1.15"};

  /* Derive the modulus from the table so the two cannot drift apart. */
  const size_t agent_count = sizeof(agents) / sizeof(agents[0]);
  return agents[(size_t)rand() % agent_count];
}
|
|
||||||
|
|
||||||
static int parse_ddg_lite(const char *engine_name, xmlDocPtr doc,
|
|
||||||
SearchResult **out_results, int max_results) {
|
|
||||||
(void)engine_name;
|
|
||||||
int found_count = 0;
|
|
||||||
|
|
||||||
xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
|
|
||||||
if (!xpathCtx) {
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
|
||||||
|
|
||||||
xmlXPathObjectPtr xpathObj = xml_xpath_eval(
|
|
||||||
xpathCtx, "//tr[not(contains(@class, "
|
|
||||||
"'result-sponsored'))]//a[@class='result-link']");
|
|
||||||
|
|
||||||
if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) {
|
|
||||||
if (xpathObj)
|
|
||||||
xmlXPathFreeObject(xpathObj);
|
|
||||||
xmlXPathFreeContext(xpathCtx);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
int num_links = xpathObj->nodesetval->nodeNr;
|
|
||||||
*out_results = xml_result_alloc(num_links, max_results);
|
|
||||||
if (!*out_results) {
|
|
||||||
xmlXPathFreeObject(xpathObj);
|
|
||||||
xmlXPathFreeContext(xpathCtx);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 0; i < num_links && found_count < max_results; i++) {
|
|
||||||
xmlNodePtr linkNode = xpathObj->nodesetval->nodeTab[i];
|
|
||||||
char *title = xml_node_content(linkNode);
|
|
||||||
char *url = (char *)xmlGetProp(linkNode, (xmlChar *)"href");
|
|
||||||
char *snippet_text = NULL;
|
|
||||||
|
|
||||||
xmlNodePtr current = linkNode->parent;
|
|
||||||
while (current && xmlStrcasecmp(current->name, (const xmlChar *)"tr") != 0)
|
|
||||||
current = current->parent;
|
|
||||||
|
|
||||||
if (current && current->next) {
|
|
||||||
xmlNodePtr snippetRow = current->next;
|
|
||||||
while (snippetRow &&
|
|
||||||
xmlStrcasecmp(snippetRow->name, (const xmlChar *)"tr") != 0)
|
|
||||||
snippetRow = snippetRow->next;
|
|
||||||
if (snippetRow) {
|
|
||||||
xpathCtx->node = snippetRow;
|
|
||||||
xmlXPathObjectPtr sObj =
|
|
||||||
xml_xpath_eval(xpathCtx, ".//td[@class='result-snippet']");
|
|
||||||
if (sObj && sObj->nodesetval && sObj->nodesetval->nodeNr > 0) {
|
|
||||||
snippet_text = xml_node_content(sObj->nodesetval->nodeTab[0]);
|
|
||||||
}
|
|
||||||
if (sObj)
|
|
||||||
xmlXPathFreeObject(sObj);
|
|
||||||
xpathCtx->node = NULL;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
(*out_results)[found_count].url = unescape_search_url(url);
|
|
||||||
(*out_results)[found_count].title = strdup(title ? title : "No Title");
|
|
||||||
(*out_results)[found_count].snippet =
|
|
||||||
strdup(snippet_text ? snippet_text : "");
|
|
||||||
found_count++;
|
|
||||||
|
|
||||||
if (title)
|
|
||||||
xmlFree(title);
|
|
||||||
if (url)
|
|
||||||
xmlFree(url);
|
|
||||||
if (snippet_text)
|
|
||||||
xmlFree(snippet_text);
|
|
||||||
}
|
|
||||||
|
|
||||||
xmlXPathFreeObject(xpathObj);
|
|
||||||
xmlXPathFreeContext(xpathCtx);
|
|
||||||
return found_count;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int parse_startpage(const char *engine_name, xmlDocPtr doc,
|
|
||||||
SearchResult **out_results, int max_results) {
|
|
||||||
(void)engine_name;
|
|
||||||
int found_count = 0;
|
|
||||||
|
|
||||||
xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
|
|
||||||
if (!xpathCtx) {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
xmlXPathObjectPtr xpathObj =
|
|
||||||
xml_xpath_eval(xpathCtx, "//div[contains(@class, 'result')]");
|
|
||||||
|
|
||||||
if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) {
|
|
||||||
if (xpathObj)
|
|
||||||
xmlXPathFreeObject(xpathObj);
|
|
||||||
xmlXPathFreeContext(xpathCtx);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
int num_results = xpathObj->nodesetval->nodeNr;
|
|
||||||
*out_results = xml_result_alloc(num_results, max_results);
|
|
||||||
if (!*out_results) {
|
|
||||||
xmlXPathFreeObject(xpathObj);
|
|
||||||
xmlXPathFreeContext(xpathCtx);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 0; i < num_results && found_count < max_results; i++) {
|
|
||||||
xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i];
|
|
||||||
xpathCtx->node = resultNode;
|
|
||||||
|
|
||||||
xmlXPathObjectPtr linkObj =
|
|
||||||
xml_xpath_eval(xpathCtx, ".//a[contains(@class, 'result-link')]");
|
|
||||||
char *url =
|
|
||||||
(linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0)
|
|
||||||
? (char *)xmlGetProp(linkObj->nodesetval->nodeTab[0],
|
|
||||||
(xmlChar *)"href")
|
|
||||||
: NULL;
|
|
||||||
|
|
||||||
xmlXPathObjectPtr titleObj =
|
|
||||||
xml_xpath_eval(xpathCtx, ".//h2[contains(@class, 'wgl-title')]");
|
|
||||||
char *title =
|
|
||||||
(titleObj && titleObj->nodesetval && titleObj->nodesetval->nodeNr > 0)
|
|
||||||
? xml_node_content(titleObj->nodesetval->nodeTab[0])
|
|
||||||
: NULL;
|
|
||||||
|
|
||||||
xmlXPathObjectPtr snippetObj =
|
|
||||||
xml_xpath_eval(xpathCtx, ".//p[contains(@class, 'description')]");
|
|
||||||
char *snippet_text =
|
|
||||||
(snippetObj && snippetObj->nodesetval &&
|
|
||||||
snippetObj->nodesetval->nodeNr > 0)
|
|
||||||
? xml_node_content(snippetObj->nodesetval->nodeTab[0])
|
|
||||||
: NULL;
|
|
||||||
|
|
||||||
if (url && title) {
|
|
||||||
(*out_results)[found_count].url = strdup(url);
|
|
||||||
(*out_results)[found_count].title = strdup(title);
|
|
||||||
(*out_results)[found_count].snippet =
|
|
||||||
strdup(snippet_text ? snippet_text : "");
|
|
||||||
found_count++;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (title)
|
|
||||||
xmlFree(title);
|
|
||||||
if (url)
|
|
||||||
xmlFree(url);
|
|
||||||
if (snippet_text)
|
|
||||||
xmlFree(snippet_text);
|
|
||||||
if (linkObj)
|
|
||||||
xmlXPathFreeObject(linkObj);
|
|
||||||
if (titleObj)
|
|
||||||
xmlXPathFreeObject(titleObj);
|
|
||||||
if (snippetObj)
|
|
||||||
xmlXPathFreeObject(snippetObj);
|
|
||||||
}
|
|
||||||
|
|
||||||
xpathCtx->node = NULL;
|
|
||||||
xmlXPathFreeObject(xpathObj);
|
|
||||||
xmlXPathFreeContext(xpathCtx);
|
|
||||||
return found_count;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int parse_yahoo(const char *engine_name, xmlDocPtr doc,
|
|
||||||
SearchResult **out_results, int max_results) {
|
|
||||||
(void)engine_name;
|
|
||||||
int found_count = 0;
|
|
||||||
|
|
||||||
xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
|
|
||||||
if (!xpathCtx) {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
xmlXPathObjectPtr xpathObj =
|
|
||||||
xml_xpath_eval(xpathCtx, "//div[contains(@class, 'algo-sr')]");
|
|
||||||
|
|
||||||
if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) {
|
|
||||||
if (xpathObj)
|
|
||||||
xmlXPathFreeObject(xpathObj);
|
|
||||||
xmlXPathFreeContext(xpathCtx);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
int num_results = xpathObj->nodesetval->nodeNr;
|
|
||||||
*out_results = xml_result_alloc(num_results, max_results);
|
|
||||||
if (!*out_results) {
|
|
||||||
xmlXPathFreeObject(xpathObj);
|
|
||||||
xmlXPathFreeContext(xpathCtx);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 0; i < num_results && found_count < max_results; i++) {
|
|
||||||
xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i];
|
|
||||||
xpathCtx->node = resultNode;
|
|
||||||
|
|
||||||
xmlXPathObjectPtr linkObj = xml_xpath_eval(
|
|
||||||
xpathCtx, ".//div[contains(@class, 'compTitle')]//a[@target='_blank']");
|
|
||||||
char *url =
|
|
||||||
(linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0)
|
|
||||||
? (char *)xmlGetProp(linkObj->nodesetval->nodeTab[0],
|
|
||||||
(xmlChar *)"href")
|
|
||||||
: NULL;
|
|
||||||
|
|
||||||
xmlXPathObjectPtr titleObj =
|
|
||||||
xml_xpath_eval(xpathCtx, ".//h3[contains(@class, 'title')]");
|
|
||||||
char *title =
|
|
||||||
(titleObj && titleObj->nodesetval && titleObj->nodesetval->nodeNr > 0)
|
|
||||||
? xml_node_content(titleObj->nodesetval->nodeTab[0])
|
|
||||||
: NULL;
|
|
||||||
|
|
||||||
xmlXPathObjectPtr snippetObj =
|
|
||||||
xml_xpath_eval(xpathCtx, ".//div[contains(@class, 'compText')]//p");
|
|
||||||
char *snippet_text =
|
|
||||||
(snippetObj && snippetObj->nodesetval &&
|
|
||||||
snippetObj->nodesetval->nodeNr > 0)
|
|
||||||
? xml_node_content(snippetObj->nodesetval->nodeTab[0])
|
|
||||||
: NULL;
|
|
||||||
|
|
||||||
if (url && title) {
|
|
||||||
(*out_results)[found_count].url = unescape_search_url(url);
|
|
||||||
(*out_results)[found_count].title = strdup(title);
|
|
||||||
(*out_results)[found_count].snippet =
|
|
||||||
strdup(snippet_text ? snippet_text : "");
|
|
||||||
found_count++;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (title)
|
|
||||||
xmlFree(title);
|
|
||||||
if (url)
|
|
||||||
xmlFree(url);
|
|
||||||
if (snippet_text)
|
|
||||||
xmlFree(snippet_text);
|
|
||||||
if (linkObj)
|
|
||||||
xmlXPathFreeObject(linkObj);
|
|
||||||
if (titleObj)
|
|
||||||
xmlXPathFreeObject(titleObj);
|
|
||||||
if (snippetObj)
|
|
||||||
xmlXPathFreeObject(snippetObj);
|
|
||||||
}
|
|
||||||
|
|
||||||
xpathCtx->node = NULL;
|
|
||||||
xmlXPathFreeObject(xpathObj);
|
|
||||||
xmlXPathFreeContext(xpathCtx);
|
|
||||||
return found_count;
|
|
||||||
}
|
|
||||||
|
|
||||||
const SearchEngine ENGINE_REGISTRY[] = {
|
|
||||||
{.name = "DuckDuckGo Lite",
|
|
||||||
.base_url = "https://lite.duckduckgo.com/lite/?q=",
|
|
||||||
.host_header = "lite.duckduckgo.com",
|
|
||||||
.referer = "https://lite.duckduckgo.com/",
|
|
||||||
.page_param = "s",
|
|
||||||
.page_multiplier = 30,
|
|
||||||
.page_base = 0,
|
|
||||||
.parser = parse_ddg_lite},
|
|
||||||
{.name = "Startpage",
|
|
||||||
.base_url = "https://www.startpage.com/sp/search?q=",
|
|
||||||
.host_header = "www.startpage.com",
|
|
||||||
.referer = "https://www.startpage.com/",
|
|
||||||
.page_param = "page",
|
|
||||||
.page_multiplier = 1,
|
|
||||||
.page_base = 1,
|
|
||||||
.parser = parse_startpage},
|
|
||||||
{.name = "Yahoo",
|
|
||||||
.base_url = "https://search.yahoo.com/search?p=",
|
|
||||||
.host_header = "search.yahoo.com",
|
|
||||||
.referer = "https://search.yahoo.com/",
|
|
||||||
.page_param = "b",
|
|
||||||
.page_multiplier = 10,
|
|
||||||
.page_base = 1,
|
|
||||||
.parser = parse_yahoo}};
|
|
||||||
|
|
||||||
const int ENGINE_COUNT = sizeof(ENGINE_REGISTRY) / sizeof(SearchEngine);
|
|
||||||
|
|
||||||
#define CURL_TIMEOUT 15L
|
|
||||||
#define CURL_DNS_TIMEOUT 300L
|
|
||||||
|
|
||||||
static void configure_curl_handle(CURL *curl, const char *full_url,
|
|
||||||
MemoryBuffer *chunk,
|
|
||||||
struct curl_slist *headers) {
|
|
||||||
curl_easy_setopt(curl, CURLOPT_URL, full_url);
|
|
||||||
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
|
|
||||||
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
|
|
||||||
curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)chunk);
|
|
||||||
curl_easy_setopt(curl, CURLOPT_USERAGENT, get_random_user_agent());
|
|
||||||
|
|
||||||
curl_easy_setopt(curl, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
|
|
||||||
curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "");
|
|
||||||
curl_easy_setopt(curl, CURLOPT_DNS_CACHE_TIMEOUT, CURL_DNS_TIMEOUT);
|
|
||||||
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
|
|
||||||
curl_easy_setopt(curl, CURLOPT_TIMEOUT, CURL_TIMEOUT);
|
|
||||||
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L);
|
|
||||||
curl_easy_setopt(curl, CURLOPT_COOKIEFILE, "");
|
|
||||||
|
|
||||||
apply_proxy_settings(curl);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Builds the request URL "<base_url><encoded_query>&<page_param>=<value>".
 *
 * The paging value is (page - 1) * page_multiplier + page_base, where page
 * is 1-based; any page below 1 is treated as the first page.
 *
 * The buffer is sized from the formatted length, so long queries are never
 * truncated.
 *
 * Returns a heap-allocated string the caller must free(), or NULL when
 * formatting or allocation fails.
 */
static char *build_search_url(const char *base_url, const char *page_param,
                              int page_multiplier, int page_base,
                              const char *encoded_query, int page) {
  /* Clamp to the first page; the previous expression mapped page < 1 to
   * index 1, i.e. the second page. */
  int page_index = page < 1 ? 0 : page - 1;
  int page_value = page_index * page_multiplier + page_base;

  /* First pass measures, second pass writes. */
  int needed = snprintf(NULL, 0, "%s%s&%s=%d", base_url, encoded_query,
                        page_param, page_value);
  if (needed < 0) {
    return NULL;
  }

  size_t url_size = (size_t)needed + 1;
  char *url = malloc(url_size);
  if (!url) {
    return NULL;
  }

  snprintf(url, url_size, "%s%s&%s=%d", base_url, encoded_query, page_param,
           page_value);
  return url;
}
|
|
||||||
|
|
||||||
static struct curl_slist *build_request_headers(const char *host_header,
|
|
||||||
const char *referer) {
|
|
||||||
struct curl_slist *headers = NULL;
|
|
||||||
char host_buf[BUFFER_SIZE_MEDIUM], ref_buf[BUFFER_SIZE_MEDIUM];
|
|
||||||
|
|
||||||
snprintf(host_buf, sizeof(host_buf), "Host: %s", host_header);
|
|
||||||
snprintf(ref_buf, sizeof(ref_buf), "Referer: %s", referer);
|
|
||||||
|
|
||||||
headers = curl_slist_append(headers, host_buf);
|
|
||||||
headers = curl_slist_append(headers, ref_buf);
|
|
||||||
headers = curl_slist_append(
|
|
||||||
headers,
|
|
||||||
"Accept: "
|
|
||||||
"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
|
|
||||||
headers = curl_slist_append(headers, "Accept-Language: en-US,en;q=0.5");
|
|
||||||
headers = curl_slist_append(headers, "DNT: 1");
|
|
||||||
|
|
||||||
return headers;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int check_cache_for_job(ScrapeJob *job) {
|
|
||||||
if (get_cache_ttl_search() <= 0) {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
char *key = cache_compute_key(job->query, job->page, job->engine->name);
|
char *key = cache_compute_key(job->query, job->page, job->engine->name);
|
||||||
if (!key) {
|
if (!key)
|
||||||
return 0;
|
return 0;
|
||||||
}
|
|
||||||
|
|
||||||
char *cached_data = NULL;
|
char *cached_data = NULL;
|
||||||
size_t cached_size = 0;
|
size_t cached_size = 0;
|
||||||
@@ -414,27 +39,31 @@ static int check_cache_for_job(ScrapeJob *job) {
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void process_job_response(ScrapeJob *job, CURL *handle, CURLMsg *msg) {
|
void parse_and_cache_response(ScrapeJob *job) {
|
||||||
if (msg->data.result == CURLE_OK && job->response.size > 0) {
|
if (job->response.size == 0) {
|
||||||
char *key = cache_compute_key(job->query, job->page, job->engine->name);
|
job->results_count = 0;
|
||||||
if (key && get_cache_ttl_search() > 0) {
|
return;
|
||||||
cache_set(key, job->response.memory, job->response.size);
|
}
|
||||||
free(key);
|
|
||||||
}
|
|
||||||
|
|
||||||
xmlDocPtr doc = htmlReadMemory(
|
char *key = cache_compute_key(job->query, job->page, job->engine->name);
|
||||||
job->response.memory, job->response.size, NULL, NULL,
|
if (key && get_cache_ttl_search() > 0)
|
||||||
HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
|
cache_set(key, job->response.memory, job->response.size);
|
||||||
|
free(key);
|
||||||
|
|
||||||
if (doc) {
|
xmlDocPtr doc = htmlReadMemory(
|
||||||
job->results_count = job->engine->parser(
|
job->response.memory, job->response.size, NULL, NULL,
|
||||||
job->engine->name, doc, job->out_results, job->max_results);
|
HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
|
||||||
xmlFreeDoc(doc);
|
|
||||||
}
|
if (doc) {
|
||||||
|
job->results_count = job->engine->parser(
|
||||||
|
job->engine->name, doc, job->out_results, job->max_results);
|
||||||
|
xmlFreeDoc(doc);
|
||||||
} else {
|
} else {
|
||||||
job->results_count = 0;
|
job->results_count = 0;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void cleanup_job_handle(ScrapeJob *job, CURL *handle) {
|
||||||
struct curl_slist *headers = NULL;
|
struct curl_slist *headers = NULL;
|
||||||
curl_easy_getinfo(handle, CURLINFO_PRIVATE, &headers);
|
curl_easy_getinfo(handle, CURLINFO_PRIVATE, &headers);
|
||||||
if (headers)
|
if (headers)
|
||||||
@@ -444,67 +73,112 @@ static void process_job_response(ScrapeJob *job, CURL *handle, CURLMsg *msg) {
|
|||||||
job->response.memory = NULL;
|
job->response.memory = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Finishes one completed transfer: a transfer that ended with CURLE_OK is
 * handed to parse_and_cache_response(); any other result leaves the job
 * with zero results. cleanup_job_handle() runs in both cases.
 */
void process_response(ScrapeJob *job, CURL *handle, CURLMsg *msg) {
  if (msg->data.result != CURLE_OK) {
    job->results_count = 0;
  } else {
    parse_and_cache_response(job);
  }

  cleanup_job_handle(job, handle);
}
|
||||||
|
|
||||||
|
/*
 * Prepares one scrape job and queues its transfer on multi_handle.
 *
 * Any handle or response buffer left over from a previous attempt is
 * released first. If check_cache_for_job() reports a hit, nothing is
 * queued.
 *
 * Returns 0 when the job was served from cache or its transfer was added
 * to multi_handle, and -1 on failure. On every -1 path job->handle is
 * NULL, so callers never see a pointer to an already-cleaned-up handle.
 */
int setup_job(ScrapeJob *job, CURLM *multi_handle) {
  /* Reset the pointers after releasing them: the early returns below would
   * otherwise leave dangling values that callers clean up a second time. */
  if (job->handle) {
    curl_easy_cleanup(job->handle);
    job->handle = NULL;
  }
  if (job->response.memory) {
    free(job->response.memory);
    job->response.memory = NULL;
  }

  if (check_cache_for_job(job)) {
    job->results_count = job->results_count > 0 ? job->results_count : 0;
    return 0;
  }

  char *encoded_query = curl_easy_escape(NULL, job->query, 0);
  if (!encoded_query)
    return -1;

  char *full_url =
      build_search_url(job->engine->base_url, job->engine->page_param,
                       job->engine->page_multiplier, job->engine->page_base,
                       encoded_query, job->page);
  /* Strings returned by curl_easy_escape() must be released with
   * curl_free(), not free(). */
  curl_free(encoded_query);

  if (!full_url)
    return -1;

  job->handle = curl_easy_init();
  if (!job->handle) {
    free(full_url);
    return -1;
  }

  job->response.memory = malloc(INITIAL_BUFFER_SIZE);
  job->response.size = 0;
  if (!job->response.memory) {
    /* Without a buffer the write callback would copy into NULL. */
    job->response.capacity = 0;
    curl_easy_cleanup(job->handle);
    job->handle = NULL;
    free(full_url);
    return -1;
  }
  job->response.capacity = INITIAL_BUFFER_SIZE;

  struct curl_slist *headers =
      build_request_headers(job->engine->host_header, job->engine->referer);

  configure_curl_handle(job->handle, full_url, &job->response, headers);
  /* The header list travels with the handle so it can be fetched through
   * CURLINFO_PRIVATE when the transfer is cleaned up. */
  curl_easy_setopt(job->handle, CURLOPT_PRIVATE, headers);

  free(full_url);

  if (curl_multi_add_handle(multi_handle, job->handle) != CURLM_OK) {
    /* The transfer will never complete, so nothing else would release
     * these resources. */
    curl_slist_free_all(headers);
    curl_easy_cleanup(job->handle);
    job->handle = NULL;
    free(job->response.memory);
    job->response.memory = NULL;
    job->response.capacity = 0;
    return -1;
  }

  return 0;
}
|
||||||
|
|
||||||
|
/*
 * Drains the message queue of multi_handle. For every CURLMSG_DONE message
 * whose easy handle belongs to one of the jobs, the response is processed,
 * the handle is removed from the multi handle and cleaned up, and the
 * job's handle pointer is cleared.
 *
 * Always returns 0.
 */
int handle_responses(CURLM *multi_handle, ScrapeJob *jobs, int num_jobs) {
  int pending = 0;

  for (CURLMsg *info = curl_multi_info_read(multi_handle, &pending);
       info != NULL; info = curl_multi_info_read(multi_handle, &pending)) {
    if (info->msg != CURLMSG_DONE) {
      continue;
    }

    CURL *finished = info->easy_handle;

    /* Find the job that owns this easy handle. */
    ScrapeJob *owner = NULL;
    for (int j = 0; j < num_jobs && owner == NULL; j++) {
      if (jobs[j].handle && jobs[j].handle == finished) {
        owner = &jobs[j];
      }
    }
    if (owner == NULL) {
      continue;
    }

    /* Process first: the message is consumed before the handle is removed. */
    process_response(owner, finished, info);
    curl_multi_remove_handle(multi_handle, finished);
    curl_easy_cleanup(finished);
    owner->handle = NULL;
  }

  return 0;
}
|
||||||
|
|
||||||
|
/*
 * Decides whether another scrape round is worthwhile.
 *
 * Returns 1 when proxy_count is positive and at least one job ended with
 * neither results nor any response bytes; otherwise 0.
 */
int should_retry(ScrapeJob *jobs, int num_jobs) {
  int retry = 0;

  if (proxy_count > 0) {
    for (int k = 0; k < num_jobs && !retry; k++) {
      retry = (jobs[k].results_count == 0 && jobs[k].response.size == 0);
    }
  }

  return retry;
}
|
||||||
|
|
||||||
int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) {
|
int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) {
|
||||||
int retries = 0;
|
int retries = 0;
|
||||||
|
|
||||||
retry:
|
retry:
|
||||||
CURLM *multi_handle = curl_multi_init();
|
CURLM *multi_handle = curl_multi_init();
|
||||||
if (!multi_handle) {
|
if (!multi_handle)
|
||||||
return -1;
|
return -1;
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 0; i < num_jobs; i++) {
|
for (int i = 0; i < num_jobs; i++) {
|
||||||
ScrapeJob *job = &jobs[i];
|
if (setup_job(&jobs[i], multi_handle) != 0 && jobs[i].handle) {
|
||||||
|
curl_multi_remove_handle(multi_handle, jobs[i].handle);
|
||||||
if (job->handle) {
|
curl_easy_cleanup(jobs[i].handle);
|
||||||
curl_easy_cleanup(job->handle);
|
jobs[i].handle = NULL;
|
||||||
job->handle = NULL;
|
|
||||||
}
|
}
|
||||||
if (job->response.memory) {
|
|
||||||
free(job->response.memory);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (check_cache_for_job(job)) {
|
|
||||||
job->results_count = job->results_count > 0 ? job->results_count : 0;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
char *encoded_query = curl_easy_escape(NULL, job->query, 0);
|
|
||||||
if (!encoded_query) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
char *full_url =
|
|
||||||
build_search_url(job->engine->base_url, job->engine->page_param,
|
|
||||||
job->engine->page_multiplier, job->engine->page_base,
|
|
||||||
encoded_query, job->page);
|
|
||||||
free(encoded_query);
|
|
||||||
|
|
||||||
if (!full_url) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
job->handle = curl_easy_init();
|
|
||||||
if (!job->handle) {
|
|
||||||
free(full_url);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
job->response.memory = (char *)malloc(INITIAL_BUFFER_SIZE);
|
|
||||||
job->response.size = 0;
|
|
||||||
job->response.capacity = INITIAL_BUFFER_SIZE;
|
|
||||||
|
|
||||||
struct curl_slist *headers =
|
|
||||||
build_request_headers(job->engine->host_header, job->engine->referer);
|
|
||||||
|
|
||||||
configure_curl_handle(job->handle, full_url, &job->response, headers);
|
|
||||||
curl_easy_setopt(job->handle, CURLOPT_PRIVATE, headers);
|
|
||||||
|
|
||||||
free(full_url);
|
|
||||||
curl_multi_add_handle(multi_handle, job->handle);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
usleep(100000 + (rand() % 100000));
|
http_delay();
|
||||||
|
|
||||||
int still_running = 0;
|
int still_running = 0;
|
||||||
curl_multi_perform(multi_handle, &still_running);
|
curl_multi_perform(multi_handle, &still_running);
|
||||||
@@ -512,50 +186,17 @@ retry:
|
|||||||
do {
|
do {
|
||||||
int numfds = 0;
|
int numfds = 0;
|
||||||
CURLMcode mc = curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds);
|
CURLMcode mc = curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds);
|
||||||
|
if (mc != CURLM_OK)
|
||||||
if (mc != CURLM_OK) {
|
|
||||||
break;
|
break;
|
||||||
}
|
|
||||||
|
|
||||||
curl_multi_perform(multi_handle, &still_running);
|
curl_multi_perform(multi_handle, &still_running);
|
||||||
} while (still_running);
|
} while (still_running);
|
||||||
|
|
||||||
CURLMsg *msg;
|
handle_responses(multi_handle, jobs, num_jobs);
|
||||||
int msgs_left;
|
|
||||||
while ((msg = curl_multi_info_read(multi_handle, &msgs_left))) {
|
|
||||||
if (msg->msg == CURLMSG_DONE) {
|
|
||||||
CURL *handle = msg->easy_handle;
|
|
||||||
|
|
||||||
for (int i = 0; i < num_jobs; i++) {
|
|
||||||
if (jobs[i].handle && jobs[i].handle == handle) {
|
|
||||||
ScrapeJob *job = &jobs[i];
|
|
||||||
|
|
||||||
process_job_response(job, handle, msg);
|
|
||||||
|
|
||||||
curl_multi_remove_handle(multi_handle, handle);
|
|
||||||
if (handle)
|
|
||||||
curl_easy_cleanup(handle);
|
|
||||||
job->handle = NULL;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
curl_multi_cleanup(multi_handle);
|
curl_multi_cleanup(multi_handle);
|
||||||
|
|
||||||
if (retries < max_proxy_retries && proxy_count > 0) {
|
if (retries < max_proxy_retries && should_retry(jobs, num_jobs)) {
|
||||||
int any_failed = 0;
|
retries++;
|
||||||
for (int i = 0; i < num_jobs; i++) {
|
goto retry;
|
||||||
if (jobs[i].results_count == 0 && jobs[i].response.size == 0) {
|
|
||||||
any_failed = 1;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (any_failed) {
|
|
||||||
retries++;
|
|
||||||
goto retry;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
|||||||
@@ -3,6 +3,7 @@
|
|||||||
|
|
||||||
#include <curl/curl.h>
|
#include <curl/curl.h>
|
||||||
#include <libxml/HTMLparser.h>
|
#include <libxml/HTMLparser.h>
|
||||||
|
#include <libxml/xpath.h>
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
char *url;
|
char *url;
|
||||||
@@ -45,6 +46,25 @@ typedef struct {
|
|||||||
extern const SearchEngine ENGINE_REGISTRY[];
|
extern const SearchEngine ENGINE_REGISTRY[];
|
||||||
extern const int ENGINE_COUNT;
|
extern const int ENGINE_COUNT;
|
||||||
|
|
||||||
|
size_t write_memory_callback(void *contents, size_t size, size_t nmemb,
|
||||||
|
void *userp);
|
||||||
|
const char *get_random_user_agent(void);
|
||||||
|
void configure_curl_handle(CURL *curl, const char *full_url,
|
||||||
|
MemoryBuffer *chunk, struct curl_slist *headers);
|
||||||
|
char *build_search_url(const char *base_url, const char *page_param,
|
||||||
|
int page_multiplier, int page_base,
|
||||||
|
const char *encoded_query, int page);
|
||||||
|
struct curl_slist *build_request_headers(const char *host_header,
|
||||||
|
const char *referer);
|
||||||
|
void http_delay(void);
|
||||||
|
|
||||||
|
xmlXPathContextPtr create_xpath_context(xmlDocPtr doc);
|
||||||
|
void free_xpath_objects(xmlXPathContextPtr ctx, xmlXPathObjectPtr obj);
|
||||||
|
SearchResult *alloc_results_array(int capacity, int max_results);
|
||||||
|
void assign_result(SearchResult *result, char *url, char *title, char *snippet,
|
||||||
|
int unescape);
|
||||||
|
void free_xml_node_list(char *title, char *url, char *snippet);
|
||||||
|
|
||||||
int scrape_engine(const SearchEngine *engine, const char *query,
|
int scrape_engine(const SearchEngine *engine, const char *query,
|
||||||
SearchResult **out_results, int max_results);
|
SearchResult **out_results, int max_results);
|
||||||
|
|
||||||
|
|||||||
109
src/Scraping/ScrapingHttp.c
Normal file
109
src/Scraping/ScrapingHttp.c
Normal file
@@ -0,0 +1,109 @@
|
|||||||
|
#include "../Proxy/Proxy.h"
|
||||||
|
#include "Config.h"
|
||||||
|
#include "Scraping.h"
|
||||||
|
#include <curl/curl.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
|
||||||
|
/* Bounds, in microseconds, of the randomized delay used by http_delay():
 * the sleep lasts HTTP_DELAY_MIN_US plus a value below HTTP_DELAY_RANGE_US. */
#define HTTP_DELAY_MIN_US 100000
#define HTTP_DELAY_RANGE_US 100000

/* Browser User-Agent strings that get_random_user_agent() picks from. */
static const char *USER_AGENTS[] = {
    /* Chrome 120, Windows */
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36",
    /* Chrome 119, macOS */
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/119.0.0.0 Safari/537.36",
    /* Chrome 120, Linux */
    "Mozilla/5.0 (X11; Linux x86_64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36",
    /* Firefox 121, Windows */
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) "
    "Gecko/20100101 Firefox/121.0",
    /* Safari 17.2, macOS */
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/605.1.15 (KHTML, like Gecko) "
    "Version/17.2 Safari/605.1.15",
};

/* Number of entries in USER_AGENTS. */
#define USER_AGENT_COUNT (sizeof(USER_AGENTS) / sizeof(USER_AGENTS[0]))
|
||||||
|
|
||||||
|
size_t write_memory_callback(void *contents, size_t size, size_t nmemb,
|
||||||
|
void *userp) {
|
||||||
|
size_t realsize = size * nmemb;
|
||||||
|
MemoryBuffer *mem = (MemoryBuffer *)userp;
|
||||||
|
|
||||||
|
if (mem->size + realsize + 1 > mem->capacity) {
|
||||||
|
size_t new_cap =
|
||||||
|
mem->capacity == 0 ? INITIAL_BUFFER_SIZE : mem->capacity * 2;
|
||||||
|
while (new_cap < mem->size + realsize + 1)
|
||||||
|
new_cap *= 2;
|
||||||
|
|
||||||
|
char *ptr = (char *)realloc(mem->memory, new_cap);
|
||||||
|
if (!ptr)
|
||||||
|
return 0;
|
||||||
|
mem->memory = ptr;
|
||||||
|
mem->capacity = new_cap;
|
||||||
|
}
|
||||||
|
|
||||||
|
memcpy(&(mem->memory[mem->size]), contents, realsize);
|
||||||
|
mem->size += realsize;
|
||||||
|
mem->memory[mem->size] = 0;
|
||||||
|
|
||||||
|
return realsize;
|
||||||
|
}
|
||||||
|
|
||||||
|
const char *get_random_user_agent(void) {
|
||||||
|
return USER_AGENTS[rand() % USER_AGENT_COUNT];
|
||||||
|
}
|
||||||
|
|
||||||
|
void configure_curl_handle(CURL *curl, const char *full_url,
|
||||||
|
MemoryBuffer *chunk, struct curl_slist *headers) {
|
||||||
|
curl_easy_setopt(curl, CURLOPT_URL, full_url);
|
||||||
|
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
|
||||||
|
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_memory_callback);
|
||||||
|
curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)chunk);
|
||||||
|
curl_easy_setopt(curl, CURLOPT_USERAGENT, get_random_user_agent());
|
||||||
|
|
||||||
|
curl_easy_setopt(curl, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
|
||||||
|
curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "");
|
||||||
|
curl_easy_setopt(curl, CURLOPT_DNS_CACHE_TIMEOUT, CURL_DNS_TIMEOUT_SECS);
|
||||||
|
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
|
||||||
|
curl_easy_setopt(curl, CURLOPT_TIMEOUT, CURL_TIMEOUT_SECS);
|
||||||
|
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L);
|
||||||
|
curl_easy_setopt(curl, CURLOPT_COOKIEFILE, "");
|
||||||
|
|
||||||
|
apply_proxy_settings(curl);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Builds the request URL "<base_url><encoded_query>&<page_param>=<value>".
 *
 * The paging value is (page - 1) * page_multiplier + page_base, where page
 * is 1-based; any page below 1 is treated as the first page.
 *
 * The buffer is sized from the formatted length, so long queries are never
 * truncated.
 *
 * Returns a heap-allocated string the caller must free(), or NULL when
 * formatting or allocation fails.
 */
char *build_search_url(const char *base_url, const char *page_param,
                       int page_multiplier, int page_base,
                       const char *encoded_query, int page) {
  /* Clamp to the first page; the previous expression mapped page < 1 to
   * index 1, i.e. the second page. */
  int page_index = page < 1 ? 0 : page - 1;
  int page_value = page_index * page_multiplier + page_base;

  /* First pass measures, second pass writes. */
  int needed = snprintf(NULL, 0, "%s%s&%s=%d", base_url, encoded_query,
                        page_param, page_value);
  if (needed < 0)
    return NULL;

  size_t url_size = (size_t)needed + 1;
  char *url = malloc(url_size);
  if (!url)
    return NULL;

  snprintf(url, url_size, "%s%s&%s=%d", base_url, encoded_query, page_param,
           page_value);
  return url;
}
|
||||||
|
|
||||||
|
struct curl_slist *build_request_headers(const char *host_header,
|
||||||
|
const char *referer) {
|
||||||
|
struct curl_slist *headers = NULL;
|
||||||
|
char host_buf[BUFFER_SIZE_MEDIUM], ref_buf[BUFFER_SIZE_MEDIUM];
|
||||||
|
|
||||||
|
snprintf(host_buf, sizeof(host_buf), "Host: %s", host_header);
|
||||||
|
snprintf(ref_buf, sizeof(ref_buf), "Referer: %s", referer);
|
||||||
|
|
||||||
|
headers = curl_slist_append(headers, host_buf);
|
||||||
|
headers = curl_slist_append(headers, ref_buf);
|
||||||
|
headers = curl_slist_append(
|
||||||
|
headers,
|
||||||
|
"Accept: "
|
||||||
|
"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
|
||||||
|
headers = curl_slist_append(headers, "Accept-Language: en-US,en;q=0.5");
|
||||||
|
headers = curl_slist_append(headers, "DNT: 1");
|
||||||
|
|
||||||
|
return headers;
|
||||||
|
}
|
||||||
|
|
||||||
|
void http_delay(void) {
|
||||||
|
usleep(HTTP_DELAY_MIN_US + (rand() % HTTP_DELAY_RANGE_US));
|
||||||
|
}
|
||||||
269
src/Scraping/ScrapingParsers.c
Normal file
269
src/Scraping/ScrapingParsers.c
Normal file
@@ -0,0 +1,269 @@
|
|||||||
|
#include "../Utility/Unescape.h"
|
||||||
|
#include "../Utility/XmlHelper.h"
|
||||||
|
#include "Config.h"
|
||||||
|
#include "Scraping.h"
|
||||||
|
#include <libxml/HTMLparser.h>
|
||||||
|
#include <libxml/xpath.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
/* Creates an XPath evaluation context for doc; returns whatever
 * xmlXPathNewContext() returns. */
xmlXPathContextPtr create_xpath_context(xmlDocPtr doc) {
  xmlXPathContextPtr created = xmlXPathNewContext(doc);
  return created;
}
|
||||||
|
|
||||||
|
/* Releases an XPath result object and then its context; either argument
 * may be NULL. */
void free_xpath_objects(xmlXPathContextPtr ctx, xmlXPathObjectPtr obj) {
  if (obj != NULL) {
    xmlXPathFreeObject(obj);
  }
  if (ctx != NULL) {
    xmlXPathFreeContext(ctx);
  }
}
|
||||||
|
|
||||||
|
/* Allocates a result array by passing capacity and the smaller of
 * capacity and max_results to xml_result_alloc(). */
SearchResult *alloc_results_array(int capacity, int max_results) {
  int limit = max_results;
  if (capacity < max_results) {
    limit = capacity;
  }
  return xml_result_alloc(capacity, limit);
}
|
||||||
|
|
||||||
|
/*
 * Fills one SearchResult with heap copies of the given strings.
 *
 * url       passed to unescape_search_url() when unescape is non-zero;
 *           otherwise duplicated, with NULL becoming "".
 * title     duplicated; NULL becomes "No Title".
 * snippet   duplicated; NULL becomes "".
 */
void assign_result(SearchResult *result, char *url, char *title, char *snippet,
                   int unescape) {
  if (unescape) {
    result->url = unescape_search_url(url);
  } else {
    result->url = strdup(url ? url : "");
  }

  const char *title_src = title ? title : "No Title";
  const char *snippet_src = snippet ? snippet : "";
  result->title = strdup(title_src);
  result->snippet = strdup(snippet_src);
}
|
||||||
|
|
||||||
|
/* Releases up to three strings with xmlFree(), in the order title, url,
 * snippet; NULL arguments are skipped. */
void free_xml_node_list(char *title, char *url, char *snippet) {
  char *owned[] = {title, url, snippet};

  for (size_t i = 0; i < sizeof(owned) / sizeof(owned[0]); i++) {
    if (owned[i] != NULL) {
      xmlFree(owned[i]);
    }
  }
}
|
||||||
|
|
||||||
|
static int parse_ddg_lite(const char *engine_name, xmlDocPtr doc,
|
||||||
|
SearchResult **out_results, int max_results) {
|
||||||
|
(void)engine_name;
|
||||||
|
int found_count = 0;
|
||||||
|
|
||||||
|
xmlXPathContextPtr ctx = create_xpath_context(doc);
|
||||||
|
if (!ctx)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
xmlXPathObjectPtr obj =
|
||||||
|
xml_xpath_eval(ctx, "//tr[not(contains(@class, "
|
||||||
|
"'result-sponsored'))]//a[@class='result-link']");
|
||||||
|
|
||||||
|
if (!obj || !obj->nodesetval || obj->nodesetval->nodeNr == 0) {
|
||||||
|
free_xpath_objects(ctx, obj);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int num_links = obj->nodesetval->nodeNr;
|
||||||
|
*out_results = alloc_results_array(num_links, max_results);
|
||||||
|
if (!*out_results) {
|
||||||
|
free_xpath_objects(ctx, obj);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < num_links && found_count < max_results; i++) {
|
||||||
|
xmlNodePtr link_node = obj->nodesetval->nodeTab[i];
|
||||||
|
char *title = xml_node_content(link_node);
|
||||||
|
char *url = (char *)xmlGetProp(link_node, (xmlChar *)"href");
|
||||||
|
char *snippet_text = NULL;
|
||||||
|
|
||||||
|
xmlNodePtr current = link_node->parent;
|
||||||
|
while (current && xmlStrcasecmp(current->name, (const xmlChar *)"tr") != 0)
|
||||||
|
current = current->parent;
|
||||||
|
|
||||||
|
if (current && current->next) {
|
||||||
|
xmlNodePtr snippet_row = current->next;
|
||||||
|
while (snippet_row &&
|
||||||
|
xmlStrcasecmp(snippet_row->name, (const xmlChar *)"tr") != 0)
|
||||||
|
snippet_row = snippet_row->next;
|
||||||
|
if (snippet_row) {
|
||||||
|
ctx->node = snippet_row;
|
||||||
|
xmlXPathObjectPtr s_obj =
|
||||||
|
xml_xpath_eval(ctx, ".//td[@class='result-snippet']");
|
||||||
|
if (s_obj && s_obj->nodesetval && s_obj->nodesetval->nodeNr > 0)
|
||||||
|
snippet_text = xml_node_content(s_obj->nodesetval->nodeTab[0]);
|
||||||
|
if (s_obj)
|
||||||
|
xmlXPathFreeObject(s_obj);
|
||||||
|
ctx->node = NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
assign_result(&(*out_results)[found_count], url, title, snippet_text, 1);
|
||||||
|
free_xml_node_list(title, url, snippet_text);
|
||||||
|
found_count++;
|
||||||
|
}
|
||||||
|
|
||||||
|
free_xpath_objects(ctx, obj);
|
||||||
|
return found_count;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int parse_startpage(const char *engine_name, xmlDocPtr doc,
|
||||||
|
SearchResult **out_results, int max_results) {
|
||||||
|
(void)engine_name;
|
||||||
|
int found_count = 0;
|
||||||
|
|
||||||
|
xmlXPathContextPtr ctx = create_xpath_context(doc);
|
||||||
|
if (!ctx)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
xmlXPathObjectPtr obj =
|
||||||
|
xml_xpath_eval(ctx, "//div[contains(@class, 'result')]");
|
||||||
|
|
||||||
|
if (!obj || !obj->nodesetval || obj->nodesetval->nodeNr == 0) {
|
||||||
|
free_xpath_objects(ctx, obj);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int num_results = obj->nodesetval->nodeNr;
|
||||||
|
*out_results = alloc_results_array(num_results, max_results);
|
||||||
|
if (!*out_results) {
|
||||||
|
free_xpath_objects(ctx, obj);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < num_results && found_count < max_results; i++) {
|
||||||
|
xmlNodePtr result_node = obj->nodesetval->nodeTab[i];
|
||||||
|
ctx->node = result_node;
|
||||||
|
|
||||||
|
xmlXPathObjectPtr link_obj =
|
||||||
|
xml_xpath_eval(ctx, ".//a[contains(@class, 'result-link')]");
|
||||||
|
char *url =
|
||||||
|
(link_obj && link_obj->nodesetval && link_obj->nodesetval->nodeNr > 0)
|
||||||
|
? (char *)xmlGetProp(link_obj->nodesetval->nodeTab[0],
|
||||||
|
(xmlChar *)"href")
|
||||||
|
: NULL;
|
||||||
|
|
||||||
|
xmlXPathObjectPtr title_obj =
|
||||||
|
xml_xpath_eval(ctx, ".//h2[contains(@class, 'wgl-title')]");
|
||||||
|
char *title = (title_obj && title_obj->nodesetval &&
|
||||||
|
title_obj->nodesetval->nodeNr > 0)
|
||||||
|
? xml_node_content(title_obj->nodesetval->nodeTab[0])
|
||||||
|
: NULL;
|
||||||
|
|
||||||
|
xmlXPathObjectPtr snippet_obj =
|
||||||
|
xml_xpath_eval(ctx, ".//p[contains(@class, 'description')]");
|
||||||
|
char *snippet_text =
|
||||||
|
(snippet_obj && snippet_obj->nodesetval &&
|
||||||
|
snippet_obj->nodesetval->nodeNr > 0)
|
||||||
|
? xml_node_content(snippet_obj->nodesetval->nodeTab[0])
|
||||||
|
: NULL;
|
||||||
|
|
||||||
|
if (url && title) {
|
||||||
|
assign_result(&(*out_results)[found_count], url, title, snippet_text, 0);
|
||||||
|
found_count++;
|
||||||
|
}
|
||||||
|
|
||||||
|
free_xml_node_list(title, url, snippet_text);
|
||||||
|
if (link_obj)
|
||||||
|
xmlXPathFreeObject(link_obj);
|
||||||
|
if (title_obj)
|
||||||
|
xmlXPathFreeObject(title_obj);
|
||||||
|
if (snippet_obj)
|
||||||
|
xmlXPathFreeObject(snippet_obj);
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx->node = NULL;
|
||||||
|
free_xpath_objects(ctx, obj);
|
||||||
|
return found_count;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int parse_yahoo(const char *engine_name, xmlDocPtr doc,
|
||||||
|
SearchResult **out_results, int max_results) {
|
||||||
|
(void)engine_name;
|
||||||
|
int found_count = 0;
|
||||||
|
|
||||||
|
xmlXPathContextPtr ctx = create_xpath_context(doc);
|
||||||
|
if (!ctx)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
xmlXPathObjectPtr obj =
|
||||||
|
xml_xpath_eval(ctx, "//div[contains(@class, 'algo-sr')]");
|
||||||
|
|
||||||
|
if (!obj || !obj->nodesetval || obj->nodesetval->nodeNr == 0) {
|
||||||
|
free_xpath_objects(ctx, obj);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int num_results = obj->nodesetval->nodeNr;
|
||||||
|
*out_results = alloc_results_array(num_results, max_results);
|
||||||
|
if (!*out_results) {
|
||||||
|
free_xpath_objects(ctx, obj);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < num_results && found_count < max_results; i++) {
|
||||||
|
xmlNodePtr result_node = obj->nodesetval->nodeTab[i];
|
||||||
|
ctx->node = result_node;
|
||||||
|
|
||||||
|
xmlXPathObjectPtr link_obj = xml_xpath_eval(
|
||||||
|
ctx, ".//div[contains(@class, 'compTitle')]//a[@target='_blank']");
|
||||||
|
char *url =
|
||||||
|
(link_obj && link_obj->nodesetval && link_obj->nodesetval->nodeNr > 0)
|
||||||
|
? (char *)xmlGetProp(link_obj->nodesetval->nodeTab[0],
|
||||||
|
(xmlChar *)"href")
|
||||||
|
: NULL;
|
||||||
|
|
||||||
|
xmlXPathObjectPtr title_obj =
|
||||||
|
xml_xpath_eval(ctx, ".//h3[contains(@class, 'title')]");
|
||||||
|
char *title = (title_obj && title_obj->nodesetval &&
|
||||||
|
title_obj->nodesetval->nodeNr > 0)
|
||||||
|
? xml_node_content(title_obj->nodesetval->nodeTab[0])
|
||||||
|
: NULL;
|
||||||
|
|
||||||
|
xmlXPathObjectPtr snippet_obj =
|
||||||
|
xml_xpath_eval(ctx, ".//div[contains(@class, 'compText')]//p");
|
||||||
|
char *snippet_text =
|
||||||
|
(snippet_obj && snippet_obj->nodesetval &&
|
||||||
|
snippet_obj->nodesetval->nodeNr > 0)
|
||||||
|
? xml_node_content(snippet_obj->nodesetval->nodeTab[0])
|
||||||
|
: NULL;
|
||||||
|
|
||||||
|
if (url && title) {
|
||||||
|
assign_result(&(*out_results)[found_count], url, title, snippet_text, 1);
|
||||||
|
found_count++;
|
||||||
|
}
|
||||||
|
|
||||||
|
free_xml_node_list(title, url, snippet_text);
|
||||||
|
if (link_obj)
|
||||||
|
xmlXPathFreeObject(link_obj);
|
||||||
|
if (title_obj)
|
||||||
|
xmlXPathFreeObject(title_obj);
|
||||||
|
if (snippet_obj)
|
||||||
|
xmlXPathFreeObject(snippet_obj);
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx->node = NULL;
|
||||||
|
free_xpath_objects(ctx, obj);
|
||||||
|
return found_count;
|
||||||
|
}
|
||||||
|
|
||||||
|
const SearchEngine ENGINE_REGISTRY[] = {
|
||||||
|
{.name = "DuckDuckGo Lite",
|
||||||
|
.base_url = "https://lite.duckduckgo.com/lite/?q=",
|
||||||
|
.host_header = "lite.duckduckgo.com",
|
||||||
|
.referer = "https://lite.duckduckgo.com/",
|
||||||
|
.page_param = "s",
|
||||||
|
.page_multiplier = 30,
|
||||||
|
.page_base = 0,
|
||||||
|
.parser = parse_ddg_lite},
|
||||||
|
{.name = "Startpage",
|
||||||
|
.base_url = "https://www.startpage.com/sp/search?q=",
|
||||||
|
.host_header = "www.startpage.com",
|
||||||
|
.referer = "https://www.startpage.com/",
|
||||||
|
.page_param = "page",
|
||||||
|
.page_multiplier = 1,
|
||||||
|
.page_base = 1,
|
||||||
|
.parser = parse_startpage},
|
||||||
|
{.name = "Yahoo",
|
||||||
|
.base_url = "https://search.yahoo.com/search?p=",
|
||||||
|
.host_header = "search.yahoo.com",
|
||||||
|
.referer = "https://search.yahoo.com/",
|
||||||
|
.page_param = "b",
|
||||||
|
.page_multiplier = 10,
|
||||||
|
.page_base = 1,
|
||||||
|
.parser = parse_yahoo}};
|
||||||
|
|
||||||
|
const int ENGINE_COUNT = sizeof(ENGINE_REGISTRY) / sizeof(SearchEngine);
|
||||||
Reference in New Issue
Block a user