Files
AdPlatform-Server/Creative/Services/ScraperService.cs
2026-03-14 13:50:09 -07:00

162 lines
5.2 KiB
C#

using Creative.Configuration;
using Creative.Models;
using HtmlAgilityPack;
using Microsoft.Extensions.Options;
namespace Creative.Services;
/// <summary>
/// Scrapes a URL and extracts structured business data (title, meta description,
/// headings and a short body snippet).
/// Supports emulated mode for development without network calls.
/// </summary>
public class ScraperService
{
    // Extraction limits used by the real scraper.
    private const int MaxHeadingsPerLevel = 5;
    private const int MinParagraphLength = 30;   // paragraphs must be strictly longer than this
    private const int MaxSnippetParagraphs = 3;
    private const int MaxSnippetLength = 500;
    private const int MaxLoggedTitleLength = 40;

    // Defaults used by the emulated analysis.
    private const string DefaultEmulatedDomain = "example.com";
    private const string WwwPrefix = "www.";

    private static readonly string[] HeadingTags = { "h1", "h2", "h3" };

    private readonly CreativeConfig _config;
    private readonly IHttpClientFactory _httpFactory;
    private readonly ILogger<ScraperService> _logger;

    /// <summary>
    /// Creates the service from its injected dependencies.
    /// </summary>
    /// <param name="config">Options wrapper; its <c>Value</c> is captured once at construction.</param>
    /// <param name="httpFactory">Factory used to create the HTTP client for real scrapes.</param>
    /// <param name="logger">Logger for scrape diagnostics.</param>
    public ScraperService(
        IOptions<CreativeConfig> config,
        IHttpClientFactory httpFactory,
        ILogger<ScraperService> logger)
    {
        _config = config.Value;
        _httpFactory = httpFactory;
        _logger = logger;
    }

    /// <summary>
    /// Analyze a URL - scrape and extract structured content.
    /// </summary>
    /// <param name="url">The URL to analyze.</param>
    /// <param name="ct">Cancellation token; only used when a real HTTP fetch is performed.</param>
    /// <returns>
    /// A synthetic analysis when <c>EnableRealApi</c> is false in the configuration,
    /// otherwise the result of fetching and parsing the page.
    /// </returns>
    public async Task<UrlAnalysis> AnalyzeUrlAsync(string url, CancellationToken ct)
    {
        if (!_config.EnableRealApi)
        {
            return EmulateAnalysis(url);
        }

        return await ScrapeRealAsync(url, ct);
    }

    #region Real Implementation

    /// <summary>
    /// Downloads the page at <paramref name="url"/> and extracts its title, meta description,
    /// H1-H3 headings and a body snippet. Errors raised by the HTTP call are not caught here
    /// and propagate to the caller.
    /// </summary>
    private async Task<UrlAnalysis> ScrapeRealAsync(string url, CancellationToken ct)
    {
        _logger.LogInformation("[Scraper] Fetching {Url}", url);

        var client = _httpFactory.CreateClient();

        // HttpClient.Timeout rejects zero/negative values; keep the client's default
        // timeout instead of crashing when the setting is missing or misconfigured.
        if (_config.ScrapeTimeoutSeconds > 0)
        {
            client.Timeout = TimeSpan.FromSeconds(_config.ScrapeTimeoutSeconds);
        }

        client.DefaultRequestHeaders.UserAgent.ParseAdd(
            "Mozilla/5.0 (compatible; AdPlatformBot/1.0)");

        var html = await client.GetStringAsync(url, ct);

        var doc = new HtmlDocument();
        doc.LoadHtml(html);

        // Title and meta description are entity-decoded the same way headings and
        // body text are, so "&amp;" does not leak into downstream copy.
        var title = DecodeAndTrim(
            doc.DocumentNode.SelectSingleNode("//title")?.InnerText);

        var metaDesc = DecodeAndTrim(
            doc.DocumentNode
                .SelectSingleNode("//meta[@name='description']")?
                .GetAttributeValue("content", null));

        // Up to MaxHeadingsPerLevel non-blank headings per level, grouped h1 -> h2 -> h3.
        // Blank headings are dropped BEFORE the limit is applied so they do not use up the quota.
        var headings = new List<string>();
        foreach (var tag in HeadingTags)
        {
            var nodes = doc.DocumentNode.SelectNodes($"//{tag}");
            if (nodes is null)
            {
                continue;
            }

            headings.AddRange(nodes
                .Select(node => HtmlEntity.DeEntitize(node.InnerText).Trim())
                .Where(text => !string.IsNullOrWhiteSpace(text))
                .Take(MaxHeadingsPerLevel));
        }

        // Must run after heading extraction: it removes <header>/<nav>/<footer> nodes
        // (and anything inside them) from the document.
        var bodySnippet = ExtractBodySnippet(doc);

        var loggedTitle = title is { Length: > MaxLoggedTitleLength }
            ? Truncate(title, MaxLoggedTitleLength) + "..."
            : title;
        _logger.LogInformation("[Scraper] Extracted: title={Title} headings={Count}",
            loggedTitle, headings.Count);

        return new UrlAnalysis
        {
            Url = url,
            Title = title,
            MetaDescription = metaDesc,
            Headings = headings,
            BodySnippet = bodySnippet,
            InferredCategory = null, // Category inference handled by CopyGenerator
            ScrapedAt = DateTimeOffset.UtcNow
        };
    }

    /// <summary>
    /// Builds a short text snippet from the first few sufficiently long paragraphs.
    /// Mutates <paramref name="doc"/>: script, style, nav, footer and header nodes are removed first.
    /// </summary>
    /// <returns>
    /// <c>null</c> when the document has no <c>&lt;p&gt;</c> elements; otherwise the joined
    /// paragraphs capped at <see cref="MaxSnippetLength"/> characters (an empty string when
    /// no paragraph is long enough).
    /// </returns>
    private static string? ExtractBodySnippet(HtmlDocument doc)
    {
        // Strip non-content nodes so their text cannot end up in the snippet.
        var removeNodes = doc.DocumentNode.SelectNodes("//script|//style|//nav|//footer|//header");
        if (removeNodes != null)
        {
            foreach (var node in removeNodes)
            {
                node.Remove();
            }
        }

        var paragraphs = doc.DocumentNode.SelectNodes("//p");
        if (paragraphs == null)
        {
            return null;
        }

        var texts = paragraphs
            .Select(p => HtmlEntity.DeEntitize(p.InnerText).Trim())
            .Where(t => t.Length > MinParagraphLength)
            .Take(MaxSnippetParagraphs);

        return Truncate(string.Join(" ", texts), MaxSnippetLength);
    }

    /// <summary>
    /// Decodes HTML entities and trims surrounding whitespace; passes <c>null</c> through.
    /// </summary>
    private static string? DecodeAndTrim(string? raw) =>
        raw is null ? null : HtmlEntity.DeEntitize(raw).Trim();

    /// <summary>
    /// Cuts <paramref name="text"/> to at most <paramref name="maxLength"/> UTF-16 code units
    /// without splitting a surrogate pair at the cut point.
    /// </summary>
    private static string Truncate(string text, int maxLength)
    {
        if (text.Length <= maxLength)
        {
            return text;
        }

        // If the last kept unit is a high surrogate, its low half would be cut off; drop it too.
        var cut = char.IsHighSurrogate(text[maxLength - 1]) ? maxLength - 1 : maxLength;
        return text[..cut];
    }

    #endregion

    #region Emulated

    /// <summary>
    /// Produces a deterministic, network-free analysis whose texts are derived from
    /// the host name of <paramref name="url"/>.
    /// </summary>
    private UrlAnalysis EmulateAnalysis(string url)
    {
        _logger.LogInformation("[Scraper] Emulated analysis for {Url}", url);

        var titleCase = DeriveBusinessName(url);

        return new UrlAnalysis
        {
            Url = url,
            Title = $"{titleCase} - Quality Products & Services",
            MetaDescription = $"{titleCase} offers premium products and services. Visit us today for the best experience.",
            Headings = new List<string>
            {
                $"Welcome to {titleCase}",
                "Our Services",
                "Why Choose Us",
                "Contact Us Today"
            },
            BodySnippet = $"{titleCase} has been serving customers with dedication and quality. " +
                          "We offer a wide range of products and services designed to meet your needs. " +
                          "Our team is committed to providing exceptional value and customer satisfaction.",
            InferredCategory = "Business Services",
            ScrapedAt = DateTimeOffset.UtcNow
        };
    }

    /// <summary>
    /// Derives a capitalized business name from the first label of the URL's host
    /// (ignoring a leading "www."). Falls back to the default domain when the URL is
    /// null, blank, unparsable, or yields an empty label. Never throws.
    /// </summary>
    private static string DeriveBusinessName(string? url)
    {
        var host = DefaultEmulatedDomain;

        if (!string.IsNullOrWhiteSpace(url))
        {
            // Only prepend a scheme when none is present; a bare "StartsWith(\"http\")"
            // test would misclassify hosts such as "httpbin.org".
            var candidate = url.Trim();
            if (!candidate.Contains("://", StringComparison.Ordinal))
            {
                candidate = $"https://{candidate}";
            }

            if (Uri.TryCreate(candidate, UriKind.Absolute, out var uri)
                && !string.IsNullOrEmpty(uri.Host))
            {
                host = uri.Host;
            }
        }

        // Strip "www." only as a prefix, never from the middle of a host name.
        if (host.StartsWith(WwwPrefix, StringComparison.OrdinalIgnoreCase))
        {
            host = host[WwwPrefix.Length..];
        }

        var label = host
            .Split('.', StringSplitOptions.RemoveEmptyEntries)
            .FirstOrDefault();

        // A host such as "www." leaves nothing behind; use the default rather than index into "".
        if (string.IsNullOrEmpty(label))
        {
            label = DefaultEmulatedDomain.Split('.')[0];
        }

        return char.ToUpperInvariant(label[0]) + label[1..];
    }

    #endregion
}