162 lines
5.2 KiB
C#
162 lines
5.2 KiB
C#
using Creative.Configuration;
|
|
using Creative.Models;
|
|
using HtmlAgilityPack;
|
|
using Microsoft.Extensions.Options;
|
|
|
|
namespace Creative.Services;
|
|
|
|
/// <summary>
/// Scrapes a URL and extracts structured business data: page title, meta
/// description, H1-H3 headings and a short body-text snippet.
/// When <c>CreativeConfig.EnableRealApi</c> is false, an emulated result is
/// synthesised from the URL's host name and no network call is made.
/// </summary>
public class ScraperService
{
    /// <summary>Business name used by emulated mode when the URL cannot be parsed.</summary>
    private const string FallbackBusinessName = "example";

    /// <summary>Host prefix stripped before deriving the emulated business name.</summary>
    private const string WwwPrefix = "www.";

    /// <summary>Maximum number of headings collected per heading level.</summary>
    private const int MaxHeadingsPerLevel = 5;

    /// <summary>Paragraphs must be strictly longer than this to count as meaningful.</summary>
    private const int ShortParagraphThreshold = 30;

    /// <summary>Maximum number of paragraphs joined into the body snippet.</summary>
    private const int MaxSnippetParagraphs = 3;

    /// <summary>Maximum length, in UTF-16 code units, of the body snippet.</summary>
    private const int MaxSnippetLength = 500;

    /// <summary>Titles longer than this are truncated in log output only.</summary>
    private const int MaxLoggedTitleLength = 40;

    private static readonly string[] HeadingTags = { "h1", "h2", "h3" };

    private readonly CreativeConfig _config;
    private readonly IHttpClientFactory _httpFactory;
    private readonly ILogger<ScraperService> _logger;

    /// <summary>
    /// Creates the service from its injected dependencies.
    /// </summary>
    /// <param name="config">Options wrapper; its <c>Value</c> is captured once at construction.</param>
    /// <param name="httpFactory">Factory used to create the HTTP client for real scrapes.</param>
    /// <param name="logger">Logger for scrape progress messages.</param>
    public ScraperService(
        IOptions<CreativeConfig> config,
        IHttpClientFactory httpFactory,
        ILogger<ScraperService> logger)
    {
        _config = config.Value;
        _httpFactory = httpFactory;
        _logger = logger;
    }

    /// <summary>
    /// Analyze a URL - scrape and extract structured content.
    /// </summary>
    /// <param name="url">Address of the page to analyze; a missing scheme is treated as https.</param>
    /// <param name="ct">Cancellation token passed to the HTTP request in real mode.</param>
    /// <returns>
    /// The extracted analysis, or an emulated one when
    /// <c>CreativeConfig.EnableRealApi</c> is false.
    /// </returns>
    public async Task<UrlAnalysis> AnalyzeUrlAsync(string url, CancellationToken ct)
    {
        if (!_config.EnableRealApi)
        {
            return EmulateAnalysis(url);
        }

        return await ScrapeRealAsync(url, ct);
    }

    #region Real Implementation

    /// <summary>
    /// Downloads the page and extracts title, meta description, headings and body snippet.
    /// </summary>
    /// <param name="url">Address to fetch; "https://" is prepended when no http(s) scheme is present.</param>
    /// <param name="ct">Cancellation token for the HTTP request.</param>
    /// <returns>The populated analysis; <c>InferredCategory</c> is left null.</returns>
    private async Task<UrlAnalysis> ScrapeRealAsync(string url, CancellationToken ct)
    {
        _logger.LogInformation("[Scraper] Fetching {Url}", url);

        var client = _httpFactory.CreateClient();
        client.Timeout = TimeSpan.FromSeconds(_config.ScrapeTimeoutSeconds);
        client.DefaultRequestHeaders.UserAgent.ParseAdd(
            "Mozilla/5.0 (compatible; AdPlatformBot/1.0)");

        // Emulated mode accepts scheme-less input such as "example.com";
        // normalise here too so both modes accept the same URLs.
        var html = await client.GetStringAsync(WithDefaultScheme(url), ct);

        var doc = new HtmlDocument();
        doc.LoadHtml(html);

        // Title and meta description are entity-decoded like headings and
        // body text, so "&amp;" never leaks into the extracted fields.
        var title = DecodeText(
            doc.DocumentNode.SelectSingleNode("//title")?.InnerText);

        var metaDesc = DecodeText(
            doc.DocumentNode
                .SelectSingleNode("//meta[@name='description']")?
                .GetAttributeValue("content", null));

        // Extract H1-H3 headings, at most MaxHeadingsPerLevel per level.
        var headings = new List<string>();
        foreach (var tag in HeadingTags)
        {
            var nodes = doc.DocumentNode.SelectNodes($"//{tag}");
            if (nodes is null)
            {
                continue;
            }

            foreach (var node in nodes.Take(MaxHeadingsPerLevel))
            {
                var text = HtmlEntity.DeEntitize(node.InnerText).Trim();
                if (!string.IsNullOrWhiteSpace(text))
                {
                    headings.Add(text);
                }
            }
        }

        // Must run last: ExtractBodySnippet removes nodes (including <header>)
        // from the document, which would hide headings nested inside them.
        var bodySnippet = ExtractBodySnippet(doc);

        _logger.LogInformation("[Scraper] Extracted: title={Title} headings={Count}",
            title?.Length > MaxLoggedTitleLength ? title[..MaxLoggedTitleLength] + "..." : title,
            headings.Count);

        return new UrlAnalysis
        {
            Url = url,
            Title = title,
            MetaDescription = metaDesc,
            Headings = headings,
            BodySnippet = bodySnippet,
            InferredCategory = null, // Category inference handled by CopyGenerator
            ScrapedAt = DateTimeOffset.UtcNow
        };
    }

    /// <summary>
    /// Builds a short text snippet from the first meaningful paragraphs.
    /// Removes script, style, nav, footer and header nodes from
    /// <paramref name="doc"/> as a side effect.
    /// </summary>
    /// <param name="doc">Parsed document; mutated by this call.</param>
    /// <returns>
    /// Up to <see cref="MaxSnippetLength"/> characters of joined paragraph text,
    /// or null when the page has no sufficiently long paragraph.
    /// </returns>
    private static string? ExtractBodySnippet(HtmlDocument doc)
    {
        // Drop non-content and page-chrome elements so their text is not picked up.
        var boilerplate = doc.DocumentNode.SelectNodes("//script|//style|//nav|//footer|//header");
        if (boilerplate is not null)
        {
            foreach (var node in boilerplate)
            {
                node.Remove();
            }
        }

        var paragraphs = doc.DocumentNode.SelectNodes("//p");
        if (paragraphs is null)
        {
            return null;
        }

        var texts = paragraphs
            .Select(p => HtmlEntity.DeEntitize(p.InnerText).Trim())
            .Where(t => t.Length > ShortParagraphThreshold)
            .Take(MaxSnippetParagraphs);

        var snippet = string.Join(" ", texts);

        // "No paragraphs" and "no paragraph long enough" both mean there is
        // no snippet; report them the same way instead of null vs "".
        if (snippet.Length == 0)
        {
            return null;
        }

        if (snippet.Length <= MaxSnippetLength)
        {
            return snippet;
        }

        // Back off one code unit rather than cut a surrogate pair in half.
        var cut = char.IsHighSurrogate(snippet[MaxSnippetLength - 1])
            ? MaxSnippetLength - 1
            : MaxSnippetLength;
        return snippet[..cut];
    }

    /// <summary>
    /// Entity-decodes and trims extracted text, passing null through unchanged.
    /// </summary>
    private static string? DecodeText(string? raw)
    {
        return raw is null ? null : HtmlEntity.DeEntitize(raw).Trim();
    }

    /// <summary>
    /// Prepends "https://" when <paramref name="url"/> has no http(s) scheme.
    /// Null or blank input is returned unchanged so the caller's own failure
    /// handling still applies.
    /// </summary>
    private static string WithDefaultScheme(string url)
    {
        if (string.IsNullOrWhiteSpace(url))
        {
            return url;
        }

        var trimmed = url.Trim();
        var hasScheme =
            trimmed.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
            trimmed.StartsWith("https://", StringComparison.OrdinalIgnoreCase);

        return hasScheme ? trimmed : $"https://{trimmed}";
    }

    #endregion

    #region Emulated

    /// <summary>
    /// Produces a canned analysis whose text is derived from the URL's host
    /// name, without any network access.
    /// </summary>
    /// <param name="url">URL whose host supplies the business name; unparsable input falls back to "Example".</param>
    /// <returns>An analysis with placeholder title, description, headings and body text.</returns>
    private UrlAnalysis EmulateAnalysis(string url)
    {
        _logger.LogInformation("[Scraper] Emulated analysis for {Url}", url);

        var businessName = ResolveBusinessName(url);

        // Invariant casing keeps the result independent of the server's culture.
        var titleCase = char.ToUpperInvariant(businessName[0]) + businessName[1..];

        return new UrlAnalysis
        {
            Url = url,
            Title = $"{titleCase} - Quality Products & Services",
            MetaDescription = $"{titleCase} offers premium products and services. Visit us today for the best experience.",
            Headings = new List<string>
            {
                $"Welcome to {titleCase}",
                "Our Services",
                "Why Choose Us",
                "Contact Us Today"
            },
            BodySnippet = $"{titleCase} has been serving customers with dedication and quality. " +
                          "We offer a wide range of products and services designed to meet your needs. " +
                          "Our team is committed to providing exceptional value and customer satisfaction.",
            InferredCategory = "Business Services",
            ScrapedAt = DateTimeOffset.UtcNow
        };
    }

    /// <summary>
    /// Derives a non-empty business name from the first label of the URL's
    /// host, ignoring a leading "www.". Best-effort: returns
    /// <see cref="FallbackBusinessName"/> for null, blank or unparsable input.
    /// </summary>
    private static string ResolveBusinessName(string url)
    {
        if (string.IsNullOrWhiteSpace(url) ||
            !Uri.TryCreate(WithDefaultScheme(url), UriKind.Absolute, out var uri))
        {
            return FallbackBusinessName;
        }

        var host = uri.Host;

        // Strip only a leading "www."; a blanket Replace would also mangle
        // hosts that merely contain the sequence (e.g. "awww.shop.com").
        if (host.StartsWith(WwwPrefix, StringComparison.OrdinalIgnoreCase))
        {
            host = host[WwwPrefix.Length..];
        }

        var firstLabel = host.Split('.')[0];

        // A host such as "www." leaves nothing behind; never return an empty
        // name because the caller indexes its first character.
        return firstLabel.Length > 0 ? firstLabel : FallbackBusinessName;
    }

    #endregion
}
|