using Creative.Configuration;
using Creative.Models;
using HtmlAgilityPack;
using Microsoft.Extensions.Options;

namespace Creative.Services;

/// <summary>
/// Scrapes a URL and extracts structured business data.
/// Supports emulated mode for development without network calls.
/// </summary>
public class ScraperService
{
    private const string UserAgent = "Mozilla/5.0 (compatible; AdPlatformBot/1.0)";
    private const string FallbackBusinessName = "example";
    private const string WwwPrefix = "www.";

    private const int MaxHeadingsPerLevel = 5;
    private const int MinParagraphLength = 30;
    private const int MaxParagraphs = 3;
    private const int MaxSnippetLength = 500;
    private const int MaxLoggedTitleLength = 40;

    private readonly CreativeConfig _config;
    private readonly IHttpClientFactory _httpFactory;
    private readonly ILogger<ScraperService> _logger;

    /// <summary>
    /// Creates the service from its injected configuration, HTTP client factory and logger.
    /// </summary>
    /// <param name="config">Options wrapper whose <c>Value</c> supplies the scraper settings.</param>
    /// <param name="httpFactory">Factory used to create the HTTP client for real scrapes.</param>
    /// <param name="logger">Logger for scrape diagnostics.</param>
    public ScraperService(
        IOptions<CreativeConfig> config,
        IHttpClientFactory httpFactory,
        ILogger<ScraperService> logger)
    {
        _config = config.Value;
        _httpFactory = httpFactory;
        _logger = logger;
    }

    /// <summary>
    /// Analyze a URL - scrape and extract structured content.
    /// </summary>
    /// <param name="url">The page to analyze. A missing scheme is treated as <c>https://</c>.</param>
    /// <param name="ct">Cancels the HTTP fetch in real mode; unused in emulated mode.</param>
    /// <returns>
    /// The scraped analysis when <c>EnableRealApi</c> is set, otherwise synthetic data
    /// derived from the URL's host name.
    /// </returns>
    public async Task<UrlAnalysis> AnalyzeUrlAsync(string url, CancellationToken ct)
    {
        if (!_config.EnableRealApi)
        {
            return EmulateAnalysis(url);
        }

        return await ScrapeRealAsync(url, ct);
    }

    /// <summary>
    /// Interprets <paramref name="url"/> as an absolute http(s) URI, prefixing <c>https://</c>
    /// when no scheme is present.
    /// </summary>
    /// <returns>
    /// The parsed URI, or <c>null</c> when the input is blank, names a different scheme,
    /// or cannot be parsed into a URI with a host.
    /// </returns>
    private static Uri? TryParseWebUri(string? url)
    {
        if (string.IsNullOrWhiteSpace(url))
        {
            return null;
        }

        var candidate = url.Trim();

        // Match the full scheme separator: a bare "http" prefix test would mistake hosts
        // such as "httpbin.org" for URLs that already carry a scheme.
        var hasWebScheme =
            candidate.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
            candidate.StartsWith("https://", StringComparison.OrdinalIgnoreCase);

        if (!hasWebScheme)
        {
            if (candidate.Contains("://", StringComparison.Ordinal))
            {
                // Some other explicit scheme (ftp://, file://, ...): leave it to the caller.
                return null;
            }

            candidate = "https://" + candidate;
        }

        return Uri.TryCreate(candidate, UriKind.Absolute, out var parsed)
               && !string.IsNullOrEmpty(parsed.Host)
            ? parsed
            : null;
    }

    /// <summary>
    /// Entity-decodes and trims scraped text, mapping blank results to <c>null</c> so that
    /// "element missing" and "element empty" look the same to consumers.
    /// </summary>
    private static string? CleanText(string? raw)
    {
        if (string.IsNullOrWhiteSpace(raw))
        {
            return null;
        }

        var cleaned = HtmlEntity.DeEntitize(raw).Trim();
        return cleaned.Length == 0 ? null : cleaned;
    }

    #region Real Implementation

    /// <summary>
    /// Downloads the page and extracts title, meta description, headings and a body snippet.
    /// </summary>
    /// <remarks>
    /// HTTP and timeout failures from the underlying client are not caught here and
    /// propagate to the caller.
    /// </remarks>
    private async Task<UrlAnalysis> ScrapeRealAsync(string url, CancellationToken ct)
    {
        _logger.LogInformation("[Scraper] Fetching {Url}", url);

        var client = _httpFactory.CreateClient();
        client.Timeout = TimeSpan.FromSeconds(_config.ScrapeTimeoutSeconds);
        client.DefaultRequestHeaders.UserAgent.ParseAdd(UserAgent);

        // Accept scheme-less input the same way emulated mode does. When normalization
        // fails, hand the raw string to the client so it raises its usual exception.
        var requestUri = TryParseWebUri(url);
        var html = requestUri is not null
            ? await client.GetStringAsync(requestUri, ct)
            : await client.GetStringAsync(url, ct);

        var doc = new HtmlDocument();
        doc.LoadHtml(html);

        // Title and meta description are entity-decoded like the headings and paragraphs,
        // so "&amp;" does not leak into downstream copy.
        var title = CleanText(doc.DocumentNode.SelectSingleNode("//title")?.InnerText);

        var metaDesc = CleanText(
            doc.DocumentNode
                .SelectSingleNode("//meta[@name='description']")?
                .GetAttributeValue("content", null));

        // Collect headings level by level (all h1 first, then h2, then h3),
        // keeping at most MaxHeadingsPerLevel of each.
        var headings = new List<string>();
        foreach (var tag in new[] { "h1", "h2", "h3" })
        {
            var nodes = doc.DocumentNode.SelectNodes($"//{tag}");
            if (nodes is null)
            {
                continue;
            }

            foreach (var node in nodes.Take(MaxHeadingsPerLevel))
            {
                var text = CleanText(node.InnerText);
                if (text is not null)
                {
                    headings.Add(text);
                }
            }
        }

        // Must run after heading extraction: it strips header/nav/footer nodes from the document.
        var bodySnippet = ExtractBodySnippet(doc);

        var loggedTitle = title is { Length: > MaxLoggedTitleLength }
            ? title[..MaxLoggedTitleLength] + "..."
            : title;
        _logger.LogInformation(
            "[Scraper] Extracted: title={Title} headings={Count}",
            loggedTitle,
            headings.Count);

        return new UrlAnalysis
        {
            Url = url,
            Title = title,
            MetaDescription = metaDesc,
            Headings = headings,
            BodySnippet = bodySnippet,
            InferredCategory = null, // Category inference handled by CopyGenerator
            ScrapedAt = DateTimeOffset.UtcNow
        };
    }

    /// <summary>
    /// Builds a short text sample from the first few substantial paragraphs.
    /// </summary>
    /// <remarks>
    /// Mutates <paramref name="doc"/>: script, style, nav, footer and header nodes are
    /// removed before paragraphs are selected.
    /// </remarks>
    /// <returns>
    /// Up to <c>MaxSnippetLength</c> characters of paragraph text, or <c>null</c> when the
    /// page has no paragraph longer than <c>MinParagraphLength</c> characters.
    /// </returns>
    private static string? ExtractBodySnippet(HtmlDocument doc)
    {
        var boilerplate = doc.DocumentNode.SelectNodes("//script|//style|//nav|//footer|//header");
        if (boilerplate is not null)
        {
            foreach (var node in boilerplate)
            {
                node.Remove();
            }
        }

        var paragraphs = doc.DocumentNode.SelectNodes("//p");
        if (paragraphs is null)
        {
            return null;
        }

        var texts = paragraphs
            .Select(p => HtmlEntity.DeEntitize(p.InnerText).Trim())
            .Where(t => t.Length > MinParagraphLength)
            .Take(MaxParagraphs);

        var snippet = string.Join(" ", texts);
        if (snippet.Length == 0)
        {
            // Paragraphs existed but none qualified: report "no snippet" the same way
            // as a page without any <p> elements.
            return null;
        }

        if (snippet.Length <= MaxSnippetLength)
        {
            return snippet;
        }

        // Do not cut between the two halves of a surrogate pair.
        var cut = MaxSnippetLength;
        if (char.IsHighSurrogate(snippet[cut - 1]))
        {
            cut--;
        }

        return snippet[..cut];
    }

    #endregion

    #region Emulated

    /// <summary>
    /// Produces synthetic analysis data without any network access, using the first label
    /// of the URL's host (minus a leading "www.") as the business name.
    /// </summary>
    private UrlAnalysis EmulateAnalysis(string url)
    {
        _logger.LogInformation("[Scraper] Emulated analysis for {Url}", url);

        // Derive a business name from the host; unparseable input falls back to "example".
        var host = TryParseWebUri(url)?.Host ?? string.Empty;

        // Strip only a leading "www." - a blanket Replace would also eat the
        // substring inside names such as "awww.example.com".
        if (host.StartsWith(WwwPrefix, StringComparison.OrdinalIgnoreCase))
        {
            host = host[WwwPrefix.Length..];
        }

        var businessName = host
            .Split('.', StringSplitOptions.RemoveEmptyEntries)
            .FirstOrDefault() ?? FallbackBusinessName;

        // Invariant casing keeps the generated name stable across server cultures.
        var titleCase = char.ToUpperInvariant(businessName[0]) + businessName[1..];

        return new UrlAnalysis
        {
            Url = url,
            Title = $"{titleCase} - Quality Products & Services",
            MetaDescription = $"{titleCase} offers premium products and services. Visit us today for the best experience.",
            Headings = new List<string>
            {
                $"Welcome to {titleCase}",
                "Our Services",
                "Why Choose Us",
                "Contact Us Today"
            },
            BodySnippet = $"{titleCase} has been serving customers with dedication and quality. " +
                          "We offer a wide range of products and services designed to meet your needs. " +
                          "Our team is committed to providing exceptional value and customer satisfaction.",
            InferredCategory = "Business Services",
            ScrapedAt = DateTimeOffset.UtcNow
        };
    }

    #endregion
}