// SEO Article Extractor - Content Script

// Listen for messages from the popup.
// Only the 'extract_content' action is handled here. The response is produced
// synchronously, so the listener never needs to signal an asynchronous reply
// (returning `true`); doing so for unrelated messages would claim a response
// channel that this script never answers.
chrome.runtime.onMessage.addListener((request, sender, sendResponse) => {
  if (request?.action !== 'extract_content') {
    return false; // Not ours: leave the message to other listeners.
  }

  sendResponse(extractPageData(request.isPro));
  return false; // Already responded synchronously.
});

/**
 * Clones the content container, strips non-article elements from the clone
 * (navigation, ads, comments, media, scripts...) and returns its text with all
 * whitespace runs collapsed to single spaces.
 *
 * The live page is never modified; only the clone is pruned.
 * NOTE(review): the clone is detached from the document, so `innerText` likely
 * behaves like `textContent` here (no layout-based line breaks) — confirm.
 *
 * @param {Element} contentNode - Container chosen as the article root.
 * @returns {string} Whitespace-normalized text ('' when there is none).
 */
function extractCleanText(contentNode) {
  const junkSelectors = [
    'nav', 'header', 'footer', 'aside',
    '.ads', '.advertisement', '.social-share',
    '.comments', '#comments', '.sidebar',
    'script', 'style', 'iframe', 'svg', 'button',
    'img', 'video', 'audio', 'figure', 'figcaption', 'noscript'
  ];

  const clone = contentNode.cloneNode(true);
  clone.querySelectorAll(junkSelectors.join(',')).forEach(el => el.remove());

  return clone.innerText.replace(/\s+/g, ' ').trim();
}

/**
 * Collects the non-empty headings (H1-H6) inside the content container whose
 * `offsetParent` is not null (a simple visibility check).
 *
 * @param {Element} contentNode - Container chosen as the article root.
 * @returns {{tag: string, text: string}[]} Headings in document order.
 */
function collectHeadings(contentNode) {
  const headings = [];
  contentNode.querySelectorAll('H1,H2,H3,H4,H5,H6').forEach(node => {
    const text = node.innerText.trim();
    if (node.offsetParent !== null && text.length > 0) {
      headings.push({ tag: node.tagName, text });
    }
  });
  return headings;
}

/**
 * Picks the article title: the first H1 inside the content container, else the
 * first H1 anywhere on the page with more than 5 non-whitespace characters
 * (to skip logo-style H1s), else `document.title` cut at the first
 * '|', ' - ' or ' – ' separator to drop site branding.
 *
 * @param {Element} contentNode - Container chosen as the article root.
 * @returns {string} The resolved title.
 */
function resolveTitle(contentNode) {
  let h1Text = contentNode.querySelector('h1')?.innerText?.trim();

  if (!h1Text) {
    const textH1 = Array.from(document.querySelectorAll('h1'))
      .find(h => h.innerText.replace(/\s/g, '').length > 5);
    if (textH1) h1Text = textH1.innerText.trim();
  }

  if (h1Text) {
    return h1Text.replace(/\s+/g, ' ');
  }

  let fallback = document.title;
  fallback = fallback.split('|')[0].trim();
  fallback = fallback.split(' - ')[0].trim();
  fallback = fallback.split(' – ')[0].trim(); // En-dash
  return fallback;
}

/**
 * Decides whether a link points outside the current site by comparing parsed
 * hostnames. Subdomains of the page host count as internal. The `href`
 * attribute is parsed instead of substring-matching the URL, so a URL that
 * merely mentions the page host in its path/query (e.g. a share link) or a
 * different domain that contains it as a substring is still external.
 *
 * @param {Element} anchor - Link element with an absolute `href` attribute.
 * @param {string} pageHost - Hostname of the current page.
 * @returns {boolean} True when the link targets another host.
 */
function isExternalLink(anchor, pageHost) {
  let host;
  try {
    host = new URL(anchor.getAttribute('href')).hostname;
  } catch {
    // Not a parseable absolute URL: cannot be classified as external.
    return false;
  }
  return host !== pageHost && !host.endsWith(`.${pageHost}`);
}

/**
 * Parses every JSON-LD script on the page. Scripts with malformed JSON are
 * skipped individually so one bad block cannot hide the others.
 *
 * @returns {Array<*>} Parsed JSON values, in document order.
 */
function readJsonLdBlocks() {
  const blocks = [];
  for (const script of document.querySelectorAll('script[type="application/ld+json"]')) {
    try {
      blocks.push(JSON.parse(script.textContent));
    } catch {
      // Best effort: skip this script and keep going.
    }
  }
  return blocks;
}

/**
 * Finds the first JSON-LD node that carries `datePublished`. Within each
 * parsed block, top-level items are checked first, then entries of any
 * `@graph` arrays (the structure noted for Yoast/RankMath).
 *
 * @param {Array<*>} blocks - Values returned by readJsonLdBlocks().
 * @returns {Object|null} The matching node, or null when none has a date.
 */
function findDatedSchemaNode(blocks) {
  for (const block of blocks) {
    const items = Array.isArray(block) ? block : [block];

    const direct = items.find(item => item?.datePublished);
    if (direct) return direct;

    const nested = items
      .flatMap(item => (Array.isArray(item?.['@graph']) ? item['@graph'] : []))
      .find(node => node?.datePublished);
    if (nested) return nested;
  }
  return null;
}

/**
 * Main extraction function.
 *
 * Reads the current page and returns its article text, headings, title,
 * E-E-A-T signals, JSON-LD schema types and technical SEO metadata.
 *
 * @param {boolean} [isPro=false] - When false, pages with fewer than 300
 *   words are reported as invalid with an upgrade message.
 * @returns {Object} Result with `valid`, `title`, `content` (preview of at
 *   most 1000 characters plus '...'), `rawText`, `h1`, `headings`,
 *   `wordCount`, `url`, `error`, and, when extraction gets that far, `eeat`,
 *   `schemas` and `tech`. Never throws: any failure is reported through
 *   `error` as "Extraction failed: <message>" with `valid` left false.
 */
function extractPageData(isPro = false) {
  const result = {
    valid: false,
    title: document.title,
    content: "",
    rawText: "",
    h1: [],
    headings: [],
    wordCount: 0,
    url: window.location.href,
    error: null
  };

  try {
    // Prefer semantic containers, then common CMS classes, then the whole body.
    const contentNode =
      document.querySelector('article') ||
      document.querySelector('main') ||
      document.querySelector('.post-content') ||
      document.querySelector('.entry-content') ||
      document.body;

    const cleanText = extractCleanText(contentNode);

    // Headings come from the live container (not the pruned clone) so the
    // visibility check can use layout information.
    const headingsList = collectHeadings(contentNode);
    result.h1 = headingsList.filter(h => h.tag === 'H1').map(h => h.text);

    // ''.split(' ') yields [''], which would count an empty page as one word.
    const wordCount = cleanText === '' ? 0 : cleanText.split(' ').length;

    result.title = resolveTitle(contentNode);
    result.content = cleanText.substring(0, 1000) + (cleanText.length > 1000 ? '...' : ''); // Preview snippet
    result.rawText = cleanText; // Full text for analysis
    result.headings = headingsList;
    result.wordCount = wordCount;

    // --- E-E-A-T signals ---
    const authorMeta = document.querySelector('meta[name="author"]')?.content;
    const pageHost = window.location.hostname;
    const externalLinks = Array.from(document.querySelectorAll('a[href^="http"]'))
      .filter(a => isExternalLink(a, pageHost)).length;

    result.eeat = {
      hasAuthor: !!authorMeta || !!document.querySelector('.author-name') || !!document.querySelector('[rel="author"]'),
      hasPrivacy: !!document.querySelector('a[href*="privacy"]'),
      hasAbout: !!document.querySelector('a[href*="about"]'),
      hasContact: !!document.querySelector('a[href*="contact"]'),
      externalLinks: externalLinks
    };

    // --- Schema types (top-level items and @graph entries) ---
    const jsonLdBlocks = readJsonLdBlocks();
    result.schemas = [];
    for (const block of jsonLdBlocks) {
      const items = Array.isArray(block) ? block : [block];
      for (const item of items) {
        if (item?.['@type']) result.schemas.push(item['@type']);
        const graph = item?.['@graph'];
        if (Array.isArray(graph)) {
          for (const node of graph) {
            if (node?.['@type']) result.schemas.push(node['@type']);
          }
        }
      }
    }

    // --- Technical SEO data ---
    const images = Array.from(document.querySelectorAll('img'));
    const metaDescription = document.querySelector('meta[name="description"]')?.content || "";

    result.tech = {
      // Meta & indexing
      canonical: document.querySelector('link[rel="canonical"]')?.href || null,
      robots: document.querySelector('meta[name="robots"]')?.content || null,
      hreflangs: Array.from(document.querySelectorAll('link[rel="alternate"][hreflang]'))
        .map(l => ({ lang: l.getAttribute('hreflang'), href: l.href })),

      // Social tags
      ogTitle: document.querySelector('meta[property="og:title"]')?.content || null,
      ogImage: document.querySelector('meta[property="og:image"]')?.content || null,
      twitterCard: document.querySelector('meta[name="twitter:card"]')?.content || null,

      // Image audit
      images: {
        total: images.length,
        missingAlt: images.filter(img => !img.alt || img.alt.trim() === "").length
      },

      // Meta lengths
      metaTitleLen: document.title.length,
      metaDescription: metaDescription,
      metaDescLen: metaDescription.length,

      // Dates
      datePublished: document.querySelector('meta[property="article:published_time"]')?.content ||
        document.querySelector('meta[name="date"]')?.content ||
        document.querySelector('time[itemprop="datePublished"]')?.getAttribute('datetime') || null,
      dateModified: document.querySelector('meta[property="article:modified_time"]')?.content || null
    };

    // Fall back to JSON-LD dates only when no meta/time tag supplied one.
    if (!result.tech.datePublished) {
      const dated = findDatedSchemaNode(jsonLdBlocks);
      if (dated) {
        result.tech.datePublished = dated.datePublished;
        if (dated.dateModified) result.tech.dateModified = dated.dateModified;
      }
    }

    // Validation: the free plan rejects short content, Pro accepts any length.
    if (!isPro && wordCount < 300) {
      result.error = "Content too short for Free Plan (<300 words). Upgrade to analyze short content.";
      result.valid = false;
    } else {
      result.valid = true;
    }

  } catch (e) {
    result.error = "Extraction failed: " + e.message;
  }

  return result;
}
