<?php
namespace AffiliateHub\Modules\LinkScanner;

class Scanner {
    /**
     * Per-instance memo of host => resolved IP string (or false when the host
     * is not an IP literal and did not resolve). Avoids repeating a blocking
     * DNS lookup for every link that points at the same host.
     *
     * @var array
     */
    private $resolved_hosts = array();

    /**
     * Queue the destination URL of every published affiliate link for a scan.
     *
     * Links whose destination is empty, not a valid URL, or excluded by
     * is_excluded() are recorded with status 'skipped'; all others are
     * inserted with insert_link()'s default status. The scan's cancel flag is
     * checked before starting and again after every 10 queued links.
     *
     * @param mixed $scan_id Identifier of the scan the links belong to.
     * @return int Number of links queued (skipped links are not counted).
     */
    public function queue_affiliate_links($scan_id) {
        $db = new DB();

        // Nothing to do if the scan was canceled before queuing started
        if ($this->is_scan_canceled($db, $scan_id)) {
            return 0;
        }

        $total = 0;

        // Get all published affiliate links (IDs only)
        $affiliate_links = get_posts(array(
            'post_type' => \AffiliateHub\Core\Constants::POST_TYPE_AFFILIATE_LINK,
            'post_status' => 'publish',
            'posts_per_page' => -1,
            'fields' => 'ids'
        ));

        foreach ($affiliate_links as $post_id) {
            // Get the destination URL from meta
            $destination_url = get_post_meta($post_id, \AffiliateHub\Core\Constants::META_DESTINATION_URL, true);
            $affiliate_link_title = get_the_title($post_id);

            if (empty($destination_url) || !filter_var($destination_url, FILTER_VALIDATE_URL)) {
                // Skip invalid or empty URLs, but keep a record of why
                $db->insert_link($scan_id, $post_id, $destination_url ?: 'N/A', $affiliate_link_title, '', 'Invalid or empty destination URL', 'skipped', 1);
                continue;
            }

            // is_excluded() returns a non-empty reason string when excluded, false otherwise
            if ($this->is_excluded($destination_url)) {
                $db->insert_link($scan_id, $post_id, $destination_url, $affiliate_link_title, '', 'URL excluded by whitelist/blacklist settings', 'skipped', 1);
                continue;
            }

            // Insert as pending for scan
            $snippet = sprintf(__('Affiliate Link: %s → %s', 'affiliate-hub'), $affiliate_link_title, $destination_url);
            $db->insert_link($scan_id, $post_id, $destination_url, $affiliate_link_title, $snippet);
            $total++;

            // Re-check the cancel flag every 10 queued links
            if ($total % 10 === 0 && $this->is_scan_canceled($db, $scan_id)) {
                break;
            }
        }

        return $total;
    }

    /**
     * Scan published posts and queue every link found in their content.
     *
     * Posts are fetched in pages of 100. Links that extract_links() flags as
     * excluded are recorded with status 'skipped' together with the reason;
     * all other links are inserted with insert_link()'s default status. The
     * scan's cancel flag is checked before starting and after each page.
     * The scan's 'total_urls' field is updated with the queued count.
     *
     * @param mixed $scan_id    Identifier of the scan the links belong to.
     * @param array $post_types Post types to scan; defaults to 'post' and 'page'
     *                          when empty or not an array.
     * @return int Number of links queued (skipped links are not counted).
     */
    public function queue_all_posts($scan_id, $post_types = array()) {
        // If no post_types provided, default to 'post' and 'page'
        if (empty($post_types) || !is_array($post_types)) {
            $post_types = array('post', 'page');
        }
        $per_page = 100;
        $page = 1;
        $total = 0;

        // Check if scan has been canceled before queuing
        $db = new DB();
        if ($this->is_scan_canceled($db, $scan_id)) {
            return 0;
        }

        while (true) {
            $query = new \WP_Query(array(
                'post_type' => $post_types,
                'post_status' => 'publish',
                'posts_per_page' => $per_page,
                'paged' => $page,
                'fields' => 'ids',
                // The loop stops on the first empty page, so the total row
                // count is never needed; skip the extra counting query.
                'no_found_rows' => true,
            ));
            if (empty($query->posts)) {
                break;
            }

            foreach ($query->posts as $post_id) {
                $content = \get_post_field('post_content', $post_id);
                foreach ($this->extract_links($content) as $link) {
                    if (!empty($link['excluded']) && !empty($link['reason'])) {
                        // Excluded: record with a note explaining why and mark ignored
                        $db->insert_link($scan_id, $post_id, $link['url'], $link['text'], $link['snippet'], $link['reason'], 'skipped', 1);
                    } else {
                        // Insert as pending
                        $db->insert_link($scan_id, $post_id, $link['url'], $link['text'], $link['snippet']);
                        $total++;
                    }
                }
            }

            $page++;
            \wp_reset_postdata();

            // Stop between pages if the scan was canceled while queuing
            if ($this->is_scan_canceled($db, $scan_id)) {
                break;
            }
        }

        // Update scan total count
        $db->update_scan($scan_id, array('total_urls' => $total));
        return $total;
    }

    /**
     * Extract every <a href="..."> link from an HTML fragment.
     *
     * @param string $html HTML to parse; an empty value yields an empty array.
     * @return array List of rows with keys 'url' (normalized href), 'text'
     *               (trimmed anchor text) and 'snippet' (up to 200 characters
     *               of the anchor with tags stripped). Rows for excluded URLs
     *               additionally carry 'excluded' => true and 'reason'.
     */
    public function extract_links($html) {
        $results = array();
        if (empty($html)) {
            return $results;
        }

        // Collect libxml parse errors internally instead of emitting warnings,
        // then clear the buffer and restore the caller's setting so errors do
        // not accumulate across the many documents parsed during one scan.
        $previous_setting = libxml_use_internal_errors(true);
        $doc = new \DOMDocument();
        // The XML prolog makes libxml treat the fragment as UTF-8
        $doc->loadHTML('<?xml encoding="utf-8" ?>' . $html);
        libxml_clear_errors();
        libxml_use_internal_errors($previous_setting);

        foreach ($doc->getElementsByTagName('a') as $tag) {
            $href = $tag->getAttribute('href');
            if (empty($href)) {
                continue;
            }

            $text = trim($tag->textContent);

            // Cut on character boundaries so a multibyte character is never
            // split in half (which would leave invalid UTF-8 in the snippet).
            $plain = strip_tags((string) $tag->C14N());
            $snippet = function_exists('mb_substr')
                ? mb_substr($plain, 0, 200, 'UTF-8')
                : substr($plain, 0, 200);

            $url = $this->normalize_url($href);
            $row = array('url' => $url, 'text' => $text, 'snippet' => $snippet);

            // is_excluded can return a string reason or boolean
            $reason = $this->is_excluded($url);
            if ($reason === true) {
                $reason = 'excluded';
            }
            if (is_string($reason) && !empty($reason)) {
                $row['excluded'] = true;
                $row['reason'] = $reason;
            }

            $results[] = $row;
        }

        return $results;
    }

    /**
     * Basic URL normalization: trim, make absolute, drop the fragment.
     *
     * Protocol-relative URLs ("//host/path") get the site's scheme prepended;
     * other scheme-less URLs are treated as paths and passed to site_url().
     *
     * @param string $url Raw href value.
     * @return string Normalized URL; returned trimmed but otherwise untouched
     *                when parse_url() cannot parse it.
     */
    private function normalize_url($url) {
        $url = trim($url);
        $parts = parse_url($url);
        if ($parts === false) {
            return $url;
        }

        if (empty($parts['scheme'])) {
            if (!empty($parts['host'])) {
                // Protocol-relative URL: it already names a host, so it must
                // not be appended to the site URL as if it were a path.
                $site_scheme = parse_url(\site_url(), PHP_URL_SCHEME);
                $url = ($site_scheme ? $site_scheme : 'https') . ':' . $url;
            } else {
                // relative URL, make absolute against site_url
                $url = \site_url($url);
            }
        }

        return preg_replace('/#.*$/', '', $url);
    }

    /**
     * Decide whether a URL must be skipped by the scanner.
     *
     * Checks, in order: missing host, whitelist (when non-empty the host must
     * be listed), blacklist, and private/reserved IP addresses.
     *
     * @param string $url Absolute URL to check.
     * @return string|false Reason code ('no_host', 'not_in_whitelist',
     *                      'blacklist', 'private_ip') or false when allowed.
     */
    private function is_excluded($url) {
        $blacklist = \get_option(\AffiliateHub\Core\Constants::OPTION_LINK_SCANNER_BLACKLIST, '');
        $whitelist = \get_option(\AffiliateHub\Core\Constants::OPTION_LINK_SCANNER_WHITELIST, '');

        $host = parse_url($url, PHP_URL_HOST);
        if (empty($host)) {
            return 'no_host';
        }
        // Host names are case-insensitive; compare in lower case
        $host = strtolower($host);

        // Normalize lists (comma separated, surrounding whitespace ignored)
        $bl = array_filter(array_map('trim', explode(',', $blacklist)));
        $wl = array_filter(array_map('strtolower', array_map('trim', explode(',', $whitelist))));

        // If whitelist provided and host not in it => exclude
        if (!empty($wl) && !in_array($host, $wl, true)) {
            return 'not_in_whitelist';
        }

        // Blacklist: case-insensitive substring match against the host.
        // NOTE(review): a short entry such as "t.co" also matches
        // "pinterest.com"; confirm whether exact/suffix matching is wanted.
        foreach ($bl as $b) {
            if (stripos($host, $b) !== false) {
                return 'blacklist';
            }
        }

        // Private IP detection
        $ip = $this->resolve_host_ip($host);
        if ($ip !== false && $this->is_private_ip($ip)) {
            return 'private_ip';
        }

        return false;
    }

    /**
     * Tell whether an IP address is in a private or reserved range.
     *
     * Relies on filter_var()'s range flags, which cover the private IPv4
     * blocks (10/8, 172.16/12, 192.168/16), loopback, link-local and other
     * reserved IPv4 ranges, and the corresponding IPv6 ranges.
     *
     * @param string $ip IPv4 or IPv6 address.
     * @return bool True for private/reserved addresses; false for public
     *              addresses and for input that is not a valid IP.
     */
    private function is_private_ip($ip) {
        if (filter_var($ip, FILTER_VALIDATE_IP) === false) {
            return false;
        }

        // Validation with the "no private / no reserved" flags fails exactly
        // when the address lies in one of those ranges.
        $public = filter_var($ip, FILTER_VALIDATE_IP, FILTER_FLAG_NO_PRIV_RANGE | FILTER_FLAG_NO_RES_RANGE);
        return $public === false;
    }

    /**
     * Resolve a host to an IP address, memoized per Scanner instance.
     *
     * IP literals (including bracketed IPv6 hosts as returned by parse_url())
     * are used as-is; anything else goes through gethostbyname(), which only
     * yields IPv4 addresses and returns its input unchanged on failure.
     *
     * @param string $host Host name or IP literal.
     * @return string|false IP address, or false when none could be determined.
     */
    private function resolve_host_ip($host) {
        if (array_key_exists($host, $this->resolved_hosts)) {
            return $this->resolved_hosts[$host];
        }

        $literal = trim($host, '[]');
        if (filter_var($literal, FILTER_VALIDATE_IP)) {
            $ip = $literal;
        } else {
            $ip = gethostbyname($host);
            if (!filter_var($ip, FILTER_VALIDATE_IP)) {
                $ip = false;
            }
        }

        $this->resolved_hosts[$host] = $ip;
        return $ip;
    }

    /**
     * Read the scan record and report whether its cancel flag is set.
     *
     * @param DB    $db      Database helper used to load the scan.
     * @param mixed $scan_id Identifier of the scan.
     * @return bool True when the scan exists and has a truthy 'canceled' field.
     */
    private function is_scan_canceled($db, $scan_id) {
        $scan = $db->get_scan($scan_id);
        return $scan && isset($scan->canceled) && $scan->canceled;
    }
}
