import os
import math
import datetime
import xml.etree.ElementTree as ET
from xml.dom import minidom # For pretty printing XML

# --- Configuration ---
# Directory whose direct children (no recursion) are scanned for *.html files.
SOURCE_HTML_DIR = r"./jobs"
# Directory the sitemap files and the sitemap index are written to
# (created if it does not exist).
OUTPUT_DIR = r"./"
# Prefix each HTML filename is appended to in order to build its <loc> URL.
# The scheme + host part is also used for the homepage and sitemap index URLs.
BASE_URL = "https://amazon.redjobs.co.com/jobs/"
# A new sitemap file is started once a batch reaches this many URLs
# (the homepage entry in the first file counts towards the limit).
MAX_URLS_PER_SITEMAP = 5000
# Sitemap files are named "<prefix><N>.xml", with N starting at 1.
SITEMAP_FILENAME_PREFIX = "sitemap_jobs"
SITEMAP_INDEX_FILENAME = "sitemap_index.xml"

# When True, every HTML page entry gets the <changefreq> and <priority>
# values below; when False, both tags are omitted for those entries.
INCLUDE_CHANGEFREQ_PRIORITY = True
DEFAULT_CHANGEFREQ = "daily"
DEFAULT_PRIORITY = "0.8"

# --- Configuration for the homepage URL ---
# When True, the site root (scheme + host of BASE_URL, plus a trailing "/")
# is added as the first entry of the first sitemap file.
INCLUDE_HOMEPAGE = True
# The homepage entry always carries these two values, independently of
# INCLUDE_CHANGEFREQ_PRIORITY.
HOMEPAGE_CHANGEFREQ = "daily"
HOMEPAGE_PRIORITY = "1.0"

# --- Configuration to force the current date for <lastmod> ---
# When True, every <lastmod> (page entries and index entries) is set to the
# generation time instead of the file's modification time.
FORCE_CURRENT_DATE_FOR_LASTMOD = True

# --- Configuration to only include files modified today ---
# When True, only files whose modification date equals the current UTC date
# are included; all other files are counted as skipped.
SITEMAP_ONLY_TODAY_URLS = False

# --- End Configuration ---

# --- Helper Functions ---
def find_html_files_generator(directory):
    """
    Lazily yield the paths of the HTML files located directly inside a directory.

    Only regular files whose name ends with ".html" (case-insensitive) are
    yielded; sub-directories are not descended into. Entries come back in the
    arbitrary order reported by the operating system.

    Args:
        directory: Path of the directory to scan.

    Yields:
        The path of each matching file (``directory`` joined with the file name).

    Scanning problems are reported on stdout instead of being raised: a missing
    directory or any other OS-level error simply ends the iteration.
    """
    print(f"Scanning directory: {directory}")
    try:
        # Using scandir() as a context manager guarantees the directory handle
        # is released even when the consumer stops iterating before the end
        # (otherwise Python emits a ResourceWarning and the handle lingers
        # until garbage collection).
        with os.scandir(directory) as entries:
            for entry in entries:
                if entry.is_file() and entry.name.lower().endswith(".html"):
                    yield entry.path
    except FileNotFoundError:
        print(f"Error: Source directory not found: {directory}")
    except OSError as e:
        # Permission problems, "not a directory", I/O errors, ...
        print(f"Error scanning directory {directory}: {e}")

def format_datetime_w3c(dt_object):
    """Format a datetime as a W3C Datetime string in UTC (``YYYY-MM-DDThh:mm:ssZ``).

    Args:
        dt_object: A ``datetime.datetime``. Timezone-aware values are converted
            to UTC first; naive values are assumed to already express UTC and
            are formatted unchanged.

    Returns:
        The formatted timestamp, always carrying the ``Z`` (UTC) designator.
    """
    # The literal "Z" suffix claims UTC, so an aware datetime in another zone
    # must be shifted to UTC before formatting or the output would be wrong.
    if dt_object.utcoffset() is not None:
        dt_object = dt_object.astimezone(datetime.timezone.utc)
    return dt_object.strftime('%Y-%m-%dT%H:%M:%SZ')

def prettify_xml(elem):
    """Serialize an ElementTree element to indented, UTF-8 encoded XML.

    The element is dumped to bytes, re-parsed with minidom, and rendered with
    a two-space indent. The result is a ``bytes`` object that starts with the
    XML declaration.
    """
    document = minidom.parseString(ET.tostring(elem, encoding='utf-8'))
    pretty_bytes = document.toprettyxml(indent="  ", encoding='utf-8')
    return pretty_bytes

# --- Main Logic ---
def _file_mtime_utc(path):
    """Return the last-modification time of ``path`` as a timezone-aware UTC datetime."""
    # datetime.utcfromtimestamp() is deprecated since Python 3.12 and yields a
    # naive object; fromtimestamp(..., tz) is the supported equivalent.
    return datetime.datetime.fromtimestamp(os.path.getmtime(path), datetime.timezone.utc)


def _append_url_entry(urlset, loc, lastmod, changefreq=None, priority=None):
    """Append one ``<url>`` entry to ``urlset``; optional tags are skipped when None."""
    url_element = ET.SubElement(urlset, "url")
    ET.SubElement(url_element, "loc").text = loc
    ET.SubElement(url_element, "lastmod").text = lastmod
    if changefreq is not None:
        ET.SubElement(url_element, "changefreq").text = changefreq
    if priority is not None:
        ET.SubElement(url_element, "priority").text = priority


def generate_sitemaps():
    """Build the numbered sitemap files and the sitemap index from the HTML files.

    Reads every setting from the module-level configuration constants:

    * scans SOURCE_HTML_DIR for ``*.html`` files and turns each one into a
      ``<url>`` entry whose ``<loc>`` is BASE_URL + the (percent-encoded)
      file name;
    * splits the entries into files named
      ``<SITEMAP_FILENAME_PREFIX><N>.xml`` holding at most
      MAX_URLS_PER_SITEMAP URLs each, written to OUTPUT_DIR (created when
      missing);
    * optionally puts the site root as the first entry of the first file
      (INCLUDE_HOMEPAGE);
    * finally writes SITEMAP_INDEX_FILENAME listing every sitemap file that
      was written successfully.

    Progress and problems are reported on stdout. The function returns None and
    does not raise for per-file or write errors; those are logged and skipped.
    """
    # Imported locally so this function carries its own dependency.
    from urllib.parse import quote

    if not os.path.exists(OUTPUT_DIR):
        print(f"Output directory '{OUTPUT_DIR}' not found. Creating it...")
        try:
            os.makedirs(OUTPUT_DIR)
            print(f"Created output directory: {OUTPUT_DIR}")
        except OSError as e:
            print(f"Error creating output directory '{OUTPUT_DIR}': {e}")
            return

    current_utc_datetime = datetime.datetime.now(datetime.timezone.utc)
    current_w3c_time_str = format_datetime_w3c(current_utc_datetime)
    current_utc_date = current_utc_datetime.date()

    if FORCE_CURRENT_DATE_FOR_LASTMOD:
        print(f"WARNING: Forcing all <lastmod> dates to current generation time: {current_w3c_time_str}")

    if SITEMAP_ONLY_TODAY_URLS:
        print(f"WARNING: SITEMAP_ONLY_TODAY_URLS is enabled. Only including files modified on {current_utc_date} (UTC).")

    # A single generator is shared by all batches, so each pass of the outer
    # loop resumes the directory scan exactly where the previous batch stopped.
    html_file_generator = find_html_files_generator(SOURCE_HTML_DIR)
    generated_sitemap_files = []
    failed_sitemap_files = []
    total_files_processed = 0
    total_files_skipped = 0
    sitemap_num = 1

    # "scheme://host" part of BASE_URL; the homepage is that plus a trailing slash.
    domain_base = "/".join(BASE_URL.split("/")[:3])
    homepage_url = domain_base + "/"

    # Per-page optional tags; None makes _append_url_entry omit them.
    page_changefreq = DEFAULT_CHANGEFREQ if INCLUDE_CHANGEFREQ_PRIORITY else None
    page_priority = DEFAULT_PRIORITY if INCLUDE_CHANGEFREQ_PRIORITY else None

    while True:
        urlset = ET.Element("urlset", attrib={"xmlns": "http://www.sitemaps.org/schemas/sitemap/0.9"})
        file_count_in_batch = 0

        # The homepage goes into the very first sitemap file only.
        includes_homepage = sitemap_num == 1 and INCLUDE_HOMEPAGE
        if includes_homepage:
            print("\nAdding homepage to the sitemap...")
            _append_url_entry(
                urlset,
                homepage_url,
                current_w3c_time_str,
                HOMEPAGE_CHANGEFREQ,
                HOMEPAGE_PRIORITY,
            )
            # Counts towards the batch limit but not towards the HTML file total.
            file_count_in_batch += 1
            print(f"Homepage <{homepage_url}> added with priority {HOMEPAGE_PRIORITY}.")

        # Print the "Generating ..." header only once a file really lands in the batch.
        first_file_in_batch = True

        for html_filepath in html_file_generator:
            try:
                file_mtime = None
                if SITEMAP_ONLY_TODAY_URLS:
                    file_mtime = _file_mtime_utc(html_filepath)
                    if file_mtime.date() != current_utc_date:
                        total_files_skipped += 1
                        continue

                if first_file_in_batch:
                    print(f"\nGenerating {SITEMAP_FILENAME_PREFIX}{sitemap_num}.xml...")
                    first_file_in_batch = False

                # The sitemap protocol requires URL-escaped locations, so file
                # names containing spaces or non-ASCII characters are
                # percent-encoded; ordinary names are left untouched.
                loc_url = f"{BASE_URL}{quote(os.path.basename(html_filepath))}"

                if FORCE_CURRENT_DATE_FOR_LASTMOD:
                    last_mod_w3c = current_w3c_time_str
                else:
                    if file_mtime is None:
                        file_mtime = _file_mtime_utc(html_filepath)
                    last_mod_w3c = format_datetime_w3c(file_mtime)

                _append_url_entry(urlset, loc_url, last_mod_w3c, page_changefreq, page_priority)
                file_count_in_batch += 1
                total_files_processed += 1

            except (OSError, OverflowError, ValueError) as e:
                # The file vanished, is unreadable, or has an unrepresentable mtime.
                print(f"  Warning: Could not process file '{html_filepath}' for sitemap entry: {e}")

            if file_count_in_batch >= MAX_URLS_PER_SITEMAP:
                break

        # Nothing left to write: the scan is exhausted (and no homepage pending).
        if file_count_in_batch == 0:
            break

        sitemap_filename = f"{SITEMAP_FILENAME_PREFIX}{sitemap_num}.xml"
        sitemap_filepath = os.path.join(OUTPUT_DIR, sitemap_filename)
        try:
            xml_bytes = prettify_xml(urlset)
            with open(sitemap_filepath, "wb") as f:
                f.write(xml_bytes)
        except Exception as e:
            # Best-effort boundary: report the failure and carry on with the next batch.
            print(f"Error writing sitemap file '{sitemap_filepath}': {e}")
            failed_sitemap_files.append(sitemap_filename)
        else:
            url_count_message = f"{file_count_in_batch} URL"
            if file_count_in_batch > 1:
                url_count_message += "s"
            if includes_homepage:
                url_count_message += " (including homepage)"

            print(f"Successfully created '{sitemap_filepath}' with {url_count_message}.")
            generated_sitemap_files.append(sitemap_filename)

        # Always advance, even after a failed write. Otherwise a failing first
        # file would re-add the homepage on every pass, the batch would never
        # be empty, and the loop would never terminate.
        sitemap_num += 1

    print(f"\nTotal HTML files processed: {total_files_processed}")
    if SITEMAP_ONLY_TODAY_URLS:
        print(f"Total HTML files skipped (not modified today): {total_files_skipped}")

    if not generated_sitemap_files:
        print("\nNo individual sitemaps were generated. Cannot create sitemap index.")
        if failed_sitemap_files:
            print(f"Note: {len(failed_sitemap_files)} sitemap file(s) could not be written; see the errors above.")
        return

    # --- Generate Sitemap Index ---
    index_filepath = os.path.join(OUTPUT_DIR, SITEMAP_INDEX_FILENAME)
    print(f"\nGenerating sitemap index file: {index_filepath}...")

    sitemapindex = ET.Element("sitemapindex", xmlns="http://www.sitemaps.org/schemas/sitemap/0.9")

    for sitemap_file_name in generated_sitemap_files:
        try:
            if FORCE_CURRENT_DATE_FOR_LASTMOD:
                index_lastmod = current_w3c_time_str
            else:
                index_lastmod = format_datetime_w3c(
                    _file_mtime_utc(os.path.join(OUTPUT_DIR, sitemap_file_name))
                )
        except (OSError, OverflowError, ValueError) as e:
            print(f"  Warning: Could not process sitemap index entry for '{sitemap_file_name}': {e}")
            continue

        # Sitemap files are expected to be served from the site root.
        sitemap_element = ET.SubElement(sitemapindex, "sitemap")
        ET.SubElement(sitemap_element, "loc").text = f"{domain_base}/{sitemap_file_name}"
        ET.SubElement(sitemap_element, "lastmod").text = index_lastmod

    try:
        xml_bytes_index = prettify_xml(sitemapindex)
        with open(index_filepath, "wb") as f:
            f.write(xml_bytes_index)
        print(f"Successfully created sitemap index '{index_filepath}'.")
    except Exception as e:
        # Best-effort boundary: report the failure, the closing hints still print.
        print(f"Error writing sitemap index file '{index_filepath}': {e}")

    print("\nSitemap generation process finished.")
    print("IMPORTANT: Upload ALL generated .xml files to the root directory of your website.")
    print(f"Then submit ONLY the sitemap index URL to search engines: {domain_base}/{SITEMAP_INDEX_FILENAME}")

if __name__ == "__main__":
    # Sanity-check the configuration before doing any work, so that an
    # unedited or mistyped setting yields a clear hint instead of a confusing
    # failure (or a sitemap full of unusable URLs).
    if not os.path.isdir(SOURCE_HTML_DIR):
        print("="*50)
        print("!!! PLEASE EDIT THE SCRIPT CONFIGURATION !!!")
        print(f"The SOURCE_HTML_DIR ('{SOURCE_HTML_DIR}') does not seem to be a valid directory.")
        print("Please update it to the correct path where your HTML files are located.")
        print("Also check 'OUTPUT_DIR' and 'BASE_URL'.")
        print("="*50)
    # Require a real scheme prefix: a bare startswith("http") would also accept
    # values such as "httpfoo" that the message below declares invalid.
    elif not BASE_URL.startswith(("http://", "https://")):
        print("="*50)
        print("!!! PLEASE EDIT THE SCRIPT CONFIGURATION !!!")
        print(f"The BASE_URL ('{BASE_URL}') does not look like a valid absolute URL.")
        print("It should start with http:// or https://")
        print("="*50)
    else:
        generate_sitemaps()