dotfiles/platforms/nix-darwin/modules/nmasur/presets/services/daily-summary/process-urls.py


# Temporarily disabled in favor of using an extension to save history
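"""Extract URLs from today's Sidebery snapshot JSON files into a Markdown
list, fetching each page's <title> to use as the link text."""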
import json
import os
import sys
from datetime import datetime
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

def find_urls(data):
    """Recursively collect every string value stored under a 'url' key."""
    urls = []
    if isinstance(data, dict):
        for key, value in data.items():
            if key == 'url' and isinstance(value, str):
                urls.append(value)
            else:
                urls.extend(find_urls(value))
    elif isinstance(data, list):
        for item in data:
            urls.extend(find_urls(item))
    return urls
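
# Hypothetical example of the recursive walk (the real Sidebery snapshot
# schema may differ; find_urls only cares about 'url' keys, at any depth):
#   find_urls({'panels': [{'tabs': [{'url': 'https://example.com'}]}]})
#   == ['https://example.com']
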
def main():
    # Optional first argument: directory containing the snapshot files.
    if len(sys.argv) > 1:
        base_dir = sys.argv[1]
        if not os.path.isdir(base_dir):
            print(f"Error: Directory '{base_dir}' not found.")
            sys.exit(1)
    else:
        base_dir = '.'

    today = datetime.now().strftime('%Y.%m.%d')
    output_filename = 'todays_urls.md'
    output_filepath = os.path.join(base_dir, output_filename)
    url_titles = {}  # Cache of url -> title so each URL is fetched once

    print(f"Searching for files in '{base_dir}' "
          f"starting with 'snapshot-{today}-'")

    with open(output_filepath, 'w', encoding='utf-8') as md_file:
        md_file.write(f'# URLs from Sidebery Snapshots for '
                      f'{today.replace(".", "-")}\n\n')
        files_processed = 0
        for filename in sorted(os.listdir(base_dir)):
            if (filename.startswith(f'snapshot-{today}-')
                    and filename.endswith('.json')):
                files_processed += 1
                print(f"Processing file: "
                      f"{os.path.join(base_dir, filename)}")

                # Extract and format date and time from the filename
                # Example: snapshot-2026.01.25-13.19.29.json
                clean_filename = (filename.replace('snapshot-', '')
                                  .replace('.json', ''))
                date_time_parts = clean_filename.split('-', 1)
                formatted_date = date_time_parts[0].replace('.', '-')
                formatted_time = date_time_parts[1].replace('.', ':')
                datetime_str = f"{formatted_date} {formatted_time}"
                md_file.write(f'## {datetime_str}\n\n')

                filepath = os.path.join(base_dir, filename)
                with open(filepath, 'r', encoding='utf-8') as json_file:
                    try:
                        data = json.load(json_file)
                        urls = find_urls(data)
                        print(f"  Found {len(urls)} URLs")
                        for url in urls:
                            if url not in url_titles:
                                try:
                                    # Fetch the page to read its <title>
                                    res = requests.get(
                                        url,
                                        timeout=10,
                                        allow_redirects=True
                                    )
                                    soup = BeautifulSoup(
                                        res.text,
                                        'html.parser'
                                    )
                                    if soup.title and soup.title.string:
                                        title = soup.title.string.strip()
                                    else:
                                        domain = urlparse(url).netloc
                                        title = domain if domain else url
                                    url_titles[url] = title
                                except requests.exceptions.InvalidSchema:
                                    # Skip non-HTTP(S) URLs, e.g. about:
                                    # or moz-extension: pages
                                    continue
                                except Exception:
                                    # Fall back to the domain (or the raw
                                    # URL) when the fetch fails
                                    domain = urlparse(url).netloc
                                    title = domain if domain else url
                                    url_titles[url] = title
                            if url in url_titles:
                                title = url_titles[url]
                                md_file.write(f'- [{title}]({url})\n')
                        md_file.write('\n')
                    except json.JSONDecodeError:
                        print(f"  Error decoding JSON in {filename}")
                        md_file.write('- Error decoding JSON\n\n')

    if files_processed == 0:
        print("No files found for today.")

    print(f"Processing complete. Output written to {output_filepath}")


if __name__ == '__main__':
    main()
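
# Usage sketch (the directory argument is optional and defaults to the
# current directory; the path below is only an example):
#   python process-urls.py ~/sidebery-snapshots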