dotfiles/platforms/nix-darwin/modules/nmasur/presets/services/daily-summary/process-urls.py

import json
import os
import sys
from datetime import datetime
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
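
# requests and beautifulsoup4 are third-party packages; in this nix-darwin
# setup they are presumably provided by the Nix-managed Python environment
# (an assumption, not verified here).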


def find_urls(data):
    """Recursively collect every string value stored under a 'url' key
    in a nested structure of dicts and lists (the shape of a Sidebery
    snapshot JSON)."""
    urls = []
    if isinstance(data, dict):
        for key, value in data.items():
            if key == 'url' and isinstance(value, str):
                urls.append(value)
            else:
                # Recurse into nested values; non-container values
                # simply contribute an empty list.
                urls.extend(find_urls(value))
    elif isinstance(data, list):
        for item in data:
            urls.extend(find_urls(item))
    return urls
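
# For illustration, on a hypothetical snapshot-shaped structure (sample
# data invented here, not taken from a real snapshot), find_urls returns:
#
#     >>> find_urls({'panels': [{'tabs': [{'url': 'https://a.example'},
#     ...                                 {'url': 'https://b.example'}]}]})
#     ['https://a.example', 'https://b.example']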


def main():
    # Use the first CLI argument as the snapshot directory, defaulting
    # to the current directory.
    if len(sys.argv) > 1:
        base_dir = sys.argv[1]
        if not os.path.isdir(base_dir):
            print(f"Error: Directory '{base_dir}' not found.")
            sys.exit(1)
    else:
        base_dir = '.'

    today = datetime.now().strftime('%Y.%m.%d')
    output_filename = 'todays_urls.md'
    output_filepath = os.path.join(base_dir, output_filename)
    url_titles = {}  # Cache of url -> title so each URL is fetched once

    print(f"Searching for files in '{base_dir}' "
          f"starting with 'snapshot-{today}-'")

    with open(output_filepath, 'w') as md_file:
        md_file.write(f'# URLs from Sidebery Snapshots for '
                      f'{today.replace(".", "-")}\n\n')

        files_processed = 0
        for filename in sorted(os.listdir(base_dir)):
            if (filename.startswith(f'snapshot-{today}-')
                    and filename.endswith('.json')):
                files_processed += 1
                print(f"Processing file: "
                      f"{os.path.join(base_dir, filename)}")

                # Extract and format the date and time from the filename.
                # Example: snapshot-2026.01.25-13.19.29.json
                # becomes "2026-01-25 13:19:29".
                clean_filename = (filename.replace('snapshot-', '')
                                  .replace('.json', ''))
                date_time_parts = clean_filename.split('-', 1)
                formatted_date = date_time_parts[0].replace('.', '-')
                formatted_time = date_time_parts[1].replace('.', ':')
                datetime_str = f"{formatted_date} {formatted_time}"

                md_file.write(f'## {datetime_str}\n\n')
                with open(os.path.join(base_dir, filename), 'r') as json_file:
                    try:
                        data = json.load(json_file)
                        urls = find_urls(data)
                        print(f" Found {len(urls)} URLs")
                        for url in urls:
                            if url not in url_titles:
                                try:
                                    # Fetch the page to read its <title>.
                                    res = requests.get(
                                        url,
                                        timeout=10,
                                        allow_redirects=True
                                    )
                                    soup = BeautifulSoup(
                                        res.text,
                                        'html.parser'
                                    )
                                    if soup.title and soup.title.string:
                                        title = soup.title.string.strip()
                                    else:
                                        # No <title>: fall back to the domain.
                                        domain = urlparse(url).netloc
                                        title = domain if domain else url
                                    url_titles[url] = title
                                except requests.exceptions.InvalidSchema:
                                    # Skip URLs requests cannot fetch, such
                                    # as about: or moz-extension:// pages.
                                    continue
                                except Exception:
                                    # Network or parse failure: fall back to
                                    # the domain (or the raw URL).
                                    domain = urlparse(url).netloc
                                    title = domain if domain else url
                                    url_titles[url] = title
                            if url in url_titles:
                                title = url_titles[url]
                                md_file.write(f'- [{title}]({url})\n')
                        md_file.write('\n')
                    except json.JSONDecodeError:
                        print(f" Error decoding JSON in {filename}")
                        md_file.write('- Error decoding JSON\n\n')

        if files_processed == 0:
            print("No files found for today.")

    print(f"Processing complete. Output written to {output_filepath}")


if __name__ == '__main__':
    main()
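
# Example invocation (the directory path is illustrative, not from the repo):
#
#     python process-urls.py ~/sidebery-snapshots
#
# This writes ~/sidebery-snapshots/todays_urls.md containing one
# "## <timestamp>" section per snapshot file taken today.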