dotfiles/platforms/nix-darwin/modules/nmasur/presets/services/daily-summary/process-urls.py


# Temporarily disabled in favor of using an extension to save history
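"""Extract URLs from today's Sidebery snapshot JSON files into a Markdown
list, fetching each page's <title> to use as the link text."""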
import json
import os
import sys
from datetime import datetime
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

def find_urls(data):
    """Recursively collect every string value stored under a 'url' key."""
    urls = []
    if isinstance(data, dict):
        for key, value in data.items():
            if key == 'url' and isinstance(value, str):
                urls.append(value)
            else:
                urls.extend(find_urls(value))
    elif isinstance(data, list):
        for item in data:
            urls.extend(find_urls(item))
    return urls
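
# Hypothetical example of the recursive walk (the real Sidebery snapshot
# schema may differ; find_urls only cares about 'url' keys, at any depth):
#   find_urls({'panels': [{'tabs': [{'url': 'https://example.com'}]}]})
#   == ['https://example.com']
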
def main():
    # Optional first argument: directory containing the snapshot files.
    if len(sys.argv) > 1:
        base_dir = sys.argv[1]
        if not os.path.isdir(base_dir):
            print(f"Error: Directory '{base_dir}' not found.")
            sys.exit(1)
    else:
        base_dir = '.'

    today = datetime.now().strftime('%Y.%m.%d')
    output_filename = 'todays_urls.md'
    output_filepath = os.path.join(base_dir, output_filename)
    url_titles = {}  # Cache of url -> title so each URL is fetched once

    print(f"Searching for files in '{base_dir}' "
          f"starting with 'snapshot-{today}-'")

    with open(output_filepath, 'w', encoding='utf-8') as md_file:
        md_file.write(f'# URLs from Sidebery Snapshots for '
                      f'{today.replace(".", "-")}\n\n')
        files_processed = 0
        for filename in sorted(os.listdir(base_dir)):
            if (filename.startswith(f'snapshot-{today}-')
                    and filename.endswith('.json')):
                files_processed += 1
                print(f"Processing file: "
                      f"{os.path.join(base_dir, filename)}")

                # Extract and format date and time from the filename
                # Example: snapshot-2026.01.25-13.19.29.json
                clean_filename = (filename.replace('snapshot-', '')
                                  .replace('.json', ''))
                date_time_parts = clean_filename.split('-', 1)
                formatted_date = date_time_parts[0].replace('.', '-')
                formatted_time = date_time_parts[1].replace('.', ':')
                datetime_str = f"{formatted_date} {formatted_time}"
                md_file.write(f'## {datetime_str}\n\n')

                filepath = os.path.join(base_dir, filename)
                with open(filepath, 'r', encoding='utf-8') as json_file:
                    try:
                        data = json.load(json_file)
                        urls = find_urls(data)
                        print(f"  Found {len(urls)} URLs")
                        for url in urls:
                            if url not in url_titles:
                                try:
                                    # Fetch the page to read its <title>
                                    res = requests.get(
                                        url,
                                        timeout=10,
                                        allow_redirects=True
                                    )
                                    soup = BeautifulSoup(
                                        res.text,
                                        'html.parser'
                                    )
                                    if soup.title and soup.title.string:
                                        title = soup.title.string.strip()
                                    else:
                                        domain = urlparse(url).netloc
                                        title = domain if domain else url
                                    url_titles[url] = title
                                except requests.exceptions.InvalidSchema:
                                    # Skip non-HTTP(S) URLs, e.g. about:
                                    # or moz-extension: pages
                                    continue
                                except Exception:
                                    # Fall back to the domain (or the raw
                                    # URL) when the fetch fails
                                    domain = urlparse(url).netloc
                                    title = domain if domain else url
                                    url_titles[url] = title
                            if url in url_titles:
                                title = url_titles[url]
                                md_file.write(f'- [{title}]({url})\n')
                        md_file.write('\n')
                    except json.JSONDecodeError:
                        print(f"  Error decoding JSON in {filename}")
                        md_file.write('- Error decoding JSON\n\n')

    if files_processed == 0:
        print("No files found for today.")

    print(f"Processing complete. Output written to {output_filepath}")


if __name__ == '__main__':
    main()
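
# Usage sketch (the directory argument is optional and defaults to the
# current directory; the path below is only an example):
#   python process-urls.py ~/sidebery-snapshots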