mirror of https://github.com/nmasur/dotfiles
synced 2026-02-05 20:29:46 +00:00
enable daily browser summary on darwin
@@ -0,0 +1,112 @@
import requests
from bs4 import BeautifulSoup
import json
import os
import sys
from datetime import datetime
from urllib.parse import urlparse


def find_urls(data):
    # Recursively walk the snapshot JSON, collecting every string
    # value stored under a 'url' key at any depth.
    urls = []
    if isinstance(data, dict):
        for key, value in data.items():
            if key == 'url' and isinstance(value, str):
                urls.append(value)
            else:
                urls.extend(find_urls(value))
    elif isinstance(data, list):
        for item in data:
            urls.extend(find_urls(item))
    return urls


def main():
    # Snapshot directory may be passed as the first argument;
    # default to the current directory.
    if len(sys.argv) > 1:
        base_dir = sys.argv[1]
        if not os.path.isdir(base_dir):
            print(f"Error: Directory '{base_dir}' not found.")
            sys.exit(1)
    else:
        base_dir = '.'

    today = datetime.now().strftime('%Y.%m.%d')
    output_filename = 'todays_urls.md'
    output_filepath = os.path.join(base_dir, output_filename)
    url_titles = {}

    print(f"Searching for files in '{base_dir}' "
          f"starting with 'snapshot-{today}-'")

    with open(output_filepath, 'w') as md_file:
        md_file.write(f'# URLs from Sidebery Snapshots for '
                      f'{today.replace(".", "-")}\n\n')

        files_processed = 0
        for filename in sorted(os.listdir(base_dir)):
            if (filename.startswith(f'snapshot-{today}-')
                    and filename.endswith('.json')):
                files_processed += 1
                print(f"Processing file: "
                      f"{os.path.join(base_dir, filename)}")

                # Extract and format date and time from filename
                # Example: snapshot-2026.01.25-13.19.29.json
                clean_filename = (filename.replace('snapshot-', '')
                                  .replace('.json', ''))
                date_time_parts = clean_filename.split('-', 1)

                formatted_date = date_time_parts[0].replace('.', '-')
                formatted_time = date_time_parts[1].replace('.', ':')

                datetime_str = f"{formatted_date} {formatted_time}"

                md_file.write(f'## {datetime_str}\n\n')

                with open(os.path.join(base_dir, filename), 'r') as json_file:
                    try:
                        data = json.load(json_file)
                        urls = find_urls(data)
                        print(f" Found {len(urls)} URLs")
                        for url in urls:
                            if url not in url_titles:
                                try:
                                    # Get title of URL, caching the result so
                                    # each URL is only fetched once
                                    res = requests.get(
                                        url,
                                        timeout=10,
                                        allow_redirects=True
                                    )
                                    soup = BeautifulSoup(
                                        res.text,
                                        'html.parser'
                                    )
                                    if soup.title and soup.title.string:
                                        title = soup.title.string.strip()
                                    else:
                                        domain = urlparse(url).netloc
                                        title = domain if domain else url
                                    url_titles[url] = title
                                except requests.exceptions.InvalidSchema:
                                    # Skip non-HTTP URLs (about:, moz-extension:, ...)
                                    continue
                                except Exception:
                                    # Fall back to the domain when the page
                                    # can't be fetched or parsed
                                    domain = urlparse(url).netloc
                                    title = domain if domain else url
                                    url_titles[url] = title
                            if url in url_titles:
                                title = url_titles[url]
                                md_file.write(f'- [{title}]({url})\n')
                        md_file.write('\n')
                    except json.JSONDecodeError:
                        print(f" Error decoding JSON in {filename}")
                        md_file.write('- Error decoding JSON\n\n')

        if files_processed == 0:
            print("No files found for today.")

    print(f"Processing complete. Output written to {output_filepath}")


if __name__ == '__main__':
    main()
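The script takes the snapshot directory as an optional first argument and writes todays_urls.md into it. For reference, a minimal sketch of what find_urls does with a Sidebery-style snapshot; the nested sample data below is invented for illustration, since real snapshots nest tab objects inside panels and containers:

# Invented sample mimicking the nested shape of a snapshot file;
# find_urls collects 'url' values regardless of nesting depth.
sample = {
    'containers': [
        {'tabs': [{'url': 'https://example.com', 'title': 'Example'}]},
        {'tabs': [{'url': 'https://nixos.org'}]},
    ]
}
assert find_urls(sample) == ['https://example.com', 'https://nixos.org']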