# Temporarily disabled in favor of using an extension to save history
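"""Collect URLs from today's Sidebery snapshot files into a markdown list.

Scans a directory (optional first argument, defaulting to '.') for files
named 'snapshot-YYYY.MM.DD-HH.MM.SS.json' matching today's date, pulls out
every 'url' value in the JSON, fetches each page's title, and writes the
results as markdown links to 'todays_urls.md' in that directory.
"""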

import json
import os
import sys
from datetime import datetime
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup


def find_urls(data):
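    """Recursively collect every string value keyed 'url' in nested JSON data.

    A doctest-style sketch of the behavior:

    >>> find_urls({'tabs': [{'url': 'https://example.com'}, {'id': 3}]})
    ['https://example.com']
    """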
    urls = []
    if isinstance(data, dict):
        for key, value in data.items():
            if key == 'url' and isinstance(value, str):
                urls.append(value)
            else:
                urls.extend(find_urls(value))
    elif isinstance(data, list):
        for item in data:
            urls.extend(find_urls(item))
    return urls


def main():
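    """Write today's snapshot URLs, with fetched titles, to a markdown file."""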
    if len(sys.argv) > 1:
        base_dir = sys.argv[1]
        if not os.path.isdir(base_dir):
            print(f"Error: Directory '{base_dir}' not found.")
            sys.exit(1)
    else:
        base_dir = '.'

    today = datetime.now().strftime('%Y.%m.%d')
    output_filename = 'todays_urls.md'
    output_filepath = os.path.join(base_dir, output_filename)
    url_titles = {}  # Cache of URL -> page title so each URL is fetched once

    print(f"Searching for files in '{base_dir}' "
          f"starting with 'snapshot-{today}-'")

    with open(output_filepath, 'w') as md_file:
        md_file.write(f'# URLs from Sidebery Snapshots for '
                      f'{today.replace(".", "-")}\n\n')

        files_processed = 0
        for filename in sorted(os.listdir(base_dir)):
            if (filename.startswith(f'snapshot-{today}-')
                    and filename.endswith('.json')):
                files_processed += 1
                print(f"Processing file: "
                      f"{os.path.join(base_dir, filename)}")

                # Extract and format date and time from the filename
                # Example: snapshot-2026.01.25-13.19.29.json
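                #   -> clean_filename:  '2026.01.25-13.19.29'
                #   -> date_time_parts: ['2026.01.25', '13.19.29']
                #   -> datetime_str:    '2026-01-25 13:19:29'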
                clean_filename = (filename.replace('snapshot-', '')
                                  .replace('.json', ''))
                date_time_parts = clean_filename.split('-', 1)

                formatted_date = date_time_parts[0].replace('.', '-')
                formatted_time = date_time_parts[1].replace('.', ':')

                datetime_str = f"{formatted_date} {formatted_time}"

                md_file.write(f'## {datetime_str}\n\n')

                with open(os.path.join(base_dir, filename), 'r') as json_file:
                    try:
                        data = json.load(json_file)
                        urls = find_urls(data)
                        print(f" Found {len(urls)} URLs")
                        for url in urls:
                            if url not in url_titles:
                                try:
                                    # Fetch the page and use its <title>
                                    # as the link text.
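                                    # Any request failure (timeout, DNS,
                                    # SSL) is handled below by falling
                                    # back to the domain name.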
                                    res = requests.get(
                                        url,
                                        timeout=10,
                                        allow_redirects=True
                                    )
                                    soup = BeautifulSoup(
                                        res.text,
                                        'html.parser'
                                    )
                                    if soup.title and soup.title.string:
                                        title = soup.title.string.strip()
                                    else:
                                        domain = urlparse(url).netloc
                                        title = domain if domain else url
                                    url_titles[url] = title
                                except requests.exceptions.InvalidSchema:
                                    # Skip non-HTTP(S) schemes such as
                                    # about: or moz-extension: pages.
                                    continue
                                except Exception:
                                    domain = urlparse(url).netloc
                                    title = domain if domain else url
                                    url_titles[url] = title
                            if url in url_titles:
                                title = url_titles[url]
                                md_file.write(f'- [{title}]({url})\n')
                        md_file.write('\n')
                    except json.JSONDecodeError:
                        print(f" Error decoding JSON in {filename}")
                        md_file.write('- Error decoding JSON\n\n')

    if files_processed == 0:
        print("No files found for today.")

    print(f"Processing complete. Output written to {output_filepath}")


if __name__ == '__main__':
    main()