diff --git a/.gitignore b/.gitignore index 49f2aebc..406bb55b 100644 --- a/.gitignore +++ b/.gitignore @@ -168,3 +168,6 @@ cython_debug/ # Custom gitignore *.db # End of custom ignore + +*.csv +*.xlsx diff --git a/3rdparty/py/requirements-all.txt b/3rdparty/py/requirements-all.txt index 64830120..2694b8e3 100644 --- a/3rdparty/py/requirements-all.txt +++ b/3rdparty/py/requirements-all.txt @@ -1,3 +1,5 @@ +aiohttp==3.9.3 +backoff==2.2.1 boto3==1.40.23 celery==5.5.3 dj-rest-auth==7.0.1 @@ -17,8 +19,12 @@ greenlet==3.2.4 gunicorn[gevent, setproctitle]==23.0.0 html2text==2025.4.15 lxml==6.0.1 +openpyxl==3.1.5 +pandas==2.3.0 +pyairtable==2.3.3 redis==6.4.0 requests==2.32.5 +scrapy==2.12.0 sentry-sdk==2.36.0 tablib[xlsx]==3.8.0 trafilatura==1.12.2 diff --git a/content_access_bot/.env.example b/content_access_bot/.env.example new file mode 100644 index 00000000..4801ab9b --- /dev/null +++ b/content_access_bot/.env.example @@ -0,0 +1,5 @@ +AIRTABLE_BASE_ID= +AIRTABLE_API_KEY= +AIRTABLE_ORGANISATION_TABLE= +AIRTABLE_CONTENT_TABLE= +DB_FILE=content_access_bot.db diff --git a/content_access_bot/docker/BUILD b/content_access_bot/docker/BUILD new file mode 100644 index 00000000..7f61598f --- /dev/null +++ b/content_access_bot/docker/BUILD @@ -0,0 +1,33 @@ +python_sources() +docker_image( + name="content_access_bot-deps", + image_tags=["deps"], + build_platform=["linux/amd64", "linux/arm64"], + registries=["content_access_bot"], + repository="app", + skip_push=True, + source="Dockerfile.deps", +) + +file(name="app.json", source="app.json") + +docker_image( + name="content_access_bot-srcs", + image_tags=["srcs"], + build_platform=["linux/amd64", "linux/arm64"], + registries=["content_access_bot"], + repository="app", + skip_push=True, + source="Dockerfile.srcs", +) + +docker_image( + name="content_access_bot", + build_platform=["linux/amd64", "linux/arm64"], + dependencies=[":content_access_bot-srcs", ":content_access_bot-deps", ":app.json"], + image_tags=[ + "{build_args.VERSION}", + "latest", + ], + source="Dockerfile", +) diff --git a/content_access_bot/docker/Dockerfile b/content_access_bot/docker/Dockerfile new file mode 100644 index 00000000..f208c837 --- /dev/null +++ b/content_access_bot/docker/Dockerfile @@ -0,0 +1,11 @@ +FROM python:3.11-slim-bookworm AS python-base +FROM content_access_bot/app:deps AS app-deps +FROM content_access_bot/app:srcs AS app-srcs +FROM python-base AS python-app + +WORKDIR /app +COPY content_access_bot/docker/app.json ./ +COPY --from=app-deps /app ./ +COPY --from=app-srcs /app ./ + +CMD ["tail", "-f", "/dev/null"] diff --git a/content_access_bot/docker/Dockerfile.deps b/content_access_bot/docker/Dockerfile.deps new file mode 100644 index 00000000..3f502fc9 --- /dev/null +++ b/content_access_bot/docker/Dockerfile.deps @@ -0,0 +1,4 @@ +FROM python:3.11-slim-bookworm + +COPY content_access_bot.py/content_access_bot-deps@environment=linux.pex /content_access_bot-deps.pex +RUN PEX_TOOLS=1 python /content_access_bot-deps.pex venv --scope=deps --compile /app diff --git a/content_access_bot/docker/Dockerfile.srcs b/content_access_bot/docker/Dockerfile.srcs new file mode 100644 index 00000000..9a280e85 --- /dev/null +++ b/content_access_bot/docker/Dockerfile.srcs @@ -0,0 +1,4 @@ +FROM python:3.11-slim-bookworm + +COPY content_access_bot.py/content_access_bot-srcs@environment=linux.pex /content_access_bot-srcs.pex +RUN PEX_TOOLS=1 python /content_access_bot-srcs.pex venv --scope=srcs --compile /app diff --git a/content_access_bot/docker/app.json 
b/content_access_bot/docker/app.json new file mode 100644 index 00000000..5b55346a --- /dev/null +++ b/content_access_bot/docker/app.json @@ -0,0 +1,9 @@ +{ + "name": "content_access_bot", + "cron": [ + { + "command": "./pex", + "schedule": "@daily" + } + ] +} diff --git a/content_access_bot/py/BUILD b/content_access_bot/py/BUILD new file mode 100644 index 00000000..e6e49b14 --- /dev/null +++ b/content_access_bot/py/BUILD @@ -0,0 +1,46 @@ +python_sources( + name="lib", + dependencies=[ + "3rdparty/py:requirements-all#aiohttp", + "3rdparty/py:requirements-all#backoff", + "3rdparty/py:requirements-all#environs", + "3rdparty/py:requirements-all#pyairtable", + "3rdparty/py:requirements-all#scrapy", + "3rdparty/py:requirements-all#openpyxl", + "3rdparty/py:requirements-all#pandas", + "content_access_bot/py/pipeline.py:lib" + ], +) + +pex_binary( + name="content_access_bot-deps", + environment=parametrize("__local__", "linux"), + dependencies=[ + ":lib", + ], + entry_point="main.py", + include_sources=False, + include_tools=True, + layout="packed", +) + +pex_binary( + name="content_access_bot-srcs", + environment=parametrize("__local__", "linux"), + dependencies=[ + ":lib", + ], + entry_point="main.py", + include_requirements=False, + include_tools=True, + layout="packed", +) + + +pex_binary( + name="content_access_bot", + dependencies=[ + ":lib", + ], + entry_point="main.py", +) diff --git a/content_access_bot/py/VERSION b/content_access_bot/py/VERSION new file mode 100644 index 00000000..8acdd82b --- /dev/null +++ b/content_access_bot/py/VERSION @@ -0,0 +1 @@ +0.0.1 diff --git a/content_access_bot/py/airtable.py b/content_access_bot/py/airtable.py new file mode 100644 index 00000000..013f8867 --- /dev/null +++ b/content_access_bot/py/airtable.py @@ -0,0 +1,85 @@ +from pyairtable import Api +from utils import validate_url, clean_url +import os +import logging +import re +from environs import Env +env = Env() +dotenv_path = os.path.join(os.path.dirname(__file__), '..', '.env') + +env.read_env(dotenv_path) + + +logging.basicConfig(level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s') + +api_key = os.getenv('AIRTABLE_API_KEY') +base_id = os.getenv('AIRTABLE_BASE_ID') +organisations_table = os.getenv('AIRTABLE_ORGANISATION_TABLE') +content_table = os.getenv('AIRTABLE_CONTENT_TABLE') + +if not api_key or not base_id or not organisations_table or not content_table: + raise ValueError('API key, base ID and Organisation table are required') + +at = Api(api_key) + + +def get_table_data(table_name, formula=None, fields=None): + if not base_id: + logging.error(f"AIRTABLE_BASE_ID Not Provided") + return + table = at.table(base_id, table_name) + return table.all(formula=formula, fields=fields) + + +def get_formula(allowed_countries=None): + base_formula = 'AND(NOT({Organisation Name} = ""), NOT({Website} = ""), NOT({HQ Country} = ""))' + if allowed_countries: + countries_formula = ', '.join( + [f'({{HQ Country}} = "{country}")' for country in allowed_countries]) + formula = f'AND({base_formula}, OR({countries_formula}))' + else: + formula = base_formula + return formula + + +def process_records(data): + organizations = [] + for record in data: + website = validate_url(record['fields'].get('Website', None)) + name = record['fields'].get('Organisation Name', None) + country = record['fields'].get('HQ Country', None) + id: str = record['id'] + if website: + org = {} + org['id'] = id + org['name'] = re.sub( + r'[\\/*?:"<>|]', '-', name) if name else None + org['url'] = clean_url(website) + 
org['country'] = country + + organizations.append(org) + return organizations + + +def get_organizations(allowed_countries=None): + logging.info('Fetching organizations from Airtable') + formula = get_formula(allowed_countries) + fields = ['Organisation Name', 'Website', 'HQ Country'] + data = get_table_data(organisations_table, formula, fields) + organizations = process_records(data) + logging.info(f'Fetched {len(organizations)} organizations') + return organizations + + +async def batch_upsert_organizations(data): + logging.info('Upserting organizations in Airtable') + try: + if not base_id or not content_table: + logging.error(f"AIRTABLE_BASE_ID or AIRTABLE_CONTENT_TABLE Not Provided") + return + table = at.table(base_id, content_table) + table.batch_upsert(records=data, key_fields=['id',]) + logging.info('Organizations upserted successfully') + except Exception as e: + logging.error(f'Error upserting organization: {e}') diff --git a/content_access_bot/py/db.py b/content_access_bot/py/db.py new file mode 100644 index 00000000..3ad6e46c --- /dev/null +++ b/content_access_bot/py/db.py @@ -0,0 +1,340 @@ +import os +from environs import Env, env +from sqlite3 import Error, connect +import logging +from dataclasses import dataclass +from typing import Optional +from datetime import datetime + +env = Env() +dotenv_path = os.path.join(os.path.dirname(__file__), ".env") +env.read_env(dotenv_path) + +logging.basicConfig(level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s') + +@dataclass +class MediaHouse: + name: str + country: str + url: str + airtable_id: str + id: Optional[str] = None + +@dataclass +class SiteCheck: + airtable_id: str + site_status: Optional[str] = None + site_reachable: Optional[bool] = None + site_redirect: Optional[bool] = None + final_url: Optional[str] = None + robots_url: Optional[str] = None + robots_timestamp: Optional[str] = None + robots_content: Optional[str] = None + robots_status: Optional[str] = None + check_timestamp: Optional[str] = None + +class Database: + def __init__(self): + self.db_file = os.getenv('DB_FILE') or 'media_data' + self.conn = self.create_connection() + self.create_table() + + def create_connection(self): + try: + conn = connect(self.db_file, timeout=30) + return conn + except Error as e: + logging.error(f"Error creating connection: {e}") + + def is_connected(self): + return self.conn is not None + + def create_table(self): + create_table_sql = """ + CREATE TABLE IF NOT EXISTS media_data ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + name TEXT NOT NULL, + country TEXT NOT NULL, + url TEXT NOT NULL, + airtable_id TEXT NOT NULL UNIQUE + ); + + CREATE TABLE IF NOT EXISTS site_checks ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + airtable_id TEXT NOT NULL, + site_status TEXT, + site_reachable BOOLEAN, + site_redirect BOOLEAN, + final_url TEXT, + robots_url TEXT, + robots_timestamp TEXT, + robots_content TEXT, + robots_status TEXT, + check_timestamp TEXT NOT NULL, + FOREIGN KEY(airtable_id) REFERENCES media_data(airtable_id) + ); + + CREATE TABLE IF NOT EXISTS internet_archive_snapshots( + id INTEGER PRIMARY KEY AUTOINCREMENT, + airtable_id TEXT NOT NULL, + url TEXT NOT NULL, + archive_date TEXT NOT NULL UNIQUE, + archive_robots_url TEXT, + archived_content TEXT, + archived_retrieval_date TEXT, + FOREIGN KEY(airtable_id) REFERENCES media_data(airtable_id) + ); + """ + try: + if self.conn is not None: + cursor = self.conn.cursor() + cursor.executescript(create_table_sql) + self.conn.commit() + logging.info("Database tables created or 
already exist.") + else: + logging.error("Database connection is not established. Table creation skipped.") + except Error as e: + logging.error(f"Error creating table: {e}") + + def insert_media_house(self, media_house: MediaHouse): + try: + sql = """ + INSERT OR IGNORE INTO media_data(name, country, url, airtable_id) + VALUES(?, ?, ?, ?) + """ + if self.conn is not None: + cur = self.conn.cursor() + cur.execute(sql, (media_house.name, media_house.country, + media_house.url, media_house.airtable_id)) + self.conn.commit() + return cur.lastrowid + else: + logging.error("Database connection is not established.") + except Error as e: + logging.error(f"Error inserting media house: {e}") + + def close_connection(self, cur): + if cur is not None: + cur.close() + + def select_media_houses_without_status(self): + """Legacy method - now returns media houses without recent checks""" + return self.select_media_houses_without_recent_check() + + def select_media_houses_without_recent_check(self, max_age_days=7): + """Select media houses that haven't been checked recently or never checked""" + cur = None + try: + if self.conn is not None: + cur = self.conn.cursor() + # Get media houses with no recent site checks + sql = """ + SELECT md.* FROM media_data md + LEFT JOIN ( + SELECT airtable_id, MAX(check_timestamp) as latest_check + FROM site_checks + GROUP BY airtable_id + ) sc ON md.airtable_id = sc.airtable_id + WHERE sc.latest_check IS NULL + OR datetime(sc.latest_check) < datetime('now', '-{} days') + """.format(max_age_days) + cur.execute(sql) + rows = cur.fetchall() + column_names = [column[0] for column in cur.description] + dict_rows = [dict(zip(column_names, row)) for row in rows] + return dict_rows + except Error as e: + logging.error(f"Error: {e}") + finally: + self.close_connection(cur) + + def insert_site_check(self, site_check: SiteCheck): + """Insert a new site check record""" + cur = None + try: + if self.conn is not None: + sql = """ + INSERT INTO site_checks( + airtable_id, site_status, site_reachable, site_redirect, + final_url, robots_url, robots_timestamp, robots_content, + robots_status, check_timestamp + ) + VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
+ """ + cur = self.conn.cursor() + check_time = site_check.check_timestamp or datetime.now().strftime("%Y%m%d%H%M%S") + cur.execute(sql, ( + site_check.airtable_id, site_check.site_status, + site_check.site_reachable, site_check.site_redirect, + site_check.final_url, site_check.robots_url, + site_check.robots_timestamp, site_check.robots_content, + site_check.robots_status, check_time + )) + self.conn.commit() + return cur.lastrowid + except Error as e: + logging.error(f"Error inserting site check: {e}") + finally: + self.close_connection(cur) + + def update_site_status(self, airtable_id, site_status, site_reachable, site_redirect, final_url): + """Legacy method - now creates a new site check record for status update""" + site_check = SiteCheck( + airtable_id=airtable_id, + site_status=site_status, + site_reachable=site_reachable, + site_redirect=site_redirect, + final_url=final_url + ) + return self.insert_site_check(site_check) + + def insert_current_robots(self, airtable_id, robots_url, robots_timestamp, robots_content, robots_status): + """Legacy method - now creates a new site check record for robots update""" + site_check = SiteCheck( + airtable_id=airtable_id, + robots_url=robots_url, + robots_timestamp=robots_timestamp, + robots_content=robots_content, + robots_status=robots_status + ) + return self.insert_site_check(site_check) + + def get_all_media_houses(self): + cur = None + try: + if self.conn is not None: + cur = self.conn.cursor() + cur.execute("SELECT * FROM media_data") + rows = cur.fetchall() + column_names = [column[0] for column in cur.description] + dict_rows = [dict(zip(column_names, row)) for row in rows] + return dict_rows + except Error as e: + logging.error(f"Error: {e}") + finally: + self.close_connection(cur) + + def get_latest_site_checks(self): + """Get the most recent site check for each media house""" + cur = None + try: + if self.conn is not None: + cur = self.conn.cursor() + sql = """ + SELECT sc.* FROM site_checks sc + INNER JOIN ( + SELECT airtable_id, MAX(check_timestamp) as latest_check + FROM site_checks + GROUP BY airtable_id + ) latest ON sc.airtable_id = latest.airtable_id + AND sc.check_timestamp = latest.latest_check + """ + cur.execute(sql) + rows = cur.fetchall() + column_names = [column[0] for column in cur.description] + dict_rows = [dict(zip(column_names, row)) for row in rows] + return dict_rows + except Error as e: + logging.error(f"Error: {e}") + finally: + self.close_connection(cur) + + def insert_internet_archive_snapshot_url(self, airtable_id, url, archive_date): + try: + sql = """ + INSERT INTO internet_archive_snapshots(airtable_id, url, archive_date) + VALUES(?, ?, ?) 
+ """ + if self.conn is not None: + cur = self.conn.cursor() + cur.execute(sql, (airtable_id, url, archive_date)) + self.conn.commit() + return cur.lastrowid + else: + logging.error("Database connection is not established.") + except Error as e: + logging.error(f"Error inserting archive snapshot: {e}") + + def get_all_internet_archive_snapshots(self): + cur = None + try: + if self.conn is not None: + cur = self.conn.cursor() + cur.execute("SELECT * FROM internet_archive_snapshots") + rows = cur.fetchall() + column_names = [column[0] for column in cur.description] + dict_rows = [dict(zip(column_names, row)) for row in rows] + return dict_rows + except Error as e: + logging.error(f"Error: {e}") + finally: + self.close_connection(cur) + + def insert_internet_archive_snapshot_robots(self, id, archive_robots_url, archived_content, archived_retrieval_date): + cur = None + try: + if self.conn is not None: + sql = """ + UPDATE internet_archive_snapshots + SET archive_robots_url = ?, archived_content = ?, archived_retrieval_date = ? + WHERE id = ? + """ + cur = self.conn.cursor() + cur.execute(sql, (archive_robots_url, archived_content, archived_retrieval_date, id)) + self.conn.commit() + except Error as e: + logging.error(f"Error: {e}") + finally: + self.close_connection(cur) + + def get_combided_data(self): + """Legacy method name - calls get_combined_data""" + return self.get_combined_data() + + def get_combined_data(self): + """Get media data with latest site checks and archive snapshots""" + cur = None + try: + if self.conn is not None: + cur = self.conn.cursor() + # Get media data with latest site check + sql = """ + SELECT + md.*, + sc.site_status, sc.site_reachable, sc.site_redirect, sc.final_url, + sc.robots_url, sc.robots_timestamp, sc.robots_content, sc.robots_status, + sc.check_timestamp + FROM media_data md + LEFT JOIN ( + SELECT sc1.* FROM site_checks sc1 + INNER JOIN ( + SELECT airtable_id, MAX(check_timestamp) as latest_check + FROM site_checks + GROUP BY airtable_id + ) sc2 ON sc1.airtable_id = sc2.airtable_id + AND sc1.check_timestamp = sc2.latest_check + ) sc ON md.airtable_id = sc.airtable_id + """ + cur.execute(sql) + media_rows = cur.fetchall() + media_columns = [column[0] for column in cur.description] + combined = [] + + for media_row in media_rows: + media_dict = dict(zip(media_columns, media_row)) + # Get all snapshots for this airtable_id + cur.execute( + "SELECT * FROM internet_archive_snapshots WHERE airtable_id = ?", + (media_dict["airtable_id"],) + ) + snapshot_rows = cur.fetchall() + snapshot_columns = [column[0] for column in cur.description] + snapshots = [dict(zip(snapshot_columns, row)) for row in snapshot_rows] + media_dict["snapshots"] = snapshots + combined.append(media_dict) + return combined + except Error as e: + logging.error(f"Error: {e}") + finally: + self.close_connection(cur) diff --git a/content_access_bot/py/diff.py b/content_access_bot/py/diff.py new file mode 100644 index 00000000..eefaeea7 --- /dev/null +++ b/content_access_bot/py/diff.py @@ -0,0 +1,68 @@ +ai_crawlers = [ + "Amazonbot", + "anthropic-ai", + "AwarioRssBot", + "AwarioSmartBot", + "Bard", + "Bloom", + "Bytespider", + "CCBot", + "ChatGPT", + "ChatGPT-User", + "ClaudeBot", + "Claude-Web", + "cohere-ai" + "DataForSeoBot", + "Diffbot", + "FacebookBot", + "GPT-4", + "GPT-Neo", + "GPTBot", + "Google-Extended", + "GoogleOther", + "HuggingFace-Transformers", + "LaMDA", + "Megatron-Turing-NLG", + "magpie-crawler", + "Meltwater", + "NewsNow", + "news-please", + "omgili", + "OmigiliBot", + 
"PaLM", + "peer39_crawler", + "peer39_crawler/1.0", + "PerplexityBot" + "TurnitinBot", + "Seekr", + "Scrapy", + "Wu-Dao-2.0", +] + + +def diff_robot_content(current_robots_content: str, archived_robots_content: str): + """ + Compares two robots.txt contents. + Returns: + - blocks_crawlers: True if current robots.txt blocks any AI crawlers + - blocked_crawlers: List of AI crawlers blocked in current robots.txt + - ai_blocking_update: True if current robots.txt blocks AI crawlers but archived did not + """ + current_content = current_robots_content or "" + archived_content = archived_robots_content or "" + + blocked_crawlers = [ + crawler for crawler in ai_crawlers if crawler.casefold() in current_content.casefold() + ] + previously_blocked_crawlers = [ + crawler for crawler in ai_crawlers if crawler.casefold() in archived_content.casefold() + ] + + blocks_crawlers = bool(blocked_crawlers) + ai_blocking_update = blocks_crawlers and not previously_blocked_crawlers + + return { + "blocks_crawlers": blocks_crawlers, + "blocked_crawlers": ', '.join(blocked_crawlers), + "ai_blocking_update": ai_blocking_update, + } diff --git a/content_access_bot/py/main.py b/content_access_bot/py/main.py new file mode 100644 index 00000000..f49eba82 --- /dev/null +++ b/content_access_bot/py/main.py @@ -0,0 +1,247 @@ +from datetime import datetime, timedelta +import logging +import time +import logging +import asyncio +from airtable import get_organizations, batch_upsert_organizations +from scrapy.crawler import CrawlerProcess +import pandas as pd +from db import Database, MediaHouse, SiteCheck +from diff import diff_robot_content +from spider import ArchivedRobotsSpider, ArchivedURLsSpider, RobotsSpider +from utils import check_site_availability, get_robots_url,find_closest_snapshot,format_db_date + + +MAX_ROBOTS_AGE = 7 # No of Days to skip fetching of current robots +MAX_INTERNATE_ARCHIVE_AGE =365 + +logging.basicConfig(level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s') + +async def fetch_orgs(db:Database): + organizations = get_organizations() + for media_house in organizations: + media_house_obj = MediaHouse( + media_house['name'], media_house['country'], media_house['url'], media_house['id'] + ) + db.insert_media_house(media_house_obj) + + +async def check_org_sites(db:Database): + unchecked_orgs = db.select_media_houses_without_recent_check() + if not unchecked_orgs: + logging.info(f"No sites to check") + return + count = len(unchecked_orgs) if unchecked_orgs is not None else 0 + logging.info(f"Checking {count} sites") + + async def update_org_site(org): + site_status = await check_site_availability(org['url']) + db.update_site_status( + org['airtable_id'], site_status['status_code'], + site_status['reachable'], site_status['redirect'], site_status['final_url'] + ) + #TODO:Use Spider to check sites + await asyncio.gather(*(update_org_site(org) for org in unchecked_orgs)) + logging.info("Finished checking Sites") + + +async def fetch_robots(db: Database): + all_media_houses = db.get_all_media_houses() + if not all_media_houses: + logging.info(f"No sites to check") + return + + # Get latest site checks to determine which need robot fetching + latest_checks = db.get_latest_site_checks() + check_dict = {check['airtable_id']: check for check in latest_checks} + + filtered_media_houses = [] + today = datetime.now() + + for media_house in all_media_houses: + airtable_id = media_house['airtable_id'] + latest_check = check_dict.get(airtable_id) + + if not latest_check or not 
+            filtered_media_houses.append(media_house)
+            continue
+
+        robots_timestamp = latest_check.get('robots_timestamp')
+        if robots_timestamp:
+            try:
+                robots_date = datetime.strptime(robots_timestamp, "%Y%m%d%H%M%S")
+                if (today - robots_date).days > MAX_ROBOTS_AGE:
+                    filtered_media_houses.append(media_house)
+            except Exception as e:
+                logging.warning(f"Invalid robots_timestamp for {airtable_id}: {robots_timestamp}")
+
+    count = len(filtered_media_houses)
+    if count == 0:
+        logging.info("No robots to fetch within the specified timeframe.")
+        return
+
+    logging.info(f"Fetching robots.txt for {count} sites")
+    urls = [(media_house['airtable_id'], get_robots_url(media_house['url']))
+            for media_house in filtered_media_houses]
+    process = CrawlerProcess(settings={
+        'ITEM_PIPELINES': {
+            'pipeline.RobotsDatabasePipeline': 1
+        },
+    }, install_root_handler=False)
+    process.crawl(RobotsSpider, urls)
+    process.start()
+
+async def fetch_internet_archive_snapshots(db: Database):
+    logging.info("Fetching Internet Archive snapshot URLs")
+    all_media_houses = db.get_all_media_houses()
+    if not all_media_houses:
+        logging.info("No sites to fetch Internet Archive snapshots for")
+        return
+
+    all_archive_snapshots = db.get_all_internet_archive_snapshots()
+    # Get set of airtable_ids that already have snapshots
+    fetched_airtable_ids = set(s['airtable_id'] for s in all_archive_snapshots) if all_archive_snapshots else set()
+    # Filter only media houses not yet fetched
+    to_fetch = [media_house for media_house in all_media_houses if media_house['airtable_id'] not in fetched_airtable_ids]
+
+    count = len(to_fetch)
+    if count == 0:
+        logging.info("No new sites to fetch Internet Archive snapshots for.")
+        return
+
+    logging.info(f"Fetching Internet Archive snapshots for {count} sites")
+    target_date = (datetime.now() - timedelta(days=MAX_INTERNET_ARCHIVE_AGE)).strftime("%Y%m%d%H%M%S")
+    urls = [(media_house['airtable_id'], media_house['url']) for media_house in to_fetch]
+    process = CrawlerProcess(settings={
+        'ITEM_PIPELINES': {
+            'pipeline.ArchivedURLsDatabasePipeline': 2
+        },
+    }, install_root_handler=False)
+    process.crawl(ArchivedURLsSpider, urls=urls, target_date=target_date)
+    process.start()
+
+async def fetch_archived_robots(db: Database):
+    logging.info("Fetching archived robots.txt")
+    all_archived_snapshot_url = db.get_all_internet_archive_snapshots()
+    if not all_archived_snapshot_url:
+        logging.info("No archived snapshots to fetch robots.txt from")
+        return
+
+    today = datetime.now()
+    filtered_snapshots = []
+    for snapshot in all_archived_snapshot_url:
+        archived_content = snapshot.get('archived_content')
+        archived_retrieval_date = snapshot.get('archived_retrieval_date')
+        # If archived_content is None or empty, always include
+        if not archived_content:
+            filtered_snapshots.append(snapshot)
+            continue
+        # If archived_retrieval_date exists, check if it's older than MAX_ROBOTS_AGE days
+        if archived_retrieval_date:
+            try:
+                retrieval_date = datetime.strptime(archived_retrieval_date, "%Y%m%d%H%M%S")
+                if (today - retrieval_date).days > MAX_ROBOTS_AGE:
+                    filtered_snapshots.append(snapshot)
+            except Exception as e:
+                logging.warning(f"Invalid archived_retrieval_date for {snapshot.get('id')}: {archived_retrieval_date}")
+
+    count = len(filtered_snapshots)
+    if count == 0:
+        logging.info("No archived robots to fetch within the specified timeframe.")
+        return
+
+    logging.info(f"Fetching archived robots.txt for {count} snapshots")
+    urls = [(snapshot['id'], f"{snapshot['url']}/robots.txt")
+            for snapshot in filtered_snapshots]
+    process = CrawlerProcess(settings={
+        'ITEM_PIPELINES': {
+            'pipeline.ArchivedRobotsDatabasePipeline': 3
+        },
+    }, install_root_handler=False)
+    process.crawl(ArchivedRobotsSpider, urls)
+    process.start()
+
+
+async def generate_report(db: Database):
+    combined_data = db.get_combined_data()
+    if not combined_data:
+        logging.info("No data to generate report from")
+        return
+    target_date = (datetime.now() - timedelta(days=MAX_INTERNET_ARCHIVE_AGE)).strftime("%Y%m%d%H%M%S")
+    report_rows = []
+
+    for media in combined_data:
+        snapshots = media.get("snapshots", [])
+        closest_snapshot = find_closest_snapshot(snapshots, target_date, date_key="archive_date")
+        archived_content = ""
+        row = {
+            "Name": media.get("name"),
+            "Country": media.get("country"),
+            "URL": media.get("url"),
+            "Airtable ID": media.get("airtable_id"),
+            "Site Status": media.get("site_status"),
+            "Site Reachable": bool(media.get("site_reachable")),
+            "Site Redirect": bool(media.get("site_redirect")),
+            "Final URL": media.get("final_url"),
+            "Robots URL": media.get("robots_url"),
+            "Date Robots Fetched": format_db_date(media.get("robots_timestamp")),
+            "Robot Content": (
+                "''" if media.get("robots_content") == "" else media.get("robots_content")
+            ),
+            "Robot Status": media.get("robots_status"),
+        }
+        if closest_snapshot:
+            row.update({
+                "Archive URL": closest_snapshot.get("url"),
+                "Archive Date": format_db_date(closest_snapshot.get("archive_date")),
+                "Archive Robots URL": closest_snapshot.get("archive_robots_url"),
+                "Archive Robot Content": (
+                    "''" if closest_snapshot.get("archived_content") == "" else closest_snapshot.get("archived_content")
+                ),
+                "Archive Retrieval Date": format_db_date(closest_snapshot.get("archived_retrieval_date")),
+            })
+            archived_content = closest_snapshot.get("archived_content")
+        else:
+            row.update({
+                "Archive URL": None,
+                "Archive Date": None,
+                "Archive Robots URL": None,
+                "Archive Robot Content": None,
+                "Archive Retrieval Date": None,
+            })
+
+        diff_data = diff_robot_content(media.get("robots_content"), archived_content)
+
+        row.update({
+            "Blocks AI Crawlers": diff_data['blocks_crawlers'],
+            "Blocked AI Crawler": diff_data['blocked_crawlers'],
+            "Update Robots to block AI": diff_data['ai_blocking_update']
+        })
+        report_rows.append(row)
+
+    df = pd.DataFrame(report_rows)
+    filename = f"Report-{target_date}.xlsx"
+    df.to_excel(filename, index=False)
+
+async def main(db: Database):
+    await fetch_orgs(db)
+    await check_org_sites(db)  # Often not required unless site status is needed
+    await fetch_robots(db)
+    await fetch_internet_archive_snapshots(db)
+    await fetch_archived_robots(db)
+    await generate_report(db)
+
+
+if __name__ == "__main__":
+    try:
+        start_time = time.time()
+        db = Database()
+        if not db.is_connected():
+            logging.error("Failed to connect to the database")
+            exit(1)
+        asyncio.run(main(db))
+    except Exception as e:
+        logging.error(f"An error occurred: {e}")
+
diff --git a/content_access_bot/py/pipeline.py b/content_access_bot/py/pipeline.py
new file mode 100644
index 00000000..45d9ad5d
--- /dev/null
+++ b/content_access_bot/py/pipeline.py
@@ -0,0 +1,47 @@
+
+from db import Database
+
+
+class RobotsDatabasePipeline:
+    def __init__(self):
+        self.db = Database()
+
+    def process_item(self, item, spider):
+        self.db.insert_current_robots(
+            item["airtable_id"],
+            item["robots_url"],
+            item["robots_timestamp"],
+            item["robots_content"],
+            item["robots_status"]
+        )
+        return item
+
+class ArchivedURLsDatabasePipeline:
+    def __init__(self):
+        self.db
= Database() + + def process_item(self, item, spider): + # Save the archived URL to the DB + self.db.insert_internet_archive_snapshot_url( + item["airtable_id"], + item["url"], + item["archive_date"] + ) + return item + +class ArchivedRobotsDatabasePipeline: + def __init__(self): + self.db = Database() + + def process_item(self, item, spider): + # Save the archived robots to the DB + print("ArchivedRobotsDatabasePipeline:", item) + self.db.insert_internet_archive_snapshot_robots( + item["id"], + item["archive_robots_url"], + item["archived_content"], + item["archived_retrieval_date"] + ) + return item + + \ No newline at end of file diff --git a/content_access_bot/py/spider.py b/content_access_bot/py/spider.py new file mode 100644 index 00000000..4087a6ba --- /dev/null +++ b/content_access_bot/py/spider.py @@ -0,0 +1,103 @@ +import datetime +import scrapy + +from utils import find_closest_snapshot + + + +class RobotsSpider(scrapy.Spider): + name = 'robots' + start_urls = [] + + def __init__(self, urls=None, *args, **kwargs): + super(RobotsSpider, self).__init__(*args, **kwargs) + if urls: + self.start_urls = urls + + def start_requests(self): + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3" + } + + for airtable_id, url in self.start_urls: + yield scrapy.Request(url=url, callback=self.parse, meta={'airtable_id': airtable_id}, headers=headers) + + def parse(self, response): + yield { + "airtable_id":response.meta['airtable_id'], + "robots_url": response.url, + "robots_timestamp": datetime.datetime.now().strftime("%Y%m%d%H%M%S"), + "robots_content":response.text, + "robots_status":response.status + } +class ArchivedURLsSpider(scrapy.Spider): + name = 'archived_urls' + start_urls = [] + + def __init__(self, urls=None, target_date=None, *args, **kwargs): + super().__init__(*args, **kwargs) + if urls: + self.start_urls = urls + # target_date should be a string like "20230618000000" + self.target_date = target_date or (datetime.datetime.now() - datetime.timedelta(days=365)).strftime("%Y%m%d%H%M%S") + + def start_requests(self): + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3" + } + for airtable_id, url in self.start_urls: + cdx_url = f"https://web.archive.org/cdx/search/cdx?url={url}" + yield scrapy.Request( + url=cdx_url, + callback=self.parse_cdx, + meta={'airtable_id': airtable_id, 'url':url}, + headers=headers + ) + + def parse_cdx(self, response): + url = response.meta['url'] + airtable_id = response.meta['airtable_id'] + lines = response.text.strip().split("\n") + snapshots = [] + for line in lines: + fields = line.split(" ") + if len(fields) == 7: + timestamp = fields[1] + status= fields[4] + snapshots.append({ + "url": f"https://web.archive.org/web/{timestamp}if_/{url}", + "timestamp": timestamp, + }) + closest = find_closest_snapshot(snapshots, self.target_date) + print("Closest Snapshot:", closest) + if closest: + yield { + "airtable_id": airtable_id, + "url": closest['url'], + "archive_date": closest["timestamp"] + } + +class ArchivedRobotsSpider(scrapy.Spider): + name = 'archived_robots' + start_urls = [] + + def __init__(self, urls=None, *args, **kwargs): + super(ArchivedRobotsSpider, self).__init__(*args, **kwargs) + if urls: + self.start_urls = urls + + def start_requests(self): + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) 
Chrome/58.0.3029.110 Safari/537.3" + } + + for id, url in self.start_urls: + yield scrapy.Request(url=url, callback=self.parse, meta={'id': id}, headers=headers) + + def parse(self, response): + yield { + "id": response.meta['id'], + "archive_robots_url":response.url, + "archived_content":response.text, + "archived_retrieval_date":datetime.datetime.now().strftime("%Y%m%d%H%M%S") + } diff --git a/content_access_bot/py/utils.py b/content_access_bot/py/utils.py new file mode 100644 index 00000000..d81e7b44 --- /dev/null +++ b/content_access_bot/py/utils.py @@ -0,0 +1,116 @@ +import re +from urllib.parse import urlparse, urlunparse +import aiohttp +from datetime import datetime, timedelta + + +def validate_url(url): + regex = re.compile( + r'^(?:http|ftp)s?://' # http:// or https:// + # domain... + r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' + r'localhost|' # localhost... + r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip + r'(?::\d+)?' # optional port + r'(?:/?|[/?]\S+)$', re.IGNORECASE) + + parsed_url = urlparse(url) + if parsed_url.scheme == '': + url = 'http://' + url + parsed_url = urlparse(url) + + url_unparsed = urlunparse(parsed_url) + if isinstance(url_unparsed, bytes): + url_str = url_unparsed.decode('utf-8') + else: + url_str = url_unparsed + + if re.match(regex, url_str) is not None: + return url_str + return None + + +def clean_url(url): + parsed_url = urlparse(url) + cleaned_url = urlunparse( + (parsed_url.scheme, parsed_url.netloc, "", "", "", "")) + return cleaned_url.rstrip('/') + + +def url_redirects(original, final): + parsed_original = urlparse(original) + parsed_final = urlparse(final) + + original_netloc_path = parsed_original.netloc.replace( + 'www.', '') + parsed_original.path.rstrip('/') + final_netloc_path = parsed_final.netloc.replace( + 'www.', '') + parsed_final.path.rstrip('/') + + return original_netloc_path != final_netloc_path + + +async def check_site_availability(url: str): + async with aiohttp.ClientSession() as session: + try: + async with session.get(url, allow_redirects=True) as response: + return { + "status_code": response.status, + "reachable": True, + "redirect": url_redirects(url, str(response.url)), + "final_url": str(response.url) + } + except Exception: + return { + "status_code": None, + "reachable": False, + "redirect": False, + "final_url": None + } + + +def get_robots_url(url: str): + parsed_url = urlparse(url) + robots_url = urlunparse( + (parsed_url.scheme, parsed_url.netloc, "/robots.txt", "", "", "")) + return robots_url.rstrip('/') + +def is_within_time_frame(date_str, days, date_format="%Y-%m-%d"): + """ + Returns True if date_str is within 'days' from today. + date_str: string date (e.g. '2024-06-19') + days: int, number of days from today + date_format: format of date_str (default: '%Y-%m-%d') + """ + target_date = datetime.strptime(date_str, date_format) + today = datetime.today() + delta = today - target_date + return 0 <= delta.days <= days + + +def find_closest_snapshot(snapshots, date, date_key="timestamp"): + """ + Finds the snapshot closest to the given date. + If there are snapshots before or on the date, returns the latest one before or on the date. + If all snapshots are after the date, returns the oldest snapshot. 
+ """ + if not snapshots: + return None + + snapshots_sorted = sorted(snapshots, key=lambda x: x[date_key]) + before_or_on = [s for s in snapshots_sorted if s[date_key] <= date] + if before_or_on: + return before_or_on[-1] + else: + return snapshots_sorted[0] + +def format_db_date(date_str): + """ + Converts a date string like '20240619120000' to 'YYYY-MM-DD HH:MM:SS'. + Returns None if input is None or invalid. + """ + if not date_str: + return None + try: + return datetime.strptime(date_str, "%Y%m%d%H%M%S").strftime("%Y-%m-%d %H:%M:%S") + except Exception: + return date_str diff --git a/pants.toml b/pants.toml index 60faf5b6..74552680 100644 --- a/pants.toml +++ b/pants.toml @@ -40,6 +40,8 @@ root_patterns = [ "/pants-plugins", "/pesacheck_meedan_bridge/py", "/pesacheck_meedan_bridge/docker", + "/content_access_bot/py", + "/content_access_bot/docker", ] [python] @@ -57,7 +59,7 @@ args = ["--quiet"] [mypy] requirements = ["django-stubs"] -interpreter_constraints = ["==3.11.*"] +interpreter_constraints = ["==3.10.*","==3.11.*", "==3.12.*", "==3.13.*"] [pyupgrade] args = ["--py36-plus"]