diff --git a/.gitignore b/.gitignore index 49f2aebc..406bb55b 100644 --- a/.gitignore +++ b/.gitignore @@ -168,3 +168,6 @@ cython_debug/ # Custom gitignore *.db # End of custom ignore + +*.csv +*.xlsx diff --git a/3rdparty/py/requirements-all.txt b/3rdparty/py/requirements-all.txt index 64830120..2694b8e3 100644 --- a/3rdparty/py/requirements-all.txt +++ b/3rdparty/py/requirements-all.txt @@ -1,3 +1,5 @@ +aiohttp==3.9.3 +backoff==2.2.1 boto3==1.40.23 celery==5.5.3 dj-rest-auth==7.0.1 @@ -17,8 +19,12 @@ greenlet==3.2.4 gunicorn[gevent, setproctitle]==23.0.0 html2text==2025.4.15 lxml==6.0.1 +openpyxl==3.1.5 +pandas==2.3.0 +pyairtable==2.3.3 redis==6.4.0 requests==2.32.5 +scrapy==2.12.0 sentry-sdk==2.36.0 tablib[xlsx]==3.8.0 trafilatura==1.12.2 diff --git a/content_access_bot/.env.example b/content_access_bot/.env.example new file mode 100644 index 00000000..4801ab9b --- /dev/null +++ b/content_access_bot/.env.example @@ -0,0 +1,5 @@ +AIRTABLE_BASE_ID= +AIRTABLE_API_KEY= +AIRTABLE_ORGANISATION_TABLE= +AIRTABLE_CONTENT_TABLE= +DB_FILE=content_access_bot.db diff --git a/content_access_bot/docker/BUILD b/content_access_bot/docker/BUILD new file mode 100644 index 00000000..7f61598f --- /dev/null +++ b/content_access_bot/docker/BUILD @@ -0,0 +1,33 @@ +python_sources() +docker_image( + name="content_access_bot-deps", + image_tags=["deps"], + build_platform=["linux/amd64", "linux/arm64"], + registries=["content_access_bot"], + repository="app", + skip_push=True, + source="Dockerfile.deps", +) + +file(name="app.json", source="app.json") + +docker_image( + name="content_access_bot-srcs", + image_tags=["srcs"], + build_platform=["linux/amd64", "linux/arm64"], + registries=["content_access_bot"], + repository="app", + skip_push=True, + source="Dockerfile.srcs", +) + +docker_image( + name="content_access_bot", + build_platform=["linux/amd64", "linux/arm64"], + dependencies=[":content_access_bot-srcs", ":content_access_bot-deps", ":app.json"], + image_tags=[ + "{build_args.VERSION}", + "latest", + ], + source="Dockerfile", +) diff --git a/content_access_bot/docker/Dockerfile b/content_access_bot/docker/Dockerfile new file mode 100644 index 00000000..f208c837 --- /dev/null +++ b/content_access_bot/docker/Dockerfile @@ -0,0 +1,11 @@ +FROM python:3.11-slim-bookworm AS python-base +FROM content_access_bot/app:deps AS app-deps +FROM content_access_bot/app:srcs AS app-srcs +FROM python-base AS python-app + +WORKDIR /app +COPY content_access_bot/docker/app.json ./ +COPY --from=app-deps /app ./ +COPY --from=app-srcs /app ./ + +CMD ["tail", "-f", "/dev/null"] diff --git a/content_access_bot/docker/Dockerfile.deps b/content_access_bot/docker/Dockerfile.deps new file mode 100644 index 00000000..3f502fc9 --- /dev/null +++ b/content_access_bot/docker/Dockerfile.deps @@ -0,0 +1,4 @@ +FROM python:3.11-slim-bookworm + +COPY content_access_bot.py/content_access_bot-deps@environment=linux.pex /content_access_bot-deps.pex +RUN PEX_TOOLS=1 python /content_access_bot-deps.pex venv --scope=deps --compile /app diff --git a/content_access_bot/docker/Dockerfile.srcs b/content_access_bot/docker/Dockerfile.srcs new file mode 100644 index 00000000..9a280e85 --- /dev/null +++ b/content_access_bot/docker/Dockerfile.srcs @@ -0,0 +1,4 @@ +FROM python:3.11-slim-bookworm + +COPY content_access_bot.py/content_access_bot-srcs@environment=linux.pex /content_access_bot-srcs.pex +RUN PEX_TOOLS=1 python /content_access_bot-srcs.pex venv --scope=srcs --compile /app diff --git a/content_access_bot/docker/app.json 
b/content_access_bot/docker/app.json new file mode 100644 index 00000000..5b55346a --- /dev/null +++ b/content_access_bot/docker/app.json @@ -0,0 +1,9 @@ +{ + "name": "content_access_bot", + "cron": [ + { + "command": "./pex", + "schedule": "@daily" + } + ] +} diff --git a/content_access_bot/py/BUILD b/content_access_bot/py/BUILD new file mode 100644 index 00000000..e6e49b14 --- /dev/null +++ b/content_access_bot/py/BUILD @@ -0,0 +1,46 @@ +python_sources( + name="lib", + dependencies=[ + "3rdparty/py:requirements-all#aiohttp", + "3rdparty/py:requirements-all#backoff", + "3rdparty/py:requirements-all#environs", + "3rdparty/py:requirements-all#pyairtable", + "3rdparty/py:requirements-all#scrapy", + "3rdparty/py:requirements-all#openpyxl", + "3rdparty/py:requirements-all#pandas", + "content_access_bot/py/pipeline.py:lib" + ], +) + +pex_binary( + name="content_access_bot-deps", + environment=parametrize("__local__", "linux"), + dependencies=[ + ":lib", + ], + entry_point="main.py", + include_sources=False, + include_tools=True, + layout="packed", +) + +pex_binary( + name="content_access_bot-srcs", + environment=parametrize("__local__", "linux"), + dependencies=[ + ":lib", + ], + entry_point="main.py", + include_requirements=False, + include_tools=True, + layout="packed", +) + + +pex_binary( + name="content_access_bot", + dependencies=[ + ":lib", + ], + entry_point="main.py", +) diff --git a/content_access_bot/py/VERSION b/content_access_bot/py/VERSION new file mode 100644 index 00000000..8acdd82b --- /dev/null +++ b/content_access_bot/py/VERSION @@ -0,0 +1 @@ +0.0.1 diff --git a/content_access_bot/py/airtable.py b/content_access_bot/py/airtable.py new file mode 100644 index 00000000..013f8867 --- /dev/null +++ b/content_access_bot/py/airtable.py @@ -0,0 +1,85 @@ +from pyairtable import Api +from utils import validate_url, clean_url +import os +import logging +import re +from environs import Env +env = Env() +dotenv_path = os.path.join(os.path.dirname(__file__), '..', '.env') + +env.read_env(dotenv_path) + + +logging.basicConfig(level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s') + +api_key = os.getenv('AIRTABLE_API_KEY') +base_id = os.getenv('AIRTABLE_BASE_ID') +organisations_table = os.getenv('AIRTABLE_ORGANISATION_TABLE') +content_table = os.getenv('AIRTABLE_CONTENT_TABLE') + +if not api_key or not base_id or not organisations_table or not content_table: + raise ValueError('API key, base ID and Organisation table are required') + +at = Api(api_key) + + +def get_table_data(table_name, formula=None, fields=None): + if not base_id: + logging.error(f"AIRTABLE_BASE_ID Not Provided") + return + table = at.table(base_id, table_name) + return table.all(formula=formula, fields=fields) + + +def get_formula(allowed_countries=None): + base_formula = 'AND(NOT({Organisation Name} = ""), NOT({Website} = ""), NOT({HQ Country} = ""))' + if allowed_countries: + countries_formula = ', '.join( + [f'({{HQ Country}} = "{country}")' for country in allowed_countries]) + formula = f'AND({base_formula}, OR({countries_formula}))' + else: + formula = base_formula + return formula + + +def process_records(data): + organizations = [] + for record in data: + website = validate_url(record['fields'].get('Website', None)) + name = record['fields'].get('Organisation Name', None) + country = record['fields'].get('HQ Country', None) + id: str = record['id'] + if website: + org = {} + org['id'] = id + org['name'] = re.sub( + r'[\\/*?:"<>|]', '-', name) if name else None + org['url'] = clean_url(website) + 
org['country'] = country + + organizations.append(org) + return organizations + + +def get_organizations(allowed_countries=None): + logging.info('Fetching organizations from Airtable') + formula = get_formula(allowed_countries) + fields = ['Organisation Name', 'Website', 'HQ Country'] + data = get_table_data(organisations_table, formula, fields) + organizations = process_records(data) + logging.info(f'Fetched {len(organizations)} organizations') + return organizations + + +async def batch_upsert_organizations(data): + logging.info('Upserting organizations in Airtable') + try: + if not base_id or not content_table: + logging.error(f"AIRTABLE_BASE_ID or AIRTABLE_CONTENT_TABLE Not Provided") + return + table = at.table(base_id, content_table) + table.batch_upsert(records=data, key_fields=['id',]) + logging.info('Organizations upserted successfully') + except Exception as e: + logging.error(f'Error upserting organization: {e}') diff --git a/content_access_bot/py/db.py b/content_access_bot/py/db.py new file mode 100644 index 00000000..3ad6e46c --- /dev/null +++ b/content_access_bot/py/db.py @@ -0,0 +1,340 @@ +import os +from environs import Env, env +from sqlite3 import Error, connect +import logging +from dataclasses import dataclass +from typing import Optional +from datetime import datetime + +env = Env() +dotenv_path = os.path.join(os.path.dirname(__file__), ".env") +env.read_env(dotenv_path) + +logging.basicConfig(level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s') + +@dataclass +class MediaHouse: + name: str + country: str + url: str + airtable_id: str + id: Optional[str] = None + +@dataclass +class SiteCheck: + airtable_id: str + site_status: Optional[str] = None + site_reachable: Optional[bool] = None + site_redirect: Optional[bool] = None + final_url: Optional[str] = None + robots_url: Optional[str] = None + robots_timestamp: Optional[str] = None + robots_content: Optional[str] = None + robots_status: Optional[str] = None + check_timestamp: Optional[str] = None + +class Database: + def __init__(self): + self.db_file = os.getenv('DB_FILE') or 'media_data' + self.conn = self.create_connection() + self.create_table() + + def create_connection(self): + try: + conn = connect(self.db_file, timeout=30) + return conn + except Error as e: + logging.error(f"Error creating connection: {e}") + + def is_connected(self): + return self.conn is not None + + def create_table(self): + create_table_sql = """ + CREATE TABLE IF NOT EXISTS media_data ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + name TEXT NOT NULL, + country TEXT NOT NULL, + url TEXT NOT NULL, + airtable_id TEXT NOT NULL UNIQUE + ); + + CREATE TABLE IF NOT EXISTS site_checks ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + airtable_id TEXT NOT NULL, + site_status TEXT, + site_reachable BOOLEAN, + site_redirect BOOLEAN, + final_url TEXT, + robots_url TEXT, + robots_timestamp TEXT, + robots_content TEXT, + robots_status TEXT, + check_timestamp TEXT NOT NULL, + FOREIGN KEY(airtable_id) REFERENCES media_data(airtable_id) + ); + + CREATE TABLE IF NOT EXISTS internet_archive_snapshots( + id INTEGER PRIMARY KEY AUTOINCREMENT, + airtable_id TEXT NOT NULL, + url TEXT NOT NULL, + archive_date TEXT NOT NULL UNIQUE, + archive_robots_url TEXT, + archived_content TEXT, + archived_retrieval_date TEXT, + FOREIGN KEY(airtable_id) REFERENCES media_data(airtable_id) + ); + """ + try: + if self.conn is not None: + cursor = self.conn.cursor() + cursor.executescript(create_table_sql) + self.conn.commit() + logging.info("Database tables created or 
already exist.") + else: + logging.error("Database connection is not established. Table creation skipped.") + except Error as e: + logging.error(f"Error creating table: {e}") + + def insert_media_house(self, media_house: MediaHouse): + try: + sql = """ + INSERT OR IGNORE INTO media_data(name, country, url, airtable_id) + VALUES(?, ?, ?, ?) + """ + if self.conn is not None: + cur = self.conn.cursor() + cur.execute(sql, (media_house.name, media_house.country, + media_house.url, media_house.airtable_id)) + self.conn.commit() + return cur.lastrowid + else: + logging.error("Database connection is not established.") + except Error as e: + logging.error(f"Error inserting media house: {e}") + + def close_connection(self, cur): + if cur is not None: + cur.close() + + def select_media_houses_without_status(self): + """Legacy method - now returns media houses without recent checks""" + return self.select_media_houses_without_recent_check() + + def select_media_houses_without_recent_check(self, max_age_days=7): + """Select media houses that haven't been checked recently or never checked""" + cur = None + try: + if self.conn is not None: + cur = self.conn.cursor() + # Get media houses with no recent site checks + sql = """ + SELECT md.* FROM media_data md + LEFT JOIN ( + SELECT airtable_id, MAX(check_timestamp) as latest_check + FROM site_checks + GROUP BY airtable_id + ) sc ON md.airtable_id = sc.airtable_id + WHERE sc.latest_check IS NULL + OR datetime(sc.latest_check) < datetime('now', '-{} days') + """.format(max_age_days) + cur.execute(sql) + rows = cur.fetchall() + column_names = [column[0] for column in cur.description] + dict_rows = [dict(zip(column_names, row)) for row in rows] + return dict_rows + except Error as e: + logging.error(f"Error: {e}") + finally: + self.close_connection(cur) + + def insert_site_check(self, site_check: SiteCheck): + """Insert a new site check record""" + cur = None + try: + if self.conn is not None: + sql = """ + INSERT INTO site_checks( + airtable_id, site_status, site_reachable, site_redirect, + final_url, robots_url, robots_timestamp, robots_content, + robots_status, check_timestamp + ) + VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
+ """ + cur = self.conn.cursor() + check_time = site_check.check_timestamp or datetime.now().strftime("%Y%m%d%H%M%S") + cur.execute(sql, ( + site_check.airtable_id, site_check.site_status, + site_check.site_reachable, site_check.site_redirect, + site_check.final_url, site_check.robots_url, + site_check.robots_timestamp, site_check.robots_content, + site_check.robots_status, check_time + )) + self.conn.commit() + return cur.lastrowid + except Error as e: + logging.error(f"Error inserting site check: {e}") + finally: + self.close_connection(cur) + + def update_site_status(self, airtable_id, site_status, site_reachable, site_redirect, final_url): + """Legacy method - now creates a new site check record for status update""" + site_check = SiteCheck( + airtable_id=airtable_id, + site_status=site_status, + site_reachable=site_reachable, + site_redirect=site_redirect, + final_url=final_url + ) + return self.insert_site_check(site_check) + + def insert_current_robots(self, airtable_id, robots_url, robots_timestamp, robots_content, robots_status): + """Legacy method - now creates a new site check record for robots update""" + site_check = SiteCheck( + airtable_id=airtable_id, + robots_url=robots_url, + robots_timestamp=robots_timestamp, + robots_content=robots_content, + robots_status=robots_status + ) + return self.insert_site_check(site_check) + + def get_all_media_houses(self): + cur = None + try: + if self.conn is not None: + cur = self.conn.cursor() + cur.execute("SELECT * FROM media_data") + rows = cur.fetchall() + column_names = [column[0] for column in cur.description] + dict_rows = [dict(zip(column_names, row)) for row in rows] + return dict_rows + except Error as e: + logging.error(f"Error: {e}") + finally: + self.close_connection(cur) + + def get_latest_site_checks(self): + """Get the most recent site check for each media house""" + cur = None + try: + if self.conn is not None: + cur = self.conn.cursor() + sql = """ + SELECT sc.* FROM site_checks sc + INNER JOIN ( + SELECT airtable_id, MAX(check_timestamp) as latest_check + FROM site_checks + GROUP BY airtable_id + ) latest ON sc.airtable_id = latest.airtable_id + AND sc.check_timestamp = latest.latest_check + """ + cur.execute(sql) + rows = cur.fetchall() + column_names = [column[0] for column in cur.description] + dict_rows = [dict(zip(column_names, row)) for row in rows] + return dict_rows + except Error as e: + logging.error(f"Error: {e}") + finally: + self.close_connection(cur) + + def insert_internet_archive_snapshot_url(self, airtable_id, url, archive_date): + try: + sql = """ + INSERT INTO internet_archive_snapshots(airtable_id, url, archive_date) + VALUES(?, ?, ?) 
+ """ + if self.conn is not None: + cur = self.conn.cursor() + cur.execute(sql, (airtable_id, url, archive_date)) + self.conn.commit() + return cur.lastrowid + else: + logging.error("Database connection is not established.") + except Error as e: + logging.error(f"Error inserting archive snapshot: {e}") + + def get_all_internet_archive_snapshots(self): + cur = None + try: + if self.conn is not None: + cur = self.conn.cursor() + cur.execute("SELECT * FROM internet_archive_snapshots") + rows = cur.fetchall() + column_names = [column[0] for column in cur.description] + dict_rows = [dict(zip(column_names, row)) for row in rows] + return dict_rows + except Error as e: + logging.error(f"Error: {e}") + finally: + self.close_connection(cur) + + def insert_internet_archive_snapshot_robots(self, id, archive_robots_url, archived_content, archived_retrieval_date): + cur = None + try: + if self.conn is not None: + sql = """ + UPDATE internet_archive_snapshots + SET archive_robots_url = ?, archived_content = ?, archived_retrieval_date = ? + WHERE id = ? + """ + cur = self.conn.cursor() + cur.execute(sql, (archive_robots_url, archived_content, archived_retrieval_date, id)) + self.conn.commit() + except Error as e: + logging.error(f"Error: {e}") + finally: + self.close_connection(cur) + + def get_combided_data(self): + """Legacy method name - calls get_combined_data""" + return self.get_combined_data() + + def get_combined_data(self): + """Get media data with latest site checks and archive snapshots""" + cur = None + try: + if self.conn is not None: + cur = self.conn.cursor() + # Get media data with latest site check + sql = """ + SELECT + md.*, + sc.site_status, sc.site_reachable, sc.site_redirect, sc.final_url, + sc.robots_url, sc.robots_timestamp, sc.robots_content, sc.robots_status, + sc.check_timestamp + FROM media_data md + LEFT JOIN ( + SELECT sc1.* FROM site_checks sc1 + INNER JOIN ( + SELECT airtable_id, MAX(check_timestamp) as latest_check + FROM site_checks + GROUP BY airtable_id + ) sc2 ON sc1.airtable_id = sc2.airtable_id + AND sc1.check_timestamp = sc2.latest_check + ) sc ON md.airtable_id = sc.airtable_id + """ + cur.execute(sql) + media_rows = cur.fetchall() + media_columns = [column[0] for column in cur.description] + combined = [] + + for media_row in media_rows: + media_dict = dict(zip(media_columns, media_row)) + # Get all snapshots for this airtable_id + cur.execute( + "SELECT * FROM internet_archive_snapshots WHERE airtable_id = ?", + (media_dict["airtable_id"],) + ) + snapshot_rows = cur.fetchall() + snapshot_columns = [column[0] for column in cur.description] + snapshots = [dict(zip(snapshot_columns, row)) for row in snapshot_rows] + media_dict["snapshots"] = snapshots + combined.append(media_dict) + return combined + except Error as e: + logging.error(f"Error: {e}") + finally: + self.close_connection(cur) diff --git a/content_access_bot/py/diff.py b/content_access_bot/py/diff.py new file mode 100644 index 00000000..eefaeea7 --- /dev/null +++ b/content_access_bot/py/diff.py @@ -0,0 +1,68 @@ +ai_crawlers = [ + "Amazonbot", + "anthropic-ai", + "AwarioRssBot", + "AwarioSmartBot", + "Bard", + "Bloom", + "Bytespider", + "CCBot", + "ChatGPT", + "ChatGPT-User", + "ClaudeBot", + "Claude-Web", + "cohere-ai" + "DataForSeoBot", + "Diffbot", + "FacebookBot", + "GPT-4", + "GPT-Neo", + "GPTBot", + "Google-Extended", + "GoogleOther", + "HuggingFace-Transformers", + "LaMDA", + "Megatron-Turing-NLG", + "magpie-crawler", + "Meltwater", + "NewsNow", + "news-please", + "omgili", + "OmigiliBot", + 
"PaLM", + "peer39_crawler", + "peer39_crawler/1.0", + "PerplexityBot" + "TurnitinBot", + "Seekr", + "Scrapy", + "Wu-Dao-2.0", +] + + +def diff_robot_content(current_robots_content: str, archived_robots_content: str): + """ + Compares two robots.txt contents. + Returns: + - blocks_crawlers: True if current robots.txt blocks any AI crawlers + - blocked_crawlers: List of AI crawlers blocked in current robots.txt + - ai_blocking_update: True if current robots.txt blocks AI crawlers but archived did not + """ + current_content = current_robots_content or "" + archived_content = archived_robots_content or "" + + blocked_crawlers = [ + crawler for crawler in ai_crawlers if crawler.casefold() in current_content.casefold() + ] + previously_blocked_crawlers = [ + crawler for crawler in ai_crawlers if crawler.casefold() in archived_content.casefold() + ] + + blocks_crawlers = bool(blocked_crawlers) + ai_blocking_update = blocks_crawlers and not previously_blocked_crawlers + + return { + "blocks_crawlers": blocks_crawlers, + "blocked_crawlers": ', '.join(blocked_crawlers), + "ai_blocking_update": ai_blocking_update, + } diff --git a/content_access_bot/py/main.py b/content_access_bot/py/main.py new file mode 100644 index 00000000..f49eba82 --- /dev/null +++ b/content_access_bot/py/main.py @@ -0,0 +1,247 @@ +from datetime import datetime, timedelta +import logging +import time +import logging +import asyncio +from airtable import get_organizations, batch_upsert_organizations +from scrapy.crawler import CrawlerProcess +import pandas as pd +from db import Database, MediaHouse, SiteCheck +from diff import diff_robot_content +from spider import ArchivedRobotsSpider, ArchivedURLsSpider, RobotsSpider +from utils import check_site_availability, get_robots_url,find_closest_snapshot,format_db_date + + +MAX_ROBOTS_AGE = 7 # No of Days to skip fetching of current robots +MAX_INTERNATE_ARCHIVE_AGE =365 + +logging.basicConfig(level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s') + +async def fetch_orgs(db:Database): + organizations = get_organizations() + for media_house in organizations: + media_house_obj = MediaHouse( + media_house['name'], media_house['country'], media_house['url'], media_house['id'] + ) + db.insert_media_house(media_house_obj) + + +async def check_org_sites(db:Database): + unchecked_orgs = db.select_media_houses_without_recent_check() + if not unchecked_orgs: + logging.info(f"No sites to check") + return + count = len(unchecked_orgs) if unchecked_orgs is not None else 0 + logging.info(f"Checking {count} sites") + + async def update_org_site(org): + site_status = await check_site_availability(org['url']) + db.update_site_status( + org['airtable_id'], site_status['status_code'], + site_status['reachable'], site_status['redirect'], site_status['final_url'] + ) + #TODO:Use Spider to check sites + await asyncio.gather(*(update_org_site(org) for org in unchecked_orgs)) + logging.info("Finished checking Sites") + + +async def fetch_robots(db: Database): + all_media_houses = db.get_all_media_houses() + if not all_media_houses: + logging.info(f"No sites to check") + return + + # Get latest site checks to determine which need robot fetching + latest_checks = db.get_latest_site_checks() + check_dict = {check['airtable_id']: check for check in latest_checks} + + filtered_media_houses = [] + today = datetime.now() + + for media_house in all_media_houses: + airtable_id = media_house['airtable_id'] + latest_check = check_dict.get(airtable_id) + + if not latest_check or not 
+            filtered_media_houses.append(media_house)
+            continue
+
+        robots_timestamp = latest_check.get('robots_timestamp')
+        if robots_timestamp:
+            try:
+                robots_date = datetime.strptime(robots_timestamp, "%Y%m%d%H%M%S")
+                if (today - robots_date).days > MAX_ROBOTS_AGE:
+                    filtered_media_houses.append(media_house)
+            except Exception as e:
+                logging.warning(f"Invalid robots_timestamp for {airtable_id}: {robots_timestamp}")
+
+    count = len(filtered_media_houses)
+    if count == 0:
+        logging.info("No robots to fetch within the specified timeframe.")
+        return
+
+    logging.info(f"Fetching robots.txt for {count} sites")
+    urls = [(media_house['airtable_id'], get_robots_url(media_house['url']))
+            for media_house in filtered_media_houses]
+    process = CrawlerProcess(settings={
+        'ITEM_PIPELINES': {
+            'pipeline.RobotsDatabasePipeline': 1
+        },
+    }, install_root_handler=False)
+    process.crawl(RobotsSpider, urls)
+    process.start()
+
+async def fetch_internet_archive_snapshots(db: Database):
+    logging.info("Fetching Internet Archive snapshot URLs")
+    all_media_houses = db.get_all_media_houses()
+    if not all_media_houses:
+        logging.info("No sites to fetch Internet Archive snapshots for")
+        return
+
+    all_archive_snapshots = db.get_all_internet_archive_snapshots()
+    # Get set of airtable_ids that already have snapshots
+    fetched_airtable_ids = set(s['airtable_id'] for s in all_archive_snapshots) if all_archive_snapshots else set()
+    # Filter only media houses not yet fetched
+    to_fetch = [media_house for media_house in all_media_houses if media_house['airtable_id'] not in fetched_airtable_ids]
+
+    count = len(to_fetch)
+    if count == 0:
+        logging.info("No new sites to fetch Internet Archive snapshots for.")
+        return
+
+    logging.info(f"Fetching Internet Archive snapshots for {count} sites")
+    target_date = (datetime.now() - timedelta(days=MAX_INTERNET_ARCHIVE_AGE)).strftime("%Y%m%d%H%M%S")
+    urls = [(media_house['airtable_id'], media_house['url']) for media_house in to_fetch]
+    process = CrawlerProcess(settings={
+        'ITEM_PIPELINES': {
+            'pipeline.ArchivedURLsDatabasePipeline': 2
+        },
+    }, install_root_handler=False)
+    process.crawl(ArchivedURLsSpider, urls=urls, target_date=target_date)
+    process.start()
+
+async def fetch_archived_robots(db: Database):
+    logging.info("Fetching archived robots.txt")
+    all_archived_snapshot_url = db.get_all_internet_archive_snapshots()
+    if not all_archived_snapshot_url:
+        logging.info("No archived snapshots to fetch robots.txt from")
+        return
+
+    today = datetime.now()
+    filtered_snapshots = []
+    for snapshot in all_archived_snapshot_url:
+        archived_content = snapshot.get('archived_content')
+        archived_retrieval_date = snapshot.get('archived_retrieval_date')
+        # If archived_content is None or empty, always include
+        if not archived_content:
+            filtered_snapshots.append(snapshot)
+            continue
+        # If archived_retrieval_date exists, check if it's older than MAX_ROBOTS_AGE days
+        if archived_retrieval_date:
+            try:
+                retrieval_date = datetime.strptime(archived_retrieval_date, "%Y%m%d%H%M%S")
+                if (today - retrieval_date).days > MAX_ROBOTS_AGE:
+                    filtered_snapshots.append(snapshot)
+            except Exception as e:
+                logging.warning(f"Invalid archived_retrieval_date for {snapshot.get('id')}: {archived_retrieval_date}")
+
+    count = len(filtered_snapshots)
+    if count == 0:
+        logging.info("No archived robots to fetch within the specified timeframe.")
+        return
+
+    logging.info(f"Fetching archived robots.txt for {count} snapshots")
+    urls = [(snapshot['id'], f"{snapshot['url']}/robots.txt")
+            for snapshot in filtered_snapshots]
+    process = CrawlerProcess(settings={
+        'ITEM_PIPELINES': {
+            'pipeline.ArchivedRobotsDatabasePipeline': 3
+        },
+    }, install_root_handler=False)
+    process.crawl(ArchivedRobotsSpider, urls)
+    process.start()
+
+
+async def generate_report(db: Database):
+    combined_data = db.get_combined_data()
+    if not combined_data:
+        logging.info("No data to generate report from")
+        return
+    target_date = (datetime.now() - timedelta(days=MAX_INTERNET_ARCHIVE_AGE)).strftime("%Y%m%d%H%M%S")
+    report_rows = []
+
+    for media in combined_data:
+        snapshots = media.get("snapshots", [])
+        closest_snapshot = find_closest_snapshot(snapshots, target_date, date_key="archive_date")
+        archived_content = ""
+        row = {
+            "Name": media.get("name"),
+            "Country": media.get("country"),
+            "URL": media.get("url"),
+            "Airtable ID": media.get("airtable_id"),
+            "Site Status": media.get("site_status"),
+            "Site Reachable": bool(media.get("site_reachable")),
+            "Site Redirect": bool(media.get("site_redirect")),
+            "Final URL": media.get("final_url"),
+            "Robots URL": media.get("robots_url"),
+            "Date Robots Fetched": format_db_date(media.get("robots_timestamp")),
+            "Robot Content": (
+                "''" if media.get("robots_content") == "" else media.get("robots_content")
+            ),
+            "Robot Status": media.get("robots_status"),
+        }
+        if closest_snapshot:
+            row.update({
+                "Archive URL": closest_snapshot.get("url"),
+                "Archive Date": format_db_date(closest_snapshot.get("archive_date")),
+                "Archive Robots URL": closest_snapshot.get("archive_robots_url"),
+                "Archive Robot Content": (
+                    "''" if closest_snapshot.get("archived_content") == "" else closest_snapshot.get("archived_content")
+                ),
+                "Archive Retrieval Date": format_db_date(closest_snapshot.get("archived_retrieval_date")),
+            })
+            archived_content = closest_snapshot.get("archived_content")
+        else:
+            row.update({
+                "Archive URL": None,
+                "Archive Date": None,
+                "Archive Robots URL": None,
+                "Archive Robot Content": None,
+                "Archive Retrieval Date": None,
+            })
+
+        diff_data = diff_robot_content(media.get("robots_content"), archived_content)
+
+        row.update({
+            "Blocks AI Crawlers": diff_data['blocks_crawlers'],
+            "Blocked AI Crawler": diff_data['blocked_crawlers'],
+            "Update Robots to block AI": diff_data['ai_blocking_update']
+        })
+        report_rows.append(row)
+
+    df = pd.DataFrame(report_rows)
+    filename = f"Report-{target_date}.xlsx"
+    df.to_excel(filename, index=False)
+
+async def main(db: Database):
+    await fetch_orgs(db)
+    await check_org_sites(db)  # Often not required unless site status is needed
+    await fetch_robots(db)
+    await fetch_internet_archive_snapshots(db)
+    await fetch_archived_robots(db)
+    await generate_report(db)
+
+
+if __name__ == "__main__":
+    try:
+        start_time = time.time()
+        db = Database()
+        if not db.is_connected():
+            logging.error("Failed to connect to the database")
+            exit(1)
+        asyncio.run(main(db))
+    except Exception as e:
+        logging.error(f"An error occurred: {e}")
+
diff --git a/content_access_bot/py/pipeline.py b/content_access_bot/py/pipeline.py
new file mode 100644
index 00000000..45d9ad5d
--- /dev/null
+++ b/content_access_bot/py/pipeline.py
@@ -0,0 +1,47 @@
+
+from db import Database
+
+
+class RobotsDatabasePipeline:
+    def __init__(self):
+        self.db = Database()
+
+    def process_item(self, item, spider):
+        self.db.insert_current_robots(
+            item["airtable_id"],
+            item["robots_url"],
+            item["robots_timestamp"],
+            item["robots_content"],
+            item["robots_status"]
+        )
+        return item
+
+class ArchivedURLsDatabasePipeline:
+    def __init__(self):
+        self.db
= Database() + + def process_item(self, item, spider): + # Save the archived URL to the DB + self.db.insert_internet_archive_snapshot_url( + item["airtable_id"], + item["url"], + item["archive_date"] + ) + return item + +class ArchivedRobotsDatabasePipeline: + def __init__(self): + self.db = Database() + + def process_item(self, item, spider): + # Save the archived robots to the DB + print("ArchivedRobotsDatabasePipeline:", item) + self.db.insert_internet_archive_snapshot_robots( + item["id"], + item["archive_robots_url"], + item["archived_content"], + item["archived_retrieval_date"] + ) + return item + + \ No newline at end of file diff --git a/content_access_bot/py/spider.py b/content_access_bot/py/spider.py new file mode 100644 index 00000000..4087a6ba --- /dev/null +++ b/content_access_bot/py/spider.py @@ -0,0 +1,103 @@ +import datetime +import scrapy + +from utils import find_closest_snapshot + + + +class RobotsSpider(scrapy.Spider): + name = 'robots' + start_urls = [] + + def __init__(self, urls=None, *args, **kwargs): + super(RobotsSpider, self).__init__(*args, **kwargs) + if urls: + self.start_urls = urls + + def start_requests(self): + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3" + } + + for airtable_id, url in self.start_urls: + yield scrapy.Request(url=url, callback=self.parse, meta={'airtable_id': airtable_id}, headers=headers) + + def parse(self, response): + yield { + "airtable_id":response.meta['airtable_id'], + "robots_url": response.url, + "robots_timestamp": datetime.datetime.now().strftime("%Y%m%d%H%M%S"), + "robots_content":response.text, + "robots_status":response.status + } +class ArchivedURLsSpider(scrapy.Spider): + name = 'archived_urls' + start_urls = [] + + def __init__(self, urls=None, target_date=None, *args, **kwargs): + super().__init__(*args, **kwargs) + if urls: + self.start_urls = urls + # target_date should be a string like "20230618000000" + self.target_date = target_date or (datetime.datetime.now() - datetime.timedelta(days=365)).strftime("%Y%m%d%H%M%S") + + def start_requests(self): + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3" + } + for airtable_id, url in self.start_urls: + cdx_url = f"https://web.archive.org/cdx/search/cdx?url={url}" + yield scrapy.Request( + url=cdx_url, + callback=self.parse_cdx, + meta={'airtable_id': airtable_id, 'url':url}, + headers=headers + ) + + def parse_cdx(self, response): + url = response.meta['url'] + airtable_id = response.meta['airtable_id'] + lines = response.text.strip().split("\n") + snapshots = [] + for line in lines: + fields = line.split(" ") + if len(fields) == 7: + timestamp = fields[1] + status= fields[4] + snapshots.append({ + "url": f"https://web.archive.org/web/{timestamp}if_/{url}", + "timestamp": timestamp, + }) + closest = find_closest_snapshot(snapshots, self.target_date) + print("Closest Snapshot:", closest) + if closest: + yield { + "airtable_id": airtable_id, + "url": closest['url'], + "archive_date": closest["timestamp"] + } + +class ArchivedRobotsSpider(scrapy.Spider): + name = 'archived_robots' + start_urls = [] + + def __init__(self, urls=None, *args, **kwargs): + super(ArchivedRobotsSpider, self).__init__(*args, **kwargs) + if urls: + self.start_urls = urls + + def start_requests(self): + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) 
Chrome/58.0.3029.110 Safari/537.3" + } + + for id, url in self.start_urls: + yield scrapy.Request(url=url, callback=self.parse, meta={'id': id}, headers=headers) + + def parse(self, response): + yield { + "id": response.meta['id'], + "archive_robots_url":response.url, + "archived_content":response.text, + "archived_retrieval_date":datetime.datetime.now().strftime("%Y%m%d%H%M%S") + } diff --git a/content_access_bot/py/utils.py b/content_access_bot/py/utils.py new file mode 100644 index 00000000..d81e7b44 --- /dev/null +++ b/content_access_bot/py/utils.py @@ -0,0 +1,116 @@ +import re +from urllib.parse import urlparse, urlunparse +import aiohttp +from datetime import datetime, timedelta + + +def validate_url(url): + regex = re.compile( + r'^(?:http|ftp)s?://' # http:// or https:// + # domain... + r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' + r'localhost|' # localhost... + r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip + r'(?::\d+)?' # optional port + r'(?:/?|[/?]\S+)$', re.IGNORECASE) + + parsed_url = urlparse(url) + if parsed_url.scheme == '': + url = 'http://' + url + parsed_url = urlparse(url) + + url_unparsed = urlunparse(parsed_url) + if isinstance(url_unparsed, bytes): + url_str = url_unparsed.decode('utf-8') + else: + url_str = url_unparsed + + if re.match(regex, url_str) is not None: + return url_str + return None + + +def clean_url(url): + parsed_url = urlparse(url) + cleaned_url = urlunparse( + (parsed_url.scheme, parsed_url.netloc, "", "", "", "")) + return cleaned_url.rstrip('/') + + +def url_redirects(original, final): + parsed_original = urlparse(original) + parsed_final = urlparse(final) + + original_netloc_path = parsed_original.netloc.replace( + 'www.', '') + parsed_original.path.rstrip('/') + final_netloc_path = parsed_final.netloc.replace( + 'www.', '') + parsed_final.path.rstrip('/') + + return original_netloc_path != final_netloc_path + + +async def check_site_availability(url: str): + async with aiohttp.ClientSession() as session: + try: + async with session.get(url, allow_redirects=True) as response: + return { + "status_code": response.status, + "reachable": True, + "redirect": url_redirects(url, str(response.url)), + "final_url": str(response.url) + } + except Exception: + return { + "status_code": None, + "reachable": False, + "redirect": False, + "final_url": None + } + + +def get_robots_url(url: str): + parsed_url = urlparse(url) + robots_url = urlunparse( + (parsed_url.scheme, parsed_url.netloc, "/robots.txt", "", "", "")) + return robots_url.rstrip('/') + +def is_within_time_frame(date_str, days, date_format="%Y-%m-%d"): + """ + Returns True if date_str is within 'days' from today. + date_str: string date (e.g. '2024-06-19') + days: int, number of days from today + date_format: format of date_str (default: '%Y-%m-%d') + """ + target_date = datetime.strptime(date_str, date_format) + today = datetime.today() + delta = today - target_date + return 0 <= delta.days <= days + + +def find_closest_snapshot(snapshots, date, date_key="timestamp"): + """ + Finds the snapshot closest to the given date. + If there are snapshots before or on the date, returns the latest one before or on the date. + If all snapshots are after the date, returns the oldest snapshot. 
+ """ + if not snapshots: + return None + + snapshots_sorted = sorted(snapshots, key=lambda x: x[date_key]) + before_or_on = [s for s in snapshots_sorted if s[date_key] <= date] + if before_or_on: + return before_or_on[-1] + else: + return snapshots_sorted[0] + +def format_db_date(date_str): + """ + Converts a date string like '20240619120000' to 'YYYY-MM-DD HH:MM:SS'. + Returns None if input is None or invalid. + """ + if not date_str: + return None + try: + return datetime.strptime(date_str, "%Y%m%d%H%M%S").strftime("%Y-%m-%d %H:%M:%S") + except Exception: + return date_str diff --git a/pants.toml b/pants.toml index 60faf5b6..74552680 100644 --- a/pants.toml +++ b/pants.toml @@ -40,6 +40,8 @@ root_patterns = [ "/pants-plugins", "/pesacheck_meedan_bridge/py", "/pesacheck_meedan_bridge/docker", + "/content_access_bot/py", + "/content_access_bot/docker", ] [python] @@ -57,7 +59,7 @@ args = ["--quiet"] [mypy] requirements = ["django-stubs"] -interpreter_constraints = ["==3.11.*"] +interpreter_constraints = ["==3.10.*","==3.11.*", "==3.12.*", "==3.13.*"] [pyupgrade] args = ["--py36-plus"]