# Managed by AEO Robots.txt by J. Ottenheijm.com

##################################################
# AUTO ALLOWED OK / GOOD / NORMAL                #
##################################################

# Google Ads quality crawler.
User-agent: Adsbot-Google
Allow: /

# Ahrefs SEO crawler keeps link index current.
User-agent: AhrefsBot
Allow: /

# Ahrefs Site Audit tool crawler.
User-agent: AhrefsSiteAudit
Allow: /

# Crawler used by Archive‑It collections and national libraries.
User-agent: archive.org_bot
Allow: /

# Cloud‑based heartbeat/incident monitor from Better Stack; identifies as Better Uptime Bot.
User-agent: Better Uptime Bot
Allow: /

# DuckDuckGo states the data is not used to train AI models.
User-agent: DuckAssistBot
Allow: /

# Internet Archive Wayback Machine spider that snapshots pages for public archiving.
User-agent: ia_archiver
Allow: /

# Synthetic‑monitor probes from SolarWinds Pingdom that measure uptime and latency from many POPs.
User-agent: Pingdom.com_bot
Allow: /

# Hits your endpoint every 30–60s to check that it’s alive for UptimeRobot dashboards.
User-agent: UptimeRobot/2.0
Allow: /

##################################################
# ALLOWED BY USER                                #
##################################################

# Model-training: NO | Crawls pages so the Andi AI search/answer engine can surface and cite your content.
User-agent: Andibot
Allow: /

# Model-training: Likely YES | Amazon confirms the data is reused for question-answering AI; most observers treat it as a model-training + retrieval crawler.
User-agent: Amazonbot
Allow: /

# Model-training: Probably YES | Alias seen in logs; likewise tagged as an undocumented AI agent believed to gather training data.
User-agent: anthropic-ai
Allow: /

# Model-training: NO | Apple’s standard crawler for Siri, Spotlight and Apple Search.
User-agent: Applebot
Allow: /

# Model-training: YES (if allowed) | Opt-in flag that lets Apple reuse Applebot-crawled pages to train its Apple Intelligence models.
User-agent: Applebot-Extended
Allow: /

# Model-training: YES | Baidu search crawler; index also underpins Baidu’s ERNIE generative models.
User-agent: Baiduspider/2.0
Allow: /

# Model-training: NO | Customer-configured web crawler that ingests specified URLs into an Amazon Bedrock knowledge base for retrieval/RAG apps.
User-agent: bedrockbot
Allow: /

# Model-training: NO | High‑volume SEO scraper reported by site owners; no public link to AI work.
User-agent: Brightbot 1.0
Allow: /

# Model-training: NO | v1.* string - One-off fetcher when a ChatGPT user pastes or clicks a URL.
User-agent: ChatGPT-User
Allow: /

# Model-training: NO | v2.* string - One-off fetcher when a ChatGPT user pastes or clicks a URL.
User-agent: ChatGPT-User/2.0
Allow: /

# Model-training: YES | Collects web data to enhance the utility and safety of Anthropic’s generative AI models.
User-agent: ClaudeBot
Allow: /

# Model-training: NO | Indexes public web content to improve relevance/quality of Claude search answers.
User-agent: Claude-SearchBot
Allow: /

# Model-training: NO | Fetches pages in real time when an individual Claude user requests info.
User-agent: Claude-User
Allow: /

# Model-training: Probably YES | Collects web data to enhance the utility and safety of Anthropic’s generative AI models.
User-agent: Claude-Web
Allow: /

# Model-training: Potentially YES | Diffbot data is marketed for AI model training. Crawls sites and turns pages into structured JSON that clients (some being AI labs) can license.
User-agent: Diffbot
Allow: /

# Model-training: NO | Grabs article metadata/thumbnails to optimize social distribution and analytics.
User-agent: EchoboxBot
Allow: /

# Model-training: Potentially NO | Discord’s preview fetcher for embedded links and attachments.
User-agent: Discordbot/2.0
Allow: /

# Model-training: NO | Meta crawler that grabs Open Graph tags so Facebook/Instagram/Messenger can show link previews.
User-agent: facebookexternalhit/1.1
Allow: /

# Model-training: YES | Bulk crawler for GPT-4/5 and future model training and safety tuning.
User-agent: GPTBot
Allow: /

# Model-training: YES (if you allow it) | Policy flag, not a crawler: tells Google whether its Bard/Gemini & Vertex AI models may use already-crawled content for training.
User-agent: Google-Extended
Allow: /

# Model-training: YES | Google mobile user‑agent used for rendering.
User-agent: Googlebot-Mobile
Allow: /

# Model-training: YES | Google miscellaneous crawler used for research & AI model development outside core Search.
User-agent: GoogleOther
Allow: /

# Model-training: NO | Fetches pages at the request of site owners to build Vertex AI Agents / grounding corpora; not used for Google Search.
User-agent: Google-CloudVertexBot
Allow: /

# Model-training: YES | GoogleOther variant focused on downloading image bytes for R&D/AI.
User-agent: GoogleOther-Image
Allow: /

# Model-training: YES | GoogleOther variant focused on fetching video bytes for R&D/AI.
User-agent: GoogleOther-Video
Allow: /

# Model-training: NO | Legacy Chinese search‑engine spider from iAsk.
User-agent: iaskspider/2.0
Allow: /

# Model-training: NO | Generic crawler with no AI link documented.
User-agent: ICC-Crawler
Allow: /

# Model-training: YES | Hive’s bot that scrapes publicly available images for its visual‑intelligence products.
User-agent: ImagesiftBot
Allow: /

# Model-training: YES | Open‑source script that bulk‑downloads image URLs into ML‑ready datasets.
User-agent: img2dataset
Allow: /

# Model-training: NO | Fetches images on‑the‑fly to resize/optimise them; not an AI crawler.
User-agent: imgproxy
Allow: /

# Model-training: NO | LinkedIn scraper for titles/images/descriptions used in shared links.
User-agent: LinkedInBot/1.0
Allow: /

# Model-training: NO | Event-triggered, not continuous, and Meta lists it only for preview caching, not AI.
User-agent: FacebookBot
Allow: /

# Model-training: YES | Explicitly a model-training bot. Large-scale crawler for use cases such as training AI models or improving products by indexing content directly. Runs even when no user has shared the link.
User-agent: meta-externalagent
Allow: /

# Model-training: NO | Meta’s regular link‑preview fetcher for Facebook/Instagram sharing.
User-agent: Meta-ExternalFetcher
Allow: /

# Model-training: Not directly claimed | Content is fed into the Bing index and may be quoted via retrieval-augmented generation, but the text is not bulk-shoveled into foundation-model checkpoints. Copilot uses Bing’s index.
User-agent: BingBot
Allow: /

# Model-training: NO | Amazon research agent that autonomously clicks through sites to finish user tasks.
User-agent: NovaAct
Allow: /

# Model-training: NO | Indexes pages so SearchGPT / ChatGPT Browse can cite & link your content in real time.
User-agent: OAI-SearchBot
Allow: /

# Model-training: NO | Interactive, not a web-wide scraper. Loads pages in a headless browser only when a user asks the Operator agent to perform a task (e.g. book tickets). It behaves like a human session, not a bulk spider.
User-agent: Operator
Allow: /

# Model-training: ASSUMED NO | Historical/enterprise semantic search & knowledge extraction across public sources.
User-agent: Panscient
Allow: /

# Model-training: ASSUMED NO | Historical/enterprise semantic search & knowledge extraction across public sources.
User-agent: panscient.com
Allow: /

# Model-training: NO | Builds the Perplexity.ai search index; returns citations with links.
User-agent: PerplexityBot
Allow: /

# Model-training: NO | Not used to crawl content for AI foundation models. User-triggered fetcher that grabs a page so Perplexity can quote & link it in an answer.
User-agent: Perplexity-User/1.0
Allow: /

# Model-training: NO | Standard search crawler indexing sites for Petal Search, Huawei Assistant & AI Search recommendations.
User-agent: PetalBot
Allow: /

# Model-training: NO | Crawls pages so Phind can answer technical & general queries with cited sources.
User-agent: PhindBot
Allow: /

# Model-training: NO | Crawls pages & images to create/update Pins; Pinterest states it is not used to train its image‑generation model.
User-agent: Pinterestbot/1.0
Allow: /

# Model-training: NO | Primarily processes user-submitted text (paraphrasing, grammar, summaries); no evidence of a broad web crawler; appears in some community blocklists out of caution.
User-agent: QuillBot
Allow: /

# Model-training: NO | Primarily processes user-submitted text (paraphrasing, grammar, summaries); no evidence of a broad web crawler; appears in some community blocklists out of caution.
User-agent: quillbot.com
Allow: /

# Model-training: NO | Crawls to map backlinks for the Semrush backlink index.
User-agent: SemrushBot-BA
Allow: /

# Model-training: NO | Fetches pages for Semrush SEO Content Template & related on-page tools.
User-agent: SemrushBot-CT
Allow: /

# Model-training: NO | Semrush SEO crawler that checks on‑page content.
User-agent: SemrushBot-OCOB
Allow: /

# Model-training: NO | Technical SEO crawler for Semrush site audit products.
User-agent: SemrushBot-SI
Allow: /

# Model-training: NO | Semrush Site‑Audit variant for web analytics, not AI.
User-agent: SemrushBot-SWA
Allow: /

# Model-training: NO | Slack unfurler that retrieves just enough bytes to extract meta/oEmbed tags.
User-agent: Slackbot-LinkExpanding 1.0
Allow: /

# Model-training: NO | X (Twitter) bot that fetches Twitter Card markup once per URL to render rich previews.
User-agent: Twitterbot/1.0
Allow: /

# Model-training: Probably NO | Framed as an AI search crawler; no mention of using text in model training.
User-agent: YouBot
Allow: /

# Model-training: NO | Yandex search crawler; claims not to do model training.
User-agent: YandexBot
Allow: /

# Model-training: NO | Additional/supplemental crawling to support Yandex search services beyond the main YandexBot.
User-agent: YandexAdditional
Allow: /

# Model-training: NO | Additional/supplemental crawling to support Yandex search services beyond the main YandexBot.
User-agent: YandexAdditionalBot
Allow: /

##################################################
# BLOCKED BECAUSE “BAD” MODEL-TRAINING           #
##################################################

# Model-training: YES | Allen Institute for AI (AI2) crawler that gathers public web content for openly published data sets that power LLM training.
User-agent: AI2Bot
Disallow: /

# Model-training: YES | AI2 crawler that gathers web text for the Dolma corpus powering the open‑source OLMo language model.
User-agent: Ai2Bot-Dolma
Disallow: /

# Model-training: YES | Explicitly an AI data scraper for model training. Scrapes web pages to train open-source models focused on Australian language & culture.
User-agent: KangarooBot
Disallow: /

# Model-training: YES | By design: distributed crawler whose index can be licensed for LLM training.
User-agent: TimpiBot
Disallow: /

##################################################
# BLOCKED BECAUSE “BAD”                          #
##################################################

User-agent: 008
User-agent: discobot
User-agent: dotbot
User-agent: Exabot
User-agent: Fasterfox
User-agent: magpie-crawler
User-agent: MJ12bot
User-agent: Node/simplecrawler
User-agent: omgili
User-agent: omgilibot
User-agent: proximic
User-agent: Scrapy
User-agent: yacybot
User-agent: YoudaoBot
Disallow: /

##################################################
# DISALLOWED BY USER                             #
##################################################

# Model-training: NO | Intelligence‑gatherer operated by aiHit.com that profiles company web‑pages for B2B/SEO analytics.
User-agent: aiHitBot
Disallow: /

# Model-training: YES | Mass-scale scraper for Doubao / TikTok LLM training.
User-agent: Bytespider
Disallow: /

# Model-training: YES | Common Crawl | Large‑scale open crawl powering many LLM datasets.
User-agent: CCBot
Disallow: /

# Model-training: ASSUMED YES | Undocumented bot thought to fetch pages for Cohere chat / RAG demos and model work.
User-agent: cohere-ai
Disallow: /

# Model-training: YES | Cohere’s crawler that collects text to train its enterprise LLMs.
User-agent: cohere-training-data-crawler
Disallow: /

# Model-training: YES | Japanese academic crawler harvesting language resources for research corpora.
User-agent: Cotoyogi
Disallow: /

# Model-training: YES | Cloud platform that lets developers crawl the web for agents and LLM pipelines.
User-agent: Crawlspace
Disallow: /

# Model-training: NO | FactSet crawler that gathers financial‑site data for analytics feeds.
User-agent: Factset_spyderbot
Disallow: /

# Model-training: YES | Open‑source Mendable AI crawler that builds knowledge‑bases for generative agents.
User-agent: FirecrawlAgent
Disallow: /

# Model-training: YES | AWS‑hosted scraper marketed as “friendly” but used to collect pages for ML experiments.
User-agent: FriendlyCrawler
Disallow: /

# Model-training: NO | Security scanner that audits sites for cyber‑risk indicators.
User-agent: ISSCyberRiskCrawler
Disallow: /

# Model-training: NO | Docs say it is not used for generative AI training. On-demand fetcher for user questions in Le Chat; includes source links in answers.
User-agent: MistralAI-User
Disallow: /

# Model-training: ASSUMED YES | Appears in community AI blocklists; presumed automated scraping for AI uses.
User-agent: MyCentralAIScraperBot
Disallow: /

# Model-training: YES | Huawei crawler that downloads multimodal data for its PanGu LLM.
User-agent: PanguBot
Disallow: /

# Model-training: ASSUMED YES | Observed automated collection of web content for research/AI purposes.
User-agent: Poseidon Research Crawler
Disallow: /

# Model-training: NO | Visits your site to power chat, visitor intelligence, and go-to-market revenue experiences tied to Salesforce data.
User-agent: QualifiedBot
Disallow: /

# Model-training: ASSUMED YES | Community-reported crawler gathering data for AI/insight tooling.
User-agent: SBIntuitionsBot
Disallow: /

# Model-training: NO | Bot that indexes B2B invoices/credit‑risk pages for Sidetrade fintech tools.
User-agent: Sidetrade indexer bot
Disallow: /

# Model-training: YES | ByteDance spider that mirrors public pages and media for TikTok; often used for AI training and content moderation.
User-agent: TikTokSpider
Disallow: /

# Model-training: YES | Hunter.io crawler that builds business datasets and ML models from public web pages.
User-agent: VelenPublicWebCrawler
Disallow: /

# Model-training: YES | Webz.io crawler dedicated to collecting data sets that the company resells for AI training.
User-agent: Webzio-Extended
Disallow: /

# Model-training: ASSUMED YES | Broad web scraping / indexing often from cloud hosts; not tied to a major search engine.
User-agent: wpbot
Disallow: /

##################################################
# WORDPRESS SPECIFIC RULES                       #
##################################################

User-agent: *
# Standard WordPress hygiene
Disallow: /cgi-bin/
Disallow: /wp-admin/
Allow: /wp-admin/admin-ajax.php
Crawl-delay: 3
# Duplicate archive pages
Disallow: /tag/*
Disallow: /author/*
Disallow: */feed/*
Disallow: */page/*
Disallow: */trackback/
Disallow: */embed/
# REST API & XML-RPC
Disallow: /wp-json/
Disallow: /xmlrpc.php
# Attachment pages
Disallow: /*/attachment/*
# Internal search
Disallow: /search*
Disallow: /?s=
# Assets
Allow: /wp-content/uploads/

# START YOAST BLOCK
# ---------------------------
User-agent: *
Disallow:

Sitemap: https://www.rono.nl/sitemap_index.xml
# ---------------------------
# END YOAST BLOCK