The Download Pipeline
Downloading a 10-K filing from EDGAR requires four steps:
- Look up the CIK — The company's unique SEC identifier (CIK lookup guide)
- Get the submissions list — Find the 10-K filing and its accession number
- Construct the filing URL — Navigate the EDGAR directory structure
- Download the document — Fetch the HTML/text content
Understanding EDGAR's Directory Structure
Every filing on EDGAR follows this URL pattern:
https://www.sec.gov/Archives/edgar/data/{CIK}/{ACCESSION_NO_DASHES}/{FILENAME}
For example, Apple's 2024 10-K filing:
https://www.sec.gov/Archives/edgar/data/320193/000032019324000123/aapl-20240928.htm
The accession number is the key. It uniquely identifies each filing and is formatted as XXXXXXXXXX-YY-ZZZZZZ (10-digit filer CIK, 2-digit year, 6-digit sequence). In URLs, the dashes are removed.
Python Implementation
Step 1: Look Up CIK and Get Filing List
import requests
import time
import os
from html.parser import HTMLParser
# Identify the client to SEC servers: a descriptive app name plus a contact
# email in the User-Agent (SEC asks automated clients to identify themselves).
HEADERS = {'User-Agent': 'FilingDownloader [email protected]'}
def get_submissions(cik):
    """Retrieve a company's filing history from the SEC submissions API.

    Args:
        cik: Central Index Key as int or string; zero-padding to the
            10 digits the endpoint requires is handled here.

    Returns:
        dict: Parsed JSON payload of the company's submissions.

    Raises:
        requests.HTTPError: If the endpoint returns an error status.
    """
    # The submissions JSON filename embeds the CIK padded to 10 digits.
    padded = str(cik).zfill(10)
    endpoint = f'https://data.sec.gov/submissions/CIK{padded}.json'
    reply = requests.get(endpoint, headers=HEADERS)
    reply.raise_for_status()
    return reply.json()
Step 2: Find the 10-K Filing
def find_10k_filings(submissions, count=5):
    """Find the most recent 10-K filings from submissions data.

    Args:
        submissions: Parsed JSON dict from the SEC submissions endpoint.
        count: Maximum number of filings to return (default 5).

    Returns:
        List of dicts with accession number, date, primary document name,
        and document description, in the order they appear in the feed.
    """
    recent = submissions['filings']['recent']
    # The 'recent' payload holds parallel arrays: position i in each array
    # describes the same filing.
    matches = []
    for idx, form_type in enumerate(recent['form']):
        if form_type != '10-K':
            continue
        matches.append({
            'accession': recent['accessionNumber'][idx],
            'date': recent['filingDate'][idx],
            'primary_doc': recent['primaryDocument'][idx],
            'description': recent['primaryDocDescription'][idx],
        })
        if len(matches) >= count:
            break
    return matches
# Example: Find Apple's recent 10-K filings
# (CIK 320193 is Apple Inc.; requires network access to data.sec.gov.)
submissions = get_submissions(320193)
filings_10k = find_10k_filings(submissions)
# One line per filing: filing date, accession number, primary document name.
for f in filings_10k:
    print(f" {f['date']}: {f['accession']} - {f['primary_doc']}")
Step 3: Construct the Download URL
def build_filing_url(cik, accession, filename):
    """Construct the full URL to a filing document on EDGAR.

    Args:
        cik: Company CIK (int or string; Archives URLs accept it unpadded).
        accession: Accession number with dashes, e.g. '0000320193-24-000123'.
        filename: Primary document filename, e.g. 'aapl-20240928.htm'.

    Returns:
        Absolute URL string for the document.
    """
    # Archives directory names use the accession number with dashes removed.
    accession_no_dashes = accession.replace('-', '')
    # Fix: the original ended the URL with a literal '(unknown)' placeholder
    # and never used the filename parameter.
    return (f'https://www.sec.gov/Archives/edgar/data/'
            f'{cik}/{accession_no_dashes}/{filename}')
# Build URL for Apple's latest 10-K
# filings_10k comes from find_10k_filings(), which returns the most recent
# 10-Ks, so index 0 is the latest one found.
filing = filings_10k[0]
url = build_filing_url(320193, filing['accession'], filing['primary_doc'])
print(f'Filing URL: {url}')
Step 4: Download and Save the Filing
def download_filing(cik, accession, filename, save_dir='filings'):
    """Download a filing document and save it to disk.

    Args:
        cik: Company CIK.
        accession: Accession number with dashes.
        filename: Primary document filename on EDGAR.
        save_dir: Directory to save into (created if missing).

    Returns:
        Local path of the saved file.

    Raises:
        requests.HTTPError: If the download request fails.
    """
    url = build_filing_url(cik, accession, filename)
    time.sleep(0.12)  # Rate limit: 10 req/sec
    response = requests.get(url, headers=HEADERS)
    response.raise_for_status()
    # Create save directory if needed
    os.makedirs(save_dir, exist_ok=True)
    # Fix: the original wrote '{accession}_(unknown)' — a leftover template
    # placeholder — instead of including the actual document filename.
    # Prefixing with the accession keeps documents from different filings
    # from colliding in one directory.
    save_path = os.path.join(save_dir, f'{accession}_{filename}')
    with open(save_path, 'w', encoding='utf-8') as f:
        f.write(response.text)
    print(f' Saved: {save_path} ({len(response.text):,} chars)')
    return save_path
# Download Apple's latest 10-K
# Saves under ./filings/ (download_filing's default save_dir).
filing = filings_10k[0]
path = download_filing(320193, filing['accession'], filing['primary_doc'])
Complete Script: Download Multiple 10-K Filings
def download_company_10ks(ticker, cik, count=3):
    """Download the most recent 10-K filings for a company.

    Args:
        ticker: Ticker symbol — used only for messages and the save directory.
        cik: Company CIK.
        count: How many recent 10-Ks to fetch (default 3).

    Returns:
        List of local file paths for the filings that downloaded successfully;
        empty list when no 10-Ks were found.
    """
    print(f'\nFetching {ticker} (CIK {cik})...')
    filings = find_10k_filings(get_submissions(cik), count=count)
    if not filings:
        print(f' No 10-K filings found for {ticker}')
        return []
    saved = []
    for item in filings:
        # A failure on one filing should not abort the remaining downloads.
        try:
            saved.append(download_filing(
                cik, item['accession'], item['primary_doc'],
                save_dir=f'filings/{ticker}'
            ))
        except Exception as e:
            print(f' Error downloading {item["accession"]}: {e}')
    return saved
# Download 10-Ks for several companies
# Ticker -> CIK map; CIKs are given unpadded, which Archives URLs accept
# (see the Apple example URL above).
companies = {
    'AAPL': '320193',
    'MSFT': '789019',
    'TSLA': '1318605',
}
for ticker, cik in companies.items():
    download_company_10ks(ticker, cik, count=3)
JavaScript Implementation
// Same User-Agent convention as the Python version: app name + contact email.
const HEADERS = { 'User-Agent': 'FilingDownloader [email protected]' };
/**
 * Retrieve a company's filing history from the SEC submissions API.
 * @param {string|number} cik - Central Index Key; padded to 10 digits here.
 * @returns {Promise<object>} Parsed submissions JSON.
 * @throws {Error} When the HTTP response is not OK.
 */
async function getSubmissions(cik) {
  const cikPadded = String(cik).padStart(10, '0');
  const endpoint = `https://data.sec.gov/submissions/CIK${cikPadded}.json`;
  const response = await fetch(endpoint, { headers: HEADERS });
  if (!response.ok) throw new Error(`HTTP ${response.status}`);
  return response.json();
}
/**
 * Download the most recent 10-K filing for a company.
 * @param {string|number} cik - Company CIK.
 * @returns {Promise<{url: string, date: string, accession: string, html: string}>}
 * @throws {Error} When no 10-K exists or an HTTP request fails.
 */
async function downloadLatest10K(cik) {
  const subs = await getSubmissions(cik);
  const recent = subs.filings.recent;
  // Find the first 10-K (the parallel arrays describe one filing per index)
  const idx = recent.form.findIndex(f => f === '10-K');
  if (idx === -1) throw new Error('No 10-K found');
  const accession = recent.accessionNumber[idx];
  const filename = recent.primaryDocument[idx];
  // Archives directory names use the accession number with dashes removed.
  const accNoDashes = accession.replace(/-/g, '');
  // Fix: the URL previously ended in the literal text "$(unknown)" — a
  // broken template placeholder — instead of interpolating the filename.
  const filingUrl = `https://www.sec.gov/Archives/edgar/data/${cik}/${accNoDashes}/${filename}`;
  // Rate limit: stay under the SEC's 10 requests/second policy
  await new Promise(r => setTimeout(r, 120));
  const res = await fetch(filingUrl, { headers: HEADERS });
  if (!res.ok) throw new Error(`HTTP ${res.status}`);
  return {
    url: filingUrl,
    date: recent.filingDate[idx],
    accession,
    html: await res.text()
  };
}
// Usage
// NOTE: top-level await requires an ES module context (or wrap in async main).
const filing = await downloadLatest10K('320193');
console.log(`Downloaded Apple 10-K from ${filing.date}`);
console.log(` URL: ${filing.url}`);
console.log(` Size: ${filing.html.length.toLocaleString()} chars`);
Extracting Text from HTML Filings
Most modern 10-K filings are HTML documents. To extract plain text for analysis:
from bs4 import BeautifulSoup
def extract_text(html_content):
    """Extract clean text from an HTML SEC filing.

    Args:
        html_content: Raw HTML string of the filing.

    Returns:
        Plain text with script/style content removed, one non-empty,
        stripped line per output line.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    # Script and style bodies would otherwise leak into the extracted text.
    for element in soup(['script', 'style']):
        element.decompose()
    raw = soup.get_text(separator='\n')
    # Strip each line and drop the blank ones.
    stripped = (line.strip() for line in raw.splitlines())
    return '\n'.join(line for line in stripped if line)
# Read a downloaded filing and extract text.
# Fix: open with encoding='utf-8' explicitly — download_filing() wrote the
# file as utf-8, and relying on the platform default (e.g. cp1252 on
# Windows) could raise UnicodeDecodeError or mangle characters.
# NOTE(review): the path is an example; point it at a file you downloaded.
with open('filings/AAPL/0000320193-24-000123_aapl-20240928.htm',
          encoding='utf-8') as f:
    html = f.read()
text = extract_text(html)
print(f'Extracted {len(text):,} characters of text')
print(text[:500])  # Preview first 500 chars
Handling the EDGAR Filing Index
Each filing has an index page listing all associated documents (HTML, XBRL, exhibits). Access it by replacing the filename with index.json:
def get_filing_index(cik, accession):
    """Get the filing index listing all documents.

    Args:
        cik: Company CIK.
        accession: Accession number with dashes.

    Returns:
        Parsed index.json dict; documents are under ['directory']['item'].

    Raises:
        requests.HTTPError: If the request fails.
    """
    acc_no_dashes = accession.replace('-', '')
    url = f'https://www.sec.gov/Archives/edgar/data/{cik}/{acc_no_dashes}/index.json'
    time.sleep(0.12)  # Rate limit: 10 req/sec
    response = requests.get(url, headers=HEADERS)
    # Fix: fail loudly on HTTP errors (consistent with get_submissions);
    # otherwise an error page would surface as a confusing JSON decode error.
    response.raise_for_status()
    return response.json()
# List all documents in a filing
# index.json's ['directory']['item'] array holds one entry per document.
index = get_filing_index(320193, filings_10k[0]['accession'])
for item in index['directory']['item']:
    # 'type' is not always present, hence the .get() with a fallback.
    print(f" {item['name']} ({item.get('type', 'unknown')})")
Rate Limit Handling
The SEC blocks IPs that exceed 10 requests per second. For bulk downloads, implement proper throttling:
import time
class RateLimiter:
    """Simple rate limiter for SEC API calls.

    wait() sleeps just long enough that consecutive calls are spaced at
    least 1/max_per_second seconds apart.
    """

    def __init__(self, max_per_second=8):
        # Minimum number of seconds that must separate two calls.
        self.min_interval = 1.0 / max_per_second
        # Timestamp of the previous wait(); 0 means no call yet, so the
        # first wait() never sleeps.
        self.last_call = 0

    def wait(self):
        """Block until at least min_interval has elapsed since the last call."""
        remaining = self.min_interval - (time.time() - self.last_call)
        if remaining > 0:
            time.sleep(remaining)
        self.last_call = time.time()
limiter = RateLimiter(max_per_second=8) # Stay under the 10/sec limit
for ticker, cik in companies.items():
    # Blocks just long enough to keep calls at or below 8/sec.
    limiter.wait()
    submissions = get_submissions(cik)
    # ... process filings
For more details on rate limits and caching strategies, see our SEC EDGAR API Rate Limits & Best Practices guide.
FAQ
How do I download a 10-K filing from SEC EDGAR?
Look up the company's CIK, fetch their submissions from data.sec.gov, find the 10-K accession number, then download the document from the EDGAR Archives directory.
What is an accession number?
An accession number uniquely identifies each SEC filing. Format: XXXXXXXXXX-YY-ZZZZZZ (filer CIK, year, sequence). You need it to construct the download URL.
Can I download SEC filings in bulk?
Yes, but respect the 10 req/sec rate limit. For very large downloads, the SEC provides bulk data files at sec.gov/files/.
What format are SEC 10-K filings in?
Most modern 10-K filings are HTML. Older filings may be plain text. Some include XBRL in XML format. The filing index lists all available formats.
How do I extract text from an HTML SEC filing?
Use Python's BeautifulSoup library to parse HTML and extract text. Install with pip install beautifulsoup4, then use soup.get_text() to strip HTML tags.
Related Guides
- Free SEC EDGAR API Guide — Complete overview of all EDGAR API endpoints
- SEC CIK Number Lookup Guide — Find any company's CIK instantly
- SEC EDGAR Full-Text Search API — Search the text of every filing
- Build a Stock Screener with the SEC EDGAR API — Python tutorial
- SEC EDGAR API Rate Limits & Best Practices — Avoid getting blocked