""" Semantic File System - Auto-organizing file management No more downloads folder chaos. Files organize themselves based on: - Content (what's in them) - Purpose (why they exist) - Context (what they're related to) - Usage patterns (how they're accessed) This is what file management should have been from the start. """ from dataclasses import dataclass, field from datetime import datetime from enum import Enum from pathlib import Path from typing import Dict, List, Optional, Set, Any import hashlib import mimetypes import json import re class DocumentType(Enum): """Semantic document types - not just file extensions""" RESUME = "resume" COVER_LETTER = "cover_letter" BUSINESS_PLAN = "business_plan" TECHNICAL_SPEC = "technical_spec" MEETING_NOTES = "meeting_notes" FINANCIAL_DOC = "financial_doc" CONTRACT = "contract" RESEARCH_PAPER = "research_paper" CODE = "code" DATA = "data" IMAGE = "image" VIDEO = "video" AUDIO = "audio" ARCHIVE = "archive" CONFIG = "config" DOCUMENTATION = "documentation" PRESENTATION = "presentation" SPREADSHEET = "spreadsheet" EMAIL = "email" CHAT_LOG = "chat_log" UNKNOWN = "unknown" class DocumentPurpose(Enum): """Why does this document exist?""" REFERENCE = "reference" # For looking things up ACTIVE_WORK = "active_work" # Currently working on ARCHIVE = "archive" # Historical record TEMPLATE = "template" # To be copied/used as starting point COLLABORATION = "collaboration" # Shared with others PERSONAL = "personal" # Just for me DELIVERABLE = "deliverable" # To be sent to someone INPUT = "input" # Source material for something else OUTPUT = "output" # Result of a process @dataclass class SemanticMetadata: """Rich metadata about a file""" # Basic info file_path: str file_hash: str file_size: int mime_type: str created_at: datetime modified_at: datetime last_accessed: datetime # Semantic classification document_type: DocumentType = DocumentType.UNKNOWN purpose: DocumentPurpose = DocumentPurpose.REFERENCE confidence: float = 0.0 # Confidence in classification # Content analysis title: Optional[str] = None summary: Optional[str] = None keywords: Set[str] = field(default_factory=set) entities: Dict[str, List[str]] = field(default_factory=dict) # people, orgs, dates, etc. # Relationships related_files: Set[str] = field(default_factory=set) parent_project: Optional[str] = None tags: Set[str] = field(default_factory=set) # Usage patterns access_count: int = 0 edit_count: int = 0 share_count: int = 0 # Intent graph link intent_node_ids: Set[str] = field(default_factory=set) # Custom metadata metadata: Dict[str, Any] = field(default_factory=dict) def to_dict(self) -> dict: return { 'file_path': self.file_path, 'file_hash': self.file_hash, 'file_size': self.file_size, 'mime_type': self.mime_type, 'created_at': self.created_at.isoformat(), 'modified_at': self.modified_at.isoformat(), 'last_accessed': self.last_accessed.isoformat(), 'document_type': self.document_type.value, 'purpose': self.purpose.value, 'confidence': self.confidence, 'title': self.title, 'summary': self.summary, 'keywords': list(self.keywords), 'entities': self.entities, 'related_files': list(self.related_files), 'parent_project': self.parent_project, 'tags': list(self.tags), 'access_count': self.access_count, 'edit_count': self.edit_count, 'share_count': self.share_count, 'intent_node_ids': list(self.intent_node_ids), 'metadata': self.metadata } class SemanticFileSystem: """ A file system that understands what files ARE, not just where they're stored. Key features: - Auto-classification based on content - Semantic search (find by purpose, not just name) - Auto-organization (files suggest where they belong) - Relationship tracking (what's related to what) - Intent-aware (files know why they exist) """ def __init__(self, index_path: str = ".semantic_fs_index.json"): self.index_path = index_path self.files: Dict[str, SemanticMetadata] = {} self.load_index() def load_index(self): """Load the semantic index from disk""" try: if Path(self.index_path).exists(): with open(self.index_path, 'r') as f: # TODO: Implement full deserialization pass except Exception as e: print(f"Error loading index: {e}") def save_index(self): """Save the semantic index to disk""" data = { 'files': { path: metadata.to_dict() for path, metadata in self.files.items() } } with open(self.index_path, 'w') as f: json.dump(data, f, indent=2) def analyze_file(self, file_path: str) -> SemanticMetadata: """ Analyze a file and extract semantic metadata. This is where the magic happens - understanding what a file IS. """ path = Path(file_path) if not path.exists(): raise FileNotFoundError(f"File not found: {file_path}") # Basic file info stat = path.stat() mime_type, _ = mimetypes.guess_type(file_path) # Compute hash with open(file_path, 'rb') as f: file_hash = hashlib.sha256(f.read()).hexdigest() metadata = SemanticMetadata( file_path=str(path.absolute()), file_hash=file_hash, file_size=stat.st_size, mime_type=mime_type or "application/octet-stream", created_at=datetime.fromtimestamp(stat.st_ctime), modified_at=datetime.fromtimestamp(stat.st_mtime), last_accessed=datetime.fromtimestamp(stat.st_atime) ) # Classify the document doc_type, confidence = self._classify_document(file_path, mime_type) metadata.document_type = doc_type metadata.confidence = confidence # Extract content if it's text-based if self._is_text_file(mime_type): content = self._extract_text(file_path) metadata.keywords = self._extract_keywords(content) metadata.entities = self._extract_entities(content) metadata.title = self._extract_title(content, path.name) metadata.summary = self._generate_summary(content) # Infer purpose based on location and type metadata.purpose = self._infer_purpose(file_path, doc_type) return metadata def _classify_document(self, file_path: str, mime_type: Optional[str]) -> tuple[DocumentType, float]: """ Classify document based on content and structure. Returns (DocumentType, confidence_score) """ path = Path(file_path) extension = path.suffix.lower() # Extension-based classification (basic) ext_map = { '.py': DocumentType.CODE, '.js': DocumentType.CODE, '.ts': DocumentType.CODE, '.java': DocumentType.CODE, '.cpp': DocumentType.CODE, '.c': DocumentType.CODE, '.rs': DocumentType.CODE, '.go': DocumentType.CODE, '.pdf': DocumentType.UNKNOWN, # Need content analysis '.docx': DocumentType.UNKNOWN, # Need content analysis '.doc': DocumentType.UNKNOWN, '.txt': DocumentType.UNKNOWN, '.md': DocumentType.DOCUMENTATION, '.csv': DocumentType.DATA, '.json': DocumentType.DATA, '.xml': DocumentType.DATA, '.yaml': DocumentType.CONFIG, '.yml': DocumentType.CONFIG, '.png': DocumentType.IMAGE, '.jpg': DocumentType.IMAGE, '.jpeg': DocumentType.IMAGE, '.gif': DocumentType.IMAGE, '.mp4': DocumentType.VIDEO, '.mp3': DocumentType.AUDIO, '.zip': DocumentType.ARCHIVE, '.tar': DocumentType.ARCHIVE, '.gz': DocumentType.ARCHIVE, '.pptx': DocumentType.PRESENTATION, '.xlsx': DocumentType.SPREADSHEET, } if extension in ext_map: doc_type = ext_map[extension] if doc_type != DocumentType.UNKNOWN: return doc_type, 0.8 # Content-based classification for unknown types if self._is_text_file(mime_type): content = self._extract_text(file_path) return self._classify_by_content(content, path.name) return DocumentType.UNKNOWN, 0.0 def _classify_by_content(self, content: str, filename: str) -> tuple[DocumentType, float]: """Classify document by analyzing its content""" content_lower = content.lower() filename_lower = filename.lower() # Resume detection resume_keywords = ['resume', 'curriculum vitae', 'cv', 'experience', 'education', 'skills'] resume_score = sum(1 for kw in resume_keywords if kw in content_lower or kw in filename_lower) if resume_score >= 3: return DocumentType.RESUME, min(0.9, 0.3 * resume_score) # Cover letter if ('dear' in content_lower and 'sincerely' in content_lower) or 'cover letter' in filename_lower: return DocumentType.COVER_LETTER, 0.7 # Business plan business_keywords = ['executive summary', 'market analysis', 'financial projections', 'business model'] if sum(1 for kw in business_keywords if kw in content_lower) >= 2: return DocumentType.BUSINESS_PLAN, 0.8 # Technical spec tech_keywords = ['architecture', 'requirements', 'specification', 'api', 'implementation'] if sum(1 for kw in tech_keywords if kw in content_lower) >= 2: return DocumentType.TECHNICAL_SPEC, 0.7 # Meeting notes meeting_keywords = ['meeting', 'attendees', 'action items', 'agenda'] if sum(1 for kw in meeting_keywords if kw in content_lower) >= 2: return DocumentType.MEETING_NOTES, 0.7 return DocumentType.UNKNOWN, 0.0 def _infer_purpose(self, file_path: str, doc_type: DocumentType) -> DocumentPurpose: """Infer why this file exists based on location and type""" path = Path(file_path) path_lower = str(path).lower() # Location-based inference if 'download' in path_lower: return DocumentPurpose.INPUT if 'archive' in path_lower or 'backup' in path_lower: return DocumentPurpose.ARCHIVE if 'template' in path_lower: return DocumentPurpose.TEMPLATE if 'draft' in path_lower or 'wip' in path_lower: return DocumentPurpose.ACTIVE_WORK if 'output' in path_lower or 'export' in path_lower: return DocumentPurpose.OUTPUT # Type-based inference if doc_type == DocumentType.RESUME: return DocumentPurpose.DELIVERABLE if doc_type == DocumentType.TEMPLATE: return DocumentPurpose.TEMPLATE if doc_type == DocumentType.MEETING_NOTES: return DocumentPurpose.REFERENCE return DocumentPurpose.REFERENCE def _is_text_file(self, mime_type: Optional[str]) -> bool: """Check if file is text-based""" if not mime_type: return False return mime_type.startswith('text/') or mime_type in [ 'application/json', 'application/xml', 'application/javascript' ] def _extract_text(self, file_path: str) -> str: """Extract text content from file""" try: with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: return f.read() except Exception: return "" def _extract_keywords(self, content: str, max_keywords: int = 20) -> Set[str]: """Extract important keywords from content""" # Simple keyword extraction - in production, use TF-IDF or similar words = re.findall(r'\b[a-z]{4,}\b', content.lower()) # Remove common words stop_words = {'that', 'this', 'with', 'from', 'have', 'been', 'will', 'your', 'their'} words = [w for w in words if w not in stop_words] # Count frequency word_freq = {} for word in words: word_freq[word] = word_freq.get(word, 0) + 1 # Get top keywords top_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:max_keywords] return set(word for word, _ in top_words) def _extract_entities(self, content: str) -> Dict[str, List[str]]: """Extract named entities (people, places, orgs, dates, etc.)""" # Simplified entity extraction - in production, use NER entities = { 'emails': [], 'urls': [], 'dates': [], 'phone_numbers': [] } # Extract emails entities['emails'] = re.findall(r'\b[\w.-]+@[\w.-]+\.\w+\b', content) # Extract URLs entities['urls'] = re.findall(r'https?://[^\s]+', content) # Extract dates (simple patterns) entities['dates'] = re.findall(r'\b\d{1,2}/\d{1,2}/\d{2,4}\b', content) # Extract phone numbers (simple pattern) entities['phone_numbers'] = re.findall(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', content) return entities def _extract_title(self, content: str, filename: str) -> str: """Extract or infer document title""" lines = content.split('\n') # Look for common title patterns for line in lines[:10]: # Check first 10 lines line = line.strip() if not line: continue # Markdown heading if line.startswith('# '): return line[2:].strip() # If it's a short line at the start, might be a title if len(line) < 100 and len(line) > 5: return line # Fall back to filename return Path(filename).stem.replace('_', ' ').replace('-', ' ').title() def _generate_summary(self, content: str, max_length: int = 200) -> str: """Generate a brief summary of the content""" # Simple summary - first few sentences sentences = re.split(r'[.!?]+', content) summary = "" for sentence in sentences: sentence = sentence.strip() if not sentence: continue if len(summary) + len(sentence) > max_length: break summary += sentence + ". " return summary.strip() def index_file(self, file_path: str) -> SemanticMetadata: """Index a file in the semantic file system""" metadata = self.analyze_file(file_path) self.files[metadata.file_path] = metadata self.save_index() return metadata def search(self, query: str, filters: Optional[Dict] = None) -> List[SemanticMetadata]: """ Semantic search - find files by content, purpose, type, etc. Not just filename matching! """ results = [] query_lower = query.lower() for metadata in self.files.values(): score = 0.0 # Match against title if metadata.title and query_lower in metadata.title.lower(): score += 2.0 # Match against keywords if any(query_lower in kw for kw in metadata.keywords): score += 1.5 # Match against summary if metadata.summary and query_lower in metadata.summary.lower(): score += 1.0 # Match against filename if query_lower in Path(metadata.file_path).name.lower(): score += 0.5 # Apply filters if filters: if 'document_type' in filters and metadata.document_type != filters['document_type']: continue if 'purpose' in filters and metadata.purpose != filters['purpose']: continue if 'tags' in filters and not set(filters['tags']).intersection(metadata.tags): continue if score > 0: results.append((metadata, score)) # Sort by score results.sort(key=lambda x: x[1], reverse=True) return [metadata for metadata, _ in results] def suggest_location(self, file_path: str) -> str: """ Suggest where a file should be organized. This solves the "downloads folder chaos" problem. """ metadata = self.analyze_file(file_path) # Base directory structure base_map = { DocumentType.RESUME: "documents/career/resumes", DocumentType.COVER_LETTER: "documents/career/cover_letters", DocumentType.BUSINESS_PLAN: "documents/business", DocumentType.TECHNICAL_SPEC: "documents/technical", DocumentType.MEETING_NOTES: "documents/meetings", DocumentType.FINANCIAL_DOC: "documents/financial", DocumentType.CONTRACT: "documents/legal", DocumentType.CODE: "code", DocumentType.DATA: "data", DocumentType.IMAGE: "media/images", DocumentType.VIDEO: "media/videos", DocumentType.AUDIO: "media/audio", DocumentType.DOCUMENTATION: "docs", DocumentType.PRESENTATION: "documents/presentations", DocumentType.SPREADSHEET: "documents/spreadsheets", } base_dir = base_map.get(metadata.document_type, "misc") # Add purpose subdirectory if metadata.purpose == DocumentPurpose.ARCHIVE: base_dir += "/archive" elif metadata.purpose == DocumentPurpose.TEMPLATE: base_dir += "/templates" elif metadata.purpose == DocumentPurpose.ACTIVE_WORK: base_dir += "/active" # Add project subdirectory if applicable if metadata.parent_project: base_dir += f"/{metadata.parent_project}" filename = Path(file_path).name return f"{base_dir}/{filename}" def auto_organize(self, file_path: str, dry_run: bool = True) -> str: """ Automatically organize a file based on its semantic classification. dry_run=True: Just return where it should go dry_run=False: Actually move the file """ suggested_path = self.suggest_location(file_path) if not dry_run: # Create directory if needed Path(suggested_path).parent.mkdir(parents=True, exist_ok=True) # Move the file Path(file_path).rename(suggested_path) # Update index if file_path in self.files: metadata = self.files.pop(file_path) metadata.file_path = suggested_path self.files[suggested_path] = metadata self.save_index() return suggested_path # Example usage if __name__ == "__main__": sfs = SemanticFileSystem() # Example: Analyze a resume # metadata = sfs.index_file("~/Downloads/john_doe_resume.pdf") # print(f"Document type: {metadata.document_type}") # print(f"Suggested location: {sfs.suggest_location('~/Downloads/john_doe_resume.pdf')}") # Example: Search for all resumes # resumes = sfs.search("", filters={'document_type': DocumentType.RESUME}) # for resume in resumes: # print(f"Found resume: {resume.title} at {resume.file_path}") print("Semantic File System initialized")