"""
AIFinder Feature Extraction

TF-IDF pipeline + stylometric features.
Supports CoT-aware and no-CoT text preprocessing.
"""
import re
import time

import numpy as np
from scipy.sparse import hstack, csr_matrix
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler

from config import TFIDF_WORD_PARAMS, TFIDF_CHAR_PARAMS
# --- Text Preprocessing ---
def strip_cot(text):
    """Drop every <think>...</think> block, then trim surrounding whitespace."""
    without_cot = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
    return without_cot.strip()
def has_cot(text):
    """Return True when the text contains at least one complete <think>...</think> block."""
    return re.search(r"<think>.*?</think>", text, flags=re.DOTALL) is not None
def cot_ratio(text):
    """Fraction of the text's characters that sit inside <think>...</think> blocks."""
    if not text:
        return 0.0
    spans = re.findall(r"<think>(.*?)</think>", text, flags=re.DOTALL)
    if not spans:
        return 0.0
    return sum(map(len, spans)) / len(text)
# --- Stylometric Features ---
class StylometricFeatures(BaseEstimator, TransformerMixin):
    """Extract a fixed 25-dimensional stylometric feature vector per text.

    Features cover basic length stats, punctuation densities (per char),
    markdown-element densities (per sentence), vocabulary richness,
    paragraph structure, AI-typical phrasing flags, and chain-of-thought
    (<think>...</think>) presence/ratio. Stateless: fit() learns nothing.
    """

    # Regexes hoisted to class level so they compile once, not on every call.
    _SENT_SPLIT = re.compile(r'[.!?]+')
    _HEADER = re.compile(r'^#{1,6}\s', re.MULTILINE)
    _BOLD = re.compile(r'\*\*.*?\*\*')
    _ITALIC = re.compile(r'(?<!\*)\*(?!\*).*?(?<!\*)\*(?!\*)')
    _CODE_FENCE = re.compile(r'```')
    _INLINE_CODE = re.compile(r'`[^`]+`')
    _BULLET = re.compile(r'^[\s]*[-*+]\s', re.MULTILINE)
    _NUMBERED = re.compile(r'^\s*\d+[.)]\s', re.MULTILINE)
    _OPENER = re.compile(r'^(Certainly|Of course|Sure|Absolutely|Great question)', re.IGNORECASE)
    _DISCLAIMER = re.compile(r"(I'm an AI|as an AI|language model|I cannot|I can't help)", re.IGNORECASE)
    # Capturing group yields the inner CoT text; DOTALL lets it span newlines.
    _THINK = re.compile(r"<think>(.*?)</think>", re.DOTALL)

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn. Returns self (sklearn convention)."""
        return self

    def transform(self, X):
        """Return a sparse (len(X), 25) float32 matrix of per-text features."""
        return csr_matrix(
            np.array([self._extract(text) for text in X], dtype=np.float32)
        )

    def _extract(self, text):
        """Compute the 25 raw feature values for a single text."""
        sentences = [s.strip() for s in self._SENT_SPLIT.split(text) if s.strip()]
        words = text.split()
        # Clamp denominators to 1 so empty text yields zeros, not div-by-zero.
        n_chars = max(len(text), 1)
        n_words = max(len(words), 1)
        n_sentences = max(len(sentences), 1)

        # Basic stats
        avg_word_len = np.mean([len(w) for w in words]) if words else 0
        avg_sent_len = n_words / n_sentences

        # Punctuation densities (normalized per character)
        n_commas = text.count(",") / n_chars
        n_semicolons = text.count(";") / n_chars
        n_colons = text.count(":") / n_chars
        n_exclaim = text.count("!") / n_chars
        n_question = text.count("?") / n_chars
        n_ellipsis = text.count("...") / n_chars
        n_dash = (text.count("—") + text.count("--")) / n_chars

        # Markdown elements (normalized per sentence)
        n_headers = len(self._HEADER.findall(text)) / n_sentences
        n_bold = len(self._BOLD.findall(text)) / n_sentences
        n_italic = len(self._ITALIC.findall(text)) / n_sentences
        n_code_blocks = len(self._CODE_FENCE.findall(text)) / n_sentences
        n_inline_code = len(self._INLINE_CODE.findall(text)) / n_sentences
        n_bullet = len(self._BULLET.findall(text)) / n_sentences
        n_numbered = len(self._NUMBERED.findall(text)) / n_sentences

        # Vocabulary richness
        ttr = len(set(w.lower() for w in words)) / n_words  # type-token ratio

        # Paragraph structure
        n_paragraphs = len([p for p in text.split("\n\n") if p.strip()])
        avg_para_len = n_words / max(n_paragraphs, 1)

        # AI-typical phrasing flags
        starts_with_certainly = 1.0 if self._OPENER.match(text) else 0.0
        has_disclaimer = 1.0 if self._DISCLAIMER.search(text) else 0.0

        # CoT features, computed inline from the shared compiled pattern so
        # this class is self-contained (zero in no-CoT mode, as before).
        think_spans = self._THINK.findall(text)
        has_think = 1.0 if think_spans else 0.0
        # A non-empty match implies len(text) > 0, so division is safe.
        think_ratio = sum(len(m) for m in think_spans) / len(text) if think_spans else 0.0

        return [
            avg_word_len, avg_sent_len,
            n_commas, n_semicolons, n_colons, n_exclaim, n_question,
            n_ellipsis, n_dash,
            n_headers, n_bold, n_italic, n_code_blocks, n_inline_code,
            n_bullet, n_numbered,
            ttr, n_paragraphs, avg_para_len,
            starts_with_certainly, has_disclaimer,
            has_think, think_ratio,
            n_chars, n_words,
        ]
# --- Feature Pipeline ---
class FeaturePipeline:
    """Combined TF-IDF + stylometric feature pipeline.

    Word- and char-level TF-IDF vectorizers are fed text with
    <think>...</think> blocks stripped (so n-grams model style rather than
    chain-of-thought content), while the stylometric extractor sees the raw
    text so its CoT features stay informative. The horizontally stacked
    sparse matrix is scaled with MaxAbsScaler, which preserves sparsity.
    """

    def __init__(self):
        # Vectorizer hyperparameters come from the project-level config module.
        self.word_tfidf = TfidfVectorizer(**TFIDF_WORD_PARAMS)
        self.char_tfidf = TfidfVectorizer(**TFIDF_CHAR_PARAMS)
        self.stylo = StylometricFeatures()
        self.scaler = MaxAbsScaler()

    def fit_transform(self, texts):
        """Fit every sub-extractor on `texts` and return the scaled sparse matrix."""
        print(f" Input: {len(texts)} texts")
        # Strip <think> blocks for TF-IDF so n-grams learn style, not CoT
        texts_no_cot = [strip_cot(t) for t in texts]

        t0 = time.time()
        word_features = self.word_tfidf.fit_transform(texts_no_cot)
        print(f" word tfidf: {word_features.shape[1]} features ({time.time()-t0:.1f}s)")

        t0 = time.time()
        char_features = self.char_tfidf.fit_transform(texts_no_cot)
        print(f" char tfidf: {char_features.shape[1]} features ({time.time()-t0:.1f}s)")

        # Stylometric uses original text (has_think, think_ratio still work)
        t0 = time.time()
        stylo_features = self.stylo.fit_transform(texts)
        print(f" stylometric: {stylo_features.shape[1]} features ({time.time()-t0:.1f}s)")

        combined = hstack([word_features, char_features, stylo_features])
        combined = self.scaler.fit_transform(combined)
        print(f" Combined feature matrix: {combined.shape}")
        return combined

    def transform(self, texts):
        """Transform `texts` with the already-fitted vectorizers and scaler.

        Must only be called after fit_transform(); column layout matches it.
        """
        texts_no_cot = [strip_cot(t) for t in texts]
        word_features = self.word_tfidf.transform(texts_no_cot)
        char_features = self.char_tfidf.transform(texts_no_cot)
        stylo_features = self.stylo.transform(texts)
        combined = hstack([word_features, char_features, stylo_features])
        return self.scaler.transform(combined)