Spaces:

ai-assist-sh
/

PhishingMail-Lab

Sleeping

App Files Files Community

ai-assist-sh committed on Aug 21

Commit

a380f06

verified ·

1 Parent(s): 9fe8482

Upload 3 files

Browse files

Files changed (3) hide show

README.md +33 -12
main.py +254 -335
requirements.txt +7 -8

README.md CHANGED Viewed

@@ -1,12 +1,33 @@
----
-title: PhishingMail — Forensics
-emoji: 🛡️
-colorFrom: red
-colorTo: yellow
-sdk: gradio
-sdk_version: "4.44.1"
-app_file: main.py
-pinned: false
----
-Phishing link analysis with on-screen forensics (tokens, logits, [CLS]) and JSON export.

+---
+title: PhishingMail-Lab
+emoji: 🧪
+colorFrom: gray
+colorTo: blue
+sdk: gradio
+app_file: main.py
+python_version: "3.10"
+pinned: false
+---
+# PhishingMail‑Lab (POC)
+A lightweight **POC** Space that extends your original project with **email+URL fusion** while staying Hugging Face free‑tier friendly.
+## What’s inside
+- Gradio UI
+- URL extraction + heuristic risk (demo)
+- Email classifier with **fallback loader** (MiniLM backbone if your HF checkpoint is missing)
+- Fusion & overrides (weights and τ are configurable)
+## Configure
+Set these in **Settings → Variables & secrets**:
+- `EMAIL_CLASSIFIER_ID` → your fine‑tuned MiniLM classifier on HF (e.g. `your-username/mini-phish`)
+- `EMAIL_BACKBONE_ID`  → defaults to `microsoft/MiniLM-L6-H384-uncased`
+- `THRESHOLD_TAU`      → default `0.40`
+## Run locally
+```bash
+pip install -r requirements.txt
+python main.py
+```
+Replace the heuristic URL scoring with your existing URL model + fusion logic when ready.

main.py CHANGED Viewed

@@ -1,335 +1,254 @@
-import os, re, time, json, urllib.parse
-import gradio as gr
-import torch
-import torch.nn.functional as F
-# Optional robust domain parsing; code falls back if missing.
-try:
-    import tldextract
-except Exception:
-    tldextract = None
-os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
-URL_MODEL_ID = "CrabInHoney/urlbert-tiny-v4-malicious-url-classifier"
-# Force readable labels regardless of model config
-ID2LABEL = {0: "benign", 1: "defacement", 2: "malware", 3: "phishing"}
-URL_RE = re.compile(r"""(?xi)\b(?:https?://|www\.)[^\s<>"'()]+""")
-KEYWORDS = {
-    "phish","login","verify","account","secure","update","bank","wallet",
-    "password","invoice","pay","reset","support","unlock","confirm"
-}
-SUSPICIOUS_TLDS = {
-    "zip","mov","lol","xyz","top","country","link","click","cam","help",
-    "gq","cf","tk","work","rest","monster","quest","live","io","ly"
-}
-URL_SHORTENERS = {
-    "bit.ly","tinyurl.com","t.co","goo.gl","is.gd","buff.ly","ow.ly","rebrand.ly","cutt.ly"
-}
-_tok = None
-_mdl = None
-# ---------- utils ----------
-def _extract_urls(text: str):
-    raw = [m.group(0).strip() for m in URL_RE.finditer(text or "")]
-    cleaned = []
-    for u in raw:
-        u = u.rstrip(").,;:!?•]}>\"'")
-        cleaned.append(u)
-    return sorted(set(cleaned))
-def _load_model():
-    global _tok, _mdl
-    if _tok is not None and _mdl is not None:
-        return _tok, _mdl
-    from transformers import AutoTokenizer, AutoModelForSequenceClassification
-    _tok = AutoTokenizer.from_pretrained(URL_MODEL_ID)
-    _mdl = AutoModelForSequenceClassification.from_pretrained(URL_MODEL_ID)
-    _mdl.eval()
-    return _tok, _mdl
-def _softmax(logits: torch.Tensor):
-    return F.softmax(logits, dim=-1).tolist()
-def _results_table(rows):
-    lines = [
-        "| URL | Model | Model Prob (%) | Heuristic | Fused Risk | Decision | Reasons |",
-        "|---|---|---:|---:|---:|:--:|---|",
-    ]
-    for r in rows:
-        u, lbl, pct, h, fused, decision, reasons = r
-        lines.append(
-            f"| `{u}` | **{lbl}** | {pct:.2f} | {h:.2f} | {fused:.2f} | {decision} | {reasons} |"
-        )
-    return "\n".join(lines)
-def _forensic_block(url, token_ids, tokens, scores_sorted, cls_vec, elapsed_s, truncated):
-    toks_prev = ", ".join(tokens[:64]) + (" …" if len(tokens) > 64 else "")
-    ids_prev  = ", ".join(map(str, token_ids[:64])) + (" …" if len(token_ids) > 64 else "")
-    cls_dim = len(cls_vec)
-    cls_prev = ", ".join(f"{v:.4f}" for v in cls_vec[:16]) + (" …" if cls_dim > 16 else "")
-    l2 = (sum(v*v for v in cls_vec)) ** 0.5
-    md = []
-    md.append(f"### 🔍 Forensics for `{url}`\n")
-    md.append(f"- tokens: **{len(tokens)}** • truncated: **{'yes' if truncated else 'no'}**")
-    md.append(f"- inference time: **{elapsed_s:.2f}s**\n")
-    md.append("**Top-k scores**")
-    md.append("| Class | Prob (%) | Logit |\n|---|---:|---:|")
-    for s in scores_sorted:
-        md.append(f"| **{s['label']}** | {s['prob']*100:.2f} | {s['logit']:.3f} |")
-    md.append("\n**Token IDs (preview)**")
-    md.append("```txt\n" + ids_prev + "\n```")
-    md.append("**Tokens (preview)**")
-    md.append("```txt\n" + toks_prev + "\n```")
-    md.append("**[CLS] embedding (preview)**")
-    md.append(f"`dim={cls_dim}`, `L2={l2:.4f}`")
-    md.append("```txt\n" + cls_prev + "\n```")
-    return "\n".join(md)
-# ---------- heuristics ----------
-def _safe_parse(url: str):
-    if not re.match(r"^https?://", url, re.I):
-        url = "http://" + url
-    return urllib.parse.urlparse(url)
-def _split_reg_domain(host: str):
-    parts = host.split(".")
-    if len(parts) >= 2:
-        return parts[-2] + "." + parts[-1]
-    return host
-def _domain_parts(host: str):
-    if tldextract:
-        ext = tldextract.extract(host)  # subdomain, domain, suffix
-        regdom = f"{ext.domain}.{ext.suffix}" if ext.domain and ext.suffix else host
-        sub = ext.subdomain or ""
-        tld = ext.suffix or ""
-        core = ext.domain or ""
-    else:
-        regdom = _split_reg_domain(host)
-        tld = regdom.split(".")[-1] if "." in regdom else ""
-        sub = host[:-len(regdom)].rstrip(".") if host.endswith(regdom) else ""
-        core = regdom.split(".")[0] if "." in regdom else regdom
-    return regdom, sub, core, tld
-def heuristic_features(u: str):
-    feats = {}
-    try:
-        p = _safe_parse(u)
-        feats["host"] = p.hostname or ""
-        feats["path"] = p.path or "/"
-        feats["query"] = p.query or ""
-        regdom, sub, core, tld = _domain_parts(feats["host"])
-        feats["registered_domain"] = regdom
-        feats["subdomain"] = sub
-        feats["tld"] = tld
-        feats["labels"] = feats["host"].count(".") + (1 if feats["host"] else 0)
-        feats["has_at"] = "@" in u
-        feats["has_port"] = bool(p.netloc and ":" in p.netloc.split("@")[-1])
-        feats["has_punycode"] = "xn--" in feats["host"]
-        feats["len_url"] = len(u)
-        feats["hyphen_in_regdom"] = "-" in (core or "")
-        low_host = feats["host"].lower()
-        low_path = feats["path"].lower()
-        feats["kw_in_path"] = int(any(k in low_path for k in KEYWORDS))
-        feats["kw_in_host"] = int(any(k in low_host for k in KEYWORDS))
-        feats["kw_in_subdomain_only"] = int(
-            feats["kw_in_host"] and (core and not any(k in (core.lower()) for k in KEYWORDS))
-        )
-        feats["suspicious_tld"] = int((feats["tld"].split(".")[-1] or "") in SUSPICIOUS_TLDS)
-        feats["is_shortener"] = int(regdom.lower() in URL_SHORTENERS)
-        alnum = sum(c.isalnum() for c in feats["query"])
-        feats["query_ratio_alnum"] = (alnum / max(1, len(feats["query"]))) if feats["query"] else 0.0
-        feats["parse_error"] = False
-    except Exception:
-        feats = {"parse_error": True}
-    return feats
-def heuristic_score(feats: dict) -> float:
-    if feats.get("parse_error"):
-        return 0.80
-    s = 0.0
-    s += 0.28 * feats["kw_in_path"]
-    s += 0.24 * feats["kw_in_subdomain_only"]
-    s += 0.10 * feats["kw_in_host"]
-    s += 0.12 * feats["hyphen_in_regdom"]
-    s += 0.10 * (feats["labels"] >= 4)
-    s += 0.10 * feats["has_punycode"]
-    s += 0.12 * feats["suspicious_tld"]
-    s += 0.10 * feats["is_shortener"]
-    s += 0.05 * feats["has_at"]
-    s += 0.05 * feats["has_port"]
-    s += 0.10 * (feats["len_url"] >= 100)
-    if feats.get("query") and len(feats.get("query", "")) >= 40 and feats.get("query_ratio_alnum", 0) > 0.9:
-        s += 0.10
-    return max(0.0, min(1.0, s))
-def heuristic_reasons(feats: dict) -> str:
-    if feats.get("parse_error"):
-        return "parse error"
-    rs = []
-    if feats.get("is_shortener"): rs.append("URL shortener")
-    if feats.get("kw_in_path"): rs.append("keyword in path")
-    if feats.get("kw_in_subdomain_only"): rs.append("keyword in subdomain")
-    if feats.get("kw_in_host") and not feats.get("kw_in_subdomain_only"): rs.append("keyword in host")
-    if feats.get("hyphen_in_regdom"): rs.append("hyphen in registered domain")
-    if feats.get("labels", 0) >= 4: rs.append("deep subdomain nesting")
-    if feats.get("has_punycode"): rs.append("punycode host")
-    if feats.get("suspicious_tld"): rs.append(f"suspicious TLD: {feats.get('tld')}")
-    if feats.get("has_at"): rs.append("@ in URL")
-    if feats.get("has_port"): rs.append("explicit port")
-    if feats.get("len_url", 0) >= 100: rs.append("very long URL")   # ✅ fixed
-    if feats.get("query") and len(feats.get("query", "")) >= 40 and feats.get("query_ratio_alnum", 0) > 0.9:
-        rs.append("long query blob")
-    return ", ".join(rs) if rs else "no heuristic triggers"
-def heuristic_hard_flag(feats: dict) -> (bool, str):
-    if feats.get("parse_error"):
-        return True, "unparsable URL"
-    if feats.get("kw_in_subdomain_only") and feats.get("kw_in_path"):
-        return True, "keyword in subdomain + keyword in path"
-    if feats.get("is_shortener") and (feats.get("kw_in_host") or feats.get("kw_in_path")):
-        return True, "URL shortener + keyword"
-    if feats.get("suspicious_tld") and (feats.get("kw_in_host") or feats.get("kw_in_path")):
-        return True, "suspicious TLD + keyword"
-    if feats.get("labels", 0) >= 4 and (feats.get("kw_in_host") or feats.get("kw_in_path")):
-        return True, "deep subdomain nesting + keyword"
-    return False, ""
-# ---------- core ----------
-def _parse_allowlist(s: str):
-    items = re.split(r"[,\s]+", (s or "").strip())
-    return {x.strip().lower() for x in items if x.strip()}
-def analyze(
-    text: str,
-    forensic: bool,
-    show_json: bool,
-    threshold: float,
-    allowlist_txt: str,
-    allowlist_override: bool
-):
-    """
-    One Markdown output:
-      - verdict + table (model, heuristic, fused + decision + reasons)
-      - optional forensic blocks
-      - optional raw JSON
-    """
-    text = (text or "").strip()
-    if not text:
-        return "Paste an email body or a URL."
-    urls = [text] if (text.lower().startswith(("http://","https://","www.")) and " " not in text) else _extract_urls(text)
-    if not urls:
-        return "No URLs detected in the text."
-    allowset = _parse_allowlist(allowlist_txt)
-    tok, mdl = _load_model()
-    rows = []
-    forensic_blocks = []
-    export_data = {"model_id": URL_MODEL_ID, "items": []}
-    any_unsafe = False
-    for u in urls:
-        # model forward
-        max_len = min(512, getattr(mdl.config, "max_position_embeddings", 512) or 512)
-        enc = tok(u, truncation=True, max_length=max_len, return_tensors="pt", return_attention_mask=True)
-        token_ids = enc["input_ids"][0].tolist()
-        tokens = tok.convert_ids_to_tokens(enc["input_ids"][0])
-        truncated = enc["input_ids"].shape[1] >= max_len and len(tokens) >= max_len
-        t0 = time.time()
-        with torch.no_grad():
-            out = mdl(**enc, output_hidden_states=True)
-        elapsed = time.time() - t0
-        logits = out.logits.squeeze(0)
-        probs  = _softmax(logits)
-        scores = [{"label": ID2LABEL[i], "prob": float(probs[i]), "logit": float(logits[i])}
-                  for i in range(len(probs))]
-        scores_sorted = sorted(scores, key=lambda x: x["prob"], reverse=True)
-        top = scores_sorted[0]
-        # heuristics
-        feats = heuristic_features(u)
-        regdom = feats.get("registered_domain", "").lower()
-        h_flag, h_reason = heuristic_hard_flag(feats)
-        h_score = heuristic_score(feats)
-        mdl_phish_like = sum(s["prob"] for s in scores_sorted if s["label"] in {"phishing","malware","defacement"})
-        fused = 0.50 * mdl_phish_like + 0.50 * h_score
-        # allowlist override (domain-based)
-        allow_hit = regdom in allowset if regdom else False
-        decision = "🛑 UNSAFE"
-        reasons = (h_reason + (", " if h_reason else "") + heuristic_reasons(feats)).strip(", ")
-        if allow_hit and allowlist_override:
-            decision = "✅ SAFE"
-            reasons = f"allowlisted domain ({regdom})"
-            fused = min(fused, 0.01)  # clamp down the risk for display
-        else:
-            decision = "🛑 UNSAFE" if (h_flag or fused >= float(threshold)) else "✅ SAFE"
-        if decision.startswith("🛑"):
-            any_unsafe = True
-        rows.append([u, top["label"], top["prob"]*100.0, h_score, fused, decision, reasons])
-        # export + forensics
-        hidden_states = out.hidden_states
-        cls_vec = hidden_states[-1][0, 0, :].cpu().tolist()
-        export_data["items"].append({
-            "url": u, "token_ids": token_ids, "tokens": tokens, "truncated": truncated,
-            "logits": [float(x) for x in logits.cpu().tolist()], "probs": [float(p) for p in probs],
-            "scores_sorted": scores_sorted, "cls_vector": cls_vec, "cls_dim": len(cls_vec),
-            "elapsed_sec": elapsed, "heuristic": feats, "heuristic_score": h_score,
-            "fused_risk": fused, "hard_flag": h_flag, "hard_reason": h_reason,
-            "allowlisted": allow_hit
-        })
-        if forensic:
-            forensic_blocks.append(
-                _forensic_block(u, token_ids, tokens, scores_sorted, cls_vec, elapsed, truncated)
-            )
-    verdict = "🔴 **UNSAFE (at least one link flagged)**" if any_unsafe else "🟢 **SAFE (no link over threshold)**"
-    body = verdict + "\n\n" + _results_table(rows)
-    if forensic and forensic_blocks:
-        body += "\n\n---\n\n" + "\n\n---\n\n".join(forensic_blocks)
-    if show_json:
-        pretty = json.dumps(export_data, ensure_ascii=False, indent=2)
-        body += "\n\n---\n\n**Raw forensics JSON (copy & save):**\n"
-        body += "```json\n" + pretty + "\n```"
-    return body
-# ---------- UI ----------
-demo = gr.Interface(
-    fn=analyze,
-    inputs=[
-        gr.Textbox(lines=10, label="Email or URL", placeholder="Paste a URL or a full email…"),
-        gr.Checkbox(label="Forensic mode (tokens, logits, [CLS])", value=True),
-        gr.Checkbox(label="Show raw JSON at the end (copy/paste)", value=False),
-        gr.Slider(0.0, 1.0, value=0.40, step=0.01, label="Decision threshold (fused risk ≥ threshold → UNSAFE)"),
-        gr.Textbox(lines=2, label="Allowlist (domains, comma/space/newline separated)",
-                   placeholder="example.com, github.com  microsoft.com"),
-        gr.Checkbox(label="Allowlist overrides (force SAFE if registered domain matches)", value=True),
-    ],
-    outputs=gr.Markdown(label="Results"),
-    title="🛡️ PhishingMail — Model + Heuristics (HF Free CPU)",
-    description=(
-        "Extract links, score with a tiny HF URL model and transparent heuristics. "
-        "Short-circuits for classic phishing patterns. Adjust the threshold, and allowlist trusted domains."
-    ),
-)
-if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)

+import os, re, json
+from dataclasses import dataclass
+from typing import List, Dict, Tuple
+import gradio as gr
+# Optional imports for email classifier (loaded lazily)
+try:
+    import torch
+    from transformers import AutoTokenizer, AutoModelForSequenceClassification
+except Exception:
+    torch = None
+    AutoTokenizer = None
+    AutoModelForSequenceClassification = None
+# =========================
+# Config (env-overridable)
+# =========================
+# Hugging Face repo id tried first by load_email_model(); the default is a placeholder.
+EMAIL_CLASSIFIER_ID = os.getenv("EMAIL_CLASSIFIER_ID", "your-username/mini-phish")  # swap to your HF model repo later
+# Backbone loaded with a fresh 2-label head when the classifier above cannot be loaded.
+EMAIL_BACKBONE_ID   = os.getenv("EMAIL_BACKBONE_ID", "microsoft/MiniLM-L6-H384-uncased")
+# Fused risk at or above this value yields the "UNSAFE" verdict.
+THRESHOLD_TAU       = float(os.getenv("THRESHOLD_TAU", "0.40"))
+# Total token budget per email, including the [CLS]/[SEP] tokens added in score_email().
+MAX_SEQ_LEN         = int(os.getenv("MAX_SEQ_LEN", "320"))
+# Maximum number of subject tokens kept before the body is appended.
+SUBJECT_TOKEN_BUDGET= int(os.getenv("SUBJECT_TOKEN_BUDGET", "64"))
+# Linear fusion weights: R_total = FUSION_EMAIL_W * P_email + FUSION_URL_W * R_url_max.
+FUSION_EMAIL_W      = float(os.getenv("FUSION_EMAIL_W", "0.6"))
+FUSION_URL_W        = float(os.getenv("FUSION_URL_W", "0.4"))
+# A URL risk at/above this forces the fused risk up to at least 0.90.
+URL_OVERRIDE_HIGH   = float(os.getenv("URL_OVERRIDE_HIGH", "0.85"))
+# Same override, but at this lower URL risk when the email also has lexical cue hits.
+URL_OVERRIDE_KW     = float(os.getenv("URL_OVERRIDE_KW", "0.70"))
+# Upper bound applied to the fused risk when an allowlisted host is present.
+ALLOWLIST_SAFE_CAP  = float(os.getenv("ALLOWLIST_SAFE_CAP", "0.15"))
+# =========================
+# Simple data classes
+# =========================
+@dataclass
+class UrlResult:
+    """Heuristic risk assessment for a single URL."""
+    url: str  # the URL exactly as it was passed to the scorer
+    risk: float  # heuristic score; score_url_heuristic caps it at 1.0
+    reasons: List[str]  # names of the heuristics that fired for this URL
+@dataclass
+class EmailResult:
+    """Email-level phishing score plus the lexical cues found in the text."""
+    p_email: float  # estimated phishing probability for the email
+    kw_hits: List[str]  # LEXICAL_CUES phrases found in subject + body
+# =========================
+# URL extraction & heuristics (replace with your existing pipeline)
+# =========================
+# Matches a run starting with http://, https:// or www. (case-insensitive) and
+# extending until whitespace or one of < > " ) is reached.
+URL_REGEX = r'(?i)\b((?:https?://|www\.)[^\s<>")]+)'
+# Host suffixes that add 0.35 to the heuristic URL risk.
+SUSPICIOUS_TLDS = {".xyz", ".top", ".click", ".link", ".ru", ".cn", ".country", ".gq", ".ga", ".ml", ".tk"}
+# Link-shortener domains; a match adds 0.5 to the heuristic URL risk.
+SHORTENERS = {"bit.ly","t.co","tinyurl.com","goo.gl","ow.ly","is.gd","cutt.ly","tiny.one","lnkd.in"}
+def extract_urls(text: str) -> List[str]:
+    """Return the distinct URLs found in *text*, in order of first appearance.
+
+    Candidates are located with ``URL_REGEX`` and then stripped of trailing
+    sentence punctuation and closing brackets/quotes, so that a URL at the end
+    of a sentence ("... visit http://example.com!") does not carry the
+    punctuation into the host used by the scoring checks.
+
+    Args:
+        text: Free text (email subject/body). ``None`` or empty yields ``[]``.
+
+    Returns:
+        De-duplicated list of URL strings (comparison is case-sensitive).
+    """
+    if not text:
+        return []
+    seen = set()
+    unique: List[str] = []
+    for match in re.findall(URL_REGEX, text):
+        # Only trailing characters need trimming: the regex guarantees the
+        # match starts with "http" or "www".
+        candidate = match.strip().rstrip(").,;:!?]}>'\"")
+        if candidate and candidate not in seen:
+            seen.add(candidate)
+            unique.append(candidate)
+    return unique
+def url_host(url: str) -> str:
+    """Return the lower-cased hostname of *url*, or ``""`` if none can be found.
+
+    Unlike a plain "strip the scheme and cut at the first slash", this drops
+    the userinfo (``user@``), the port, and any query/fragment that directly
+    follows the host, so suffix checks (TLD, allowlist) see only the hostname.
+
+    Args:
+        url: A URL with or without a scheme (e.g. ``www.example.com/path``).
+
+    Returns:
+        The hostname without trailing dot, in lower case.
+    """
+    from urllib.parse import urlsplit
+
+    candidate = (url or "").strip()
+    # urlsplit only recognises the authority part when a scheme is present.
+    if not re.match(r"^[a-z][a-z0-9+.\-]*://", candidate, flags=re.I):
+        candidate = "http://" + candidate
+    try:
+        host = urlsplit(candidate).hostname
+    except ValueError:
+        # e.g. malformed IPv6 literal; treat as "no host"
+        host = None
+    return (host or "").rstrip(".").lower()
+def score_url_heuristic(url: str) -> UrlResult:
+    """Score a single URL with additive heuristics.
+
+    Starts from a base risk of 0.05 and adds a fixed increment for every
+    heuristic that fires; the sum is capped at 1.0.
+
+    Args:
+        url: The URL to score.
+
+    Returns:
+        ``UrlResult`` with the capped risk and the names of the fired heuristics.
+    """
+    host = url_host(url)
+    score = 0.05
+    reasons = []
+    if len(url) > 140:
+        score += 0.15
+        reasons.append("very_long_url")
+    if "@" in url or "%" in url:
+        score += 0.2
+        reasons.append("special_chars")
+    if any(host.endswith(t) for t in SUSPICIOUS_TLDS):
+        score += 0.35
+        reasons.append("suspicious_tld")
+    # Match the shortener domain itself or one of its subdomains. A plain
+    # substring test would flag e.g. "microsoft.com" because it contains "t.co".
+    if any(host == s or host.endswith("." + s) for s in SHORTENERS):
+        score += 0.5
+        reasons.append("shortener")
+    if host.count(".") >= 3:
+        score += 0.2
+        reasons.append("deep_subdomain")
+    # Counts upper-case ASCII letters anywhere in the URL.
+    if len(re.findall(r"[A-Z]", url)) > 16:
+        score += 0.1
+        reasons.append("mixed_case")
+    return UrlResult(url=url, risk=min(score, 1.0), reasons=reasons)
+def score_urls(urls: List[str]) -> List[UrlResult]:
+    """Apply the heuristic scorer to each URL, preserving input order."""
+    return list(map(score_url_heuristic, urls))
+# =========================
+# Email classifier with fallback
+# =========================
+# Module-level cache filled by load_email_model() on first successful load.
+_tokenizer = None
+_model = None
+# Lower-case phrases searched for (as substrings) in the lower-cased subject + body.
+LEXICAL_CUES = [
+    "verify your account","update your password","immediately","within 24 hours",
+    "suspended","unusual activity","confirm","login","click","invoice","payment",
+    "otp","one-time password","unlock","reactivate","restricted","authenticate",
+    "security alert","urgent","limited time"
+]
+def load_email_model() -> Tuple[object, object]:
+    """Return the cached ``(tokenizer, model)`` pair, loading it on first use.
+
+    Load order:
+      1. ``EMAIL_CLASSIFIER_ID`` (the fine-tuned classifier).
+      2. ``EMAIL_BACKBONE_ID`` with a freshly initialised 2-label head.
+    The loaded model is put in eval mode on CPU and, where available,
+    dynamically quantized. Every fallback is logged so that a Space silently
+    running on an untrained head (or unquantized) can be diagnosed.
+
+    Returns:
+        ``(tokenizer, model)``, or ``(None, None)`` when torch/transformers
+        are unavailable or neither checkpoint could be loaded.
+    """
+    global _tokenizer, _model
+    import logging
+
+    log = logging.getLogger(__name__)
+    if _tokenizer is not None and _model is not None:
+        return _tokenizer, _model
+    if AutoTokenizer is None or AutoModelForSequenceClassification is None or torch is None:
+        # environment without torch/transformers (Space will still boot)
+        return None, None
+    # Try the preferred classifier first
+    model_id = EMAIL_CLASSIFIER_ID
+    try:
+        _tokenizer = AutoTokenizer.from_pretrained(model_id)
+        _model = AutoModelForSequenceClassification.from_pretrained(model_id)
+    except Exception as exc:
+        log.warning(
+            "Could not load email classifier %r (%s); falling back to backbone %r "
+            "with an untrained classification head",
+            model_id, exc, EMAIL_BACKBONE_ID,
+        )
+        # Fallback: load backbone and attach a tiny random head
+        try:
+            _tokenizer = AutoTokenizer.from_pretrained(EMAIL_BACKBONE_ID)
+            _model = AutoModelForSequenceClassification.from_pretrained(
+                EMAIL_BACKBONE_ID, num_labels=2, problem_type="single_label_classification"
+            )
+        except Exception as fallback_exc:
+            log.warning("Could not load backbone %r (%s)", EMAIL_BACKBONE_ID, fallback_exc)
+            _tokenizer, _model = None, None
+            return None, None
+    # Dynamic quantization for CPU (best effort: the unquantized model is kept on failure)
+    try:
+        _model.eval()
+        _model.to("cpu")
+        if hasattr(torch, "quantization"):
+            from torch.quantization import quantize_dynamic
+            _model = quantize_dynamic(_model, {torch.nn.Linear}, dtype=torch.qint8)  # type: ignore
+    except Exception as exc:
+        log.warning("CPU preparation/quantization failed, using model as loaded: %s", exc)
+    return _tokenizer, _model
+def _truncate_for_budget(tokens_subject: List[int], tokens_body: List[int], max_len: int, subj_budget: int):
+    """Concatenate subject and body token ids without exceeding *max_len*.
+
+    The subject gets at most ``subj_budget`` tokens (never more than
+    ``max_len``); the body fills whatever room is left.
+
+    Args:
+        tokens_subject: Token ids of the subject.
+        tokens_body: Token ids of the body.
+        max_len: Maximum length of the returned list.
+        subj_budget: Maximum number of subject tokens to keep.
+
+    Returns:
+        A new list of at most ``max_len`` token ids, subject first.
+    """
+    # Clamp so a mis-set budget (negative, or larger than max_len) can neither
+    # slice from the end nor overflow the total length.
+    subj_limit = max(0, min(subj_budget, max_len))
+    subj = tokens_subject[:subj_limit]
+    remain = max(0, max_len - len(subj))
+    body = tokens_body[:remain]
+    return subj + body
+def score_email(subject: str, body: str) -> EmailResult:
+    """Estimate the phishing probability of an email.
+
+    Uses the model from ``load_email_model()`` when available; otherwise
+    falls back to a purely lexical estimate based on ``LEXICAL_CUES`` hits.
+
+    Args:
+        subject: Email subject (``None``/empty allowed).
+        body: Email body text (``None``/empty allowed).
+
+    Returns:
+        ``EmailResult`` with ``p_email`` and the matched cue phrases.
+        ``p_email`` is clamped to [0.01, 0.99] on the model path and capped at
+        0.99 on the lexical fallback.
+    """
+    text = (subject or "") + "\n" + (body or "")
+    # lightweight lexical cues for reasons + kw_flag
+    lowered = text.lower()
+    hits = [c for c in LEXICAL_CUES if c in lowered]
+    tok, mdl = load_email_model()
+    if tok is None or mdl is None:
+        # fallback purely lexical probability
+        base = 0.15 + 0.1 * len(hits)
+        return EmailResult(p_email=float(min(base, 0.99)), kw_hits=hits)
+    # tokenize with budget (two positions are reserved for [CLS]/[SEP])
+    encoded_subj = tok.encode(subject or "", add_special_tokens=False)
+    encoded_body = tok.encode(body or "", add_special_tokens=False)
+    input_ids = _truncate_for_budget(encoded_subj, encoded_body, MAX_SEQ_LEN-2, SUBJECT_TOKEN_BUDGET)
+    # Some tokenizers define no CLS/SEP token; a None id cannot go into a tensor.
+    cls_id = getattr(tok, "cls_token_id", None)
+    sep_id = getattr(tok, "sep_token_id", None)
+    if cls_id is not None:
+        input_ids = [cls_id] + input_ids
+    if sep_id is not None:
+        input_ids = input_ids + [sep_id]
+    attn_mask = [1]*len(input_ids)
+    import torch
+    ids = torch.tensor([input_ids], dtype=torch.long)
+    mask = torch.tensor([attn_mask], dtype=torch.long)
+    with torch.no_grad():
+        out = mdl(input_ids=ids, attention_mask=mask)
+    logits = getattr(out, "logits", None)
+    if logits is None:
+        p_email = 0.5
+    else:
+        row = logits[0].detach().float().cpu().reshape(-1)
+        if row.numel() == 1:
+            # single-logit head: interpret as a binary logit
+            p_email = float(torch.sigmoid(row)[0])
+        else:
+            # assume label 1 = phishing; torch.softmax is numerically stable
+            p_email = float(torch.softmax(row, dim=-1)[1])
+    # small calibration nudge from lexical cues (kept light)
+    p_email = float(min(0.99, max(0.01, p_email + 0.03*len(hits))))
+    return EmailResult(p_email=p_email, kw_hits=hits)
+# =========================
+# Fusion
+# =========================
+def fuse(email_res: EmailResult, url_results: List[UrlResult], allowlist_domains: List[str],
+         tau: "float | None" = None) -> Dict:
+    """Fuse the email score and per-URL risks into a single verdict.
+
+    ``R_total = FUSION_EMAIL_W * P_email + FUSION_URL_W * max(URL risk)``,
+    raised to at least 0.90 when a URL is very risky (or moderately risky
+    together with lexical cue hits), and capped at ``ALLOWLIST_SAFE_CAP`` when
+    an allowlisted host is present.
+
+    Args:
+        email_res: Result of the email scorer.
+        url_results: Per-URL heuristic results (may be empty).
+        allowlist_domains: Trusted domains; a host matches when it equals the
+            domain or is a subdomain of it.
+        tau: Decision threshold for this call. ``None`` uses ``THRESHOLD_TAU``.
+
+    Returns:
+        Dict with ``P_email``, ``R_url_max``, ``R_total``, ``kw_hits``,
+        ``allowlist_hit`` and ``verdict`` ("UNSAFE" or "SAFE").
+    """
+    threshold = THRESHOLD_TAU if tau is None else float(tau)
+    r_url_max = max((u.risk for u in url_results), default=0.0)
+    kw_flag = bool(email_res.kw_hits)
+    # Normalise allowlist entries and drop empties: an empty entry would make
+    # every host "match" a suffix test.
+    allowed = [
+        d for d in ((raw or "").strip().lower().lstrip(".") for raw in allowlist_domains) if d
+    ]
+    hosts = [url_host(u.url) for u in url_results]
+    # Match on a label boundary so "evilmicrosoft.com" does not pass for
+    # "microsoft.com".
+    # NOTE(review): a single allowlisted URL still caps the risk of the whole
+    # email even if other URLs are risky — confirm this is the intended policy.
+    allowlist_hit = any(h == d or h.endswith("." + d) for h in hosts for d in allowed)
+    r_total = FUSION_EMAIL_W * email_res.p_email + FUSION_URL_W * r_url_max
+    if (r_url_max >= URL_OVERRIDE_HIGH) or (kw_flag and r_url_max >= URL_OVERRIDE_KW):
+        r_total = max(r_total, 0.90)
+    if allowlist_hit:
+        r_total = min(r_total, ALLOWLIST_SAFE_CAP)
+    verdict = "UNSAFE" if r_total >= threshold else "SAFE"
+    return {
+        "P_email": round(email_res.p_email, 3),
+        "R_url_max": round(r_url_max, 3),
+        "R_total": round(r_total, 3),
+        "kw_hits": email_res.kw_hits,
+        "allowlist_hit": allowlist_hit,
+        "verdict": verdict
+    }
+# =========================
+# Gradio UI
+# =========================
+with gr.Blocks(title="PhishingMail-Lab") as demo:
+    gr.Markdown("# 🧪 PhishingMail‑Lab\nFree‑tier friendly POC with email+URL fusion")
+    with gr.Row():
+        subject = gr.Textbox(label="Subject", placeholder="Subject: Important account update")
+    body = gr.Textbox(label="Email Body (paste text or HTML)", lines=10, placeholder="Paste the email content here...")
+    with gr.Row():
+        allowlist = gr.Textbox(label="Allowlist domains (comma-separated)", placeholder="microsoft.com, amazon.com")
+        tau = gr.Slider(0, 1, value=THRESHOLD_TAU, step=0.01, label="Decision Threshold τ")
+    analyze_btn = gr.Button("Analyze")
+    verdict = gr.Label(label="Verdict")
+    fusion_json = gr.JSON(label="Fusion & Flags")
+    url_table = gr.Dataframe(headers=["URL","Risk","Reasons"], label="Per‑URL risk (heuristics demo)", interactive=False)
+    def run(subject_text, body_text, allowlist_text, tau_val):
+        """Button callback: score the email and its URLs, return (verdict, fusion dict, URL rows)."""
+        urls = list(dict.fromkeys(extract_urls((subject_text or "") + "\n" + (body_text or ""))))  # uniq while preserving order
+        url_results = score_urls(urls)
+        allow_domains = [d.strip().lower() for d in (allowlist_text or "").split(",") if d.strip()]
+        email_res = score_email(subject_text or "", body_text or "")
+        # Pass the slider value per call instead of overwriting the module
+        # global, which would leak one user's threshold into other sessions.
+        fused = fuse(email_res, url_results, allow_domains, tau=float(tau_val))
+        rows = [[u.url, round(u.risk,3), ", ".join(u.reasons)] for u in url_results]
+        return fused["verdict"], fused, rows
+    analyze_btn.click(run, [subject, body, allowlist, tau], [verdict, fusion_json, url_table])
+# Launch the Gradio app only when this file is executed as a script.
+if __name__ == "__main__":
+    demo.launch()

requirements.txt CHANGED Viewed

@@ -1,8 +1,7 @@
-gradio==4.44.1
-transformers==4.55.2
-# optional but recommended; code falls back if missing
-tldextract==5.1.2
---extra-index-url https://download.pytorch.org/whl/cpu
-torch==2.4.0+cpu

+gradio>=4.19,<5
+transformers>=4.41,<4.45
+torch>=2.2,<2.4
+tokenizers>=0.15,<0.20
+beautifulsoup4>=4.12,<5
+tldextract>=3.6,<4
+emoji>=2.10,<3