Spaces:
No application file
Upload errant_gec.py with huggingface_hub
errant_gec.py +225 -0
errant_gec.py
ADDED
@@ -0,0 +1,225 @@
"""ERRANT metric for Grammatical Error Correction evaluation.

This metric uses the ERRANT (ERRor ANnotation Toolkit) to evaluate
grammatical error correction systems by comparing edit operations
between source, prediction, and reference sentences.
"""

import datasets
import evaluate


_CITATION = """\
@inproceedings{bryant-etal-2017-automatic,
    title = "Automatic Annotation and Evaluation of Error Types for Grammatical Error Correction",
    author = "Bryant, Christopher and
      Felice, Mariano and
      Briscoe, Ted",
    booktitle = "Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
    month = jul,
    year = "2017",
    address = "Vancouver, Canada",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/P17-1074",
    doi = "10.18653/v1/P17-1074",
    pages = "793--805",
}
"""

_DESCRIPTION = """\
ERRANT (ERRor ANnotation Toolkit) is a metric for evaluating grammatical error
correction (GEC) systems. It computes precision, recall, and F-score by comparing
the edit operations needed to transform source sentences into predictions versus
the edit operations needed to transform source sentences into references.

This metric requires three inputs:
- sources: The original (uncorrected) sentences
- predictions: The model's corrected sentences
- references: The gold standard corrected sentences

The metric extracts edits using the ERRANT library and computes:
- Precision: What fraction of predicted edits are correct
- Recall: What fraction of gold edits were predicted
- F0.5: F-score with beta=0.5 (weighting precision twice as much as recall)
"""

_KWARGS_DESCRIPTION = """
Args:
    sources: list of source (original/uncorrected) sentences
    predictions: list of predicted (corrected) sentences
    references: list of reference (gold corrected) sentences
    lang: language code for spaCy model (default: "en")
        - "en": English (requires en_core_web_sm)
        - "nb": Norwegian Bokmål (requires nb_core_news_sm)
        - "de": German (requires de_core_news_sm)
        - etc. (any language with a spaCy model)
    beta: beta value for F-score calculation (default: 0.5)

Returns:
    precision: fraction of predicted edits that are correct
    recall: fraction of gold edits that were predicted
    f0.5: F-score with the specified beta value

Examples:
    >>> import evaluate
    >>> errant_gec = evaluate.load("marksverdhei/errant_gec")
    >>> results = errant_gec.compute(
    ...     sources=["This are a sentence ."],
    ...     predictions=["This is a sentence ."],
    ...     references=["This is a sentence ."],
    ...     lang="en"
    ... )
    >>> print(results)
    {'precision': 1.0, 'recall': 1.0, 'f0.5': 1.0}
"""

# Map language codes to spaCy model names
SPACY_MODELS = {
    "en": "en_core_web_sm",
    "nb": "nb_core_news_sm",
    "nn": "nb_core_news_sm",  # Use Bokmål model for Nynorsk as fallback
    "de": "de_core_news_sm",
    "es": "es_core_news_sm",
    "fr": "fr_core_news_sm",
    "it": "it_core_news_sm",
    "nl": "nl_core_news_sm",
    "pt": "pt_core_news_sm",
    "ru": "ru_core_news_sm",
    "zh": "zh_core_web_sm",
    "ja": "ja_core_news_sm",
}


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Errant(evaluate.Metric):
    """ERRANT metric for grammatical error correction evaluation."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._annotators = {}  # Cache annotators per language

    def _info(self):
        return evaluate.MetricInfo(
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "sources": datasets.Value("string"),
                    "predictions": datasets.Value("string"),
                    "references": datasets.Value("string"),
                }
            ),
            reference_urls=["https://github.com/chrisjbryant/errant"],
        )

    def _get_annotator(self, lang: str):
        """Get or create an ERRANT annotator for the specified language."""
        if lang in self._annotators:
            return self._annotators[lang]

        import errant
        import spacy

        model_name = SPACY_MODELS.get(lang, f"{lang}_core_news_sm")

        try:
            nlp = spacy.load(model_name)
        except OSError:
            raise OSError(
                f"spaCy model '{model_name}' not found. "
                f"Please install it with: python -m spacy download {model_name}"
            )

        # ERRANT itself only ships English edit-classification rules, so it is
        # always loaded as "en"; language-specific tokenization and tagging
        # come from the spaCy pipeline passed in above.
        annotator = errant.load("en", nlp)
        self._annotators[lang] = annotator
        return annotator

    def _get_edits(self, annotator, orig_doc, cor_doc):
        """Extract edits between original and corrected documents.

        Returns a set of (o_start, o_end, o_str, c_str) tuples.
        """
        edits = annotator.annotate(orig_doc, cor_doc)
        edit_set = set()
        for edit in edits:
            # Skip noop edits (no actual change)
            if edit.o_str == edit.c_str:
                continue
            # Use span positions and strings as edit identifier
            edit_set.add((edit.o_start, edit.o_end, edit.o_str, edit.c_str))
        return edit_set

    def _compute_fscore(self, tp: int, fp: int, fn: int, beta: float = 0.5) -> dict:
        """Compute precision, recall, and F-score."""
        precision = float(tp) / (tp + fp) if (tp + fp) > 0 else 1.0
        recall = float(tp) / (tp + fn) if (tp + fn) > 0 else 1.0

        if precision + recall > 0:
            f_score = float((1 + beta**2) * precision * recall) / (
                (beta**2 * precision) + recall
            )
        else:
            f_score = 0.0

        return {
            "precision": precision,
            "recall": recall,
            f"f{beta}": f_score,
        }

    def _compute(
        self,
        sources: list[str],
        predictions: list[str],
        references: list[str],
        lang: str = "en",
        beta: float = 0.5,
    ) -> dict:
        """Compute ERRANT scores for the given inputs.

        Args:
            sources: Original (uncorrected) sentences
            predictions: Model's corrected sentences
            references: Gold standard corrected sentences
            lang: Language code for spaCy model
            beta: Beta value for F-score (default 0.5)

        Returns:
            Dictionary with precision, recall, and f{beta} scores
        """
        if not (len(sources) == len(predictions) == len(references)):
            raise ValueError(
                f"Inputs must have the same length. Got sources={len(sources)}, "
                f"predictions={len(predictions)}, references={len(references)}"
            )

        annotator = self._get_annotator(lang)

        total_tp = 0
        total_fp = 0
        total_fn = 0

        for source, prediction, reference in zip(sources, predictions, references):
            # Parse sentences
            orig_doc = annotator.parse(source)
            hyp_doc = annotator.parse(prediction)
            ref_doc = annotator.parse(reference)

            # Get edit sets
            hyp_edits = self._get_edits(annotator, orig_doc, hyp_doc)
            ref_edits = self._get_edits(annotator, orig_doc, ref_doc)

            # Compute TP, FP, FN for this sample
            tp = len(ref_edits & hyp_edits)
            fp = len(hyp_edits - ref_edits)
            fn = len(ref_edits - hyp_edits)

            total_tp += tp
            total_fp += fp
            total_fn += fn

        return self._compute_fscore(total_tp, total_fp, total_fn, beta=beta)