2.2: AI Requirements Analysis
Overview
AI-powered requirements analysis uses Natural Language Processing (NLP) and machine learning to automatically analyze, classify, and validate requirements. This section covers techniques for quality assessment, ambiguity detection, and completeness checking.
AI Analysis Architecture
The following diagram shows the AI requirements analysis pipeline, from natural language input through NLP processing, quality scoring, and compliance checking to produce actionable improvement recommendations.
NLP-Based Quality Analyzer
Implementation
Note: This implementation uses spaCy (tested with v3.5+) and requires the
en_core_web_lg model. Quality thresholds (e.g., severity weights, score penalties) are illustrative defaults and should be calibrated to your organization's standards.
"""
AI Requirements Quality Analyzer
NLP-powered requirements analysis for ASPICE compliance
"""
import re
from dataclasses import dataclass
from enum import Enum
from typing import Any, Dict, List, Optional, Tuple

import spacy
from spacy.matcher import Matcher
class QualityDimension(Enum):
    """Quality dimensions under which requirement issues are reported."""
    COMPLETENESS = "completeness"  # subject/action present, no TBD/TBC markers
    CONSISTENCY = "consistency"    # NOTE(review): no analyzer in this file emits this yet
    CLARITY = "clarity"            # ambiguous wording, passive voice, length, abbreviations
    TESTABILITY = "testability"    # measurable values, no subjective terms
    ATOMICITY = "atomicity"        # exactly one requirement per statement
    FEASIBILITY = "feasibility"    # NOTE(review): no analyzer in this file emits this yet
@dataclass
class QualityIssue:
    """Represents a quality issue in a requirement.

    Produced by the individual ``_check_*`` analyzers and consumed by the
    scoring and reporting logic.
    """
    # Quality dimension the issue belongs to.
    dimension: QualityDimension
    severity: str  # high, medium, low — drives the score penalty in _calculate_score
    # Human-readable description of the problem.
    message: str
    # Actionable advice on how to fix the requirement.
    suggestion: str
    # Optional (start_char, end_char) offsets of the offending span in the text.
    position: Optional[Tuple[int, int]] = None
@dataclass
class AnalysisResult:
    """Complete analysis result for a requirement.

    Returned by RequirementsQualityAnalyzer.analyze(); aggregated by
    analyze_batch().
    """
    # Identifier of the analyzed requirement (e.g. 'SWE-001').
    requirement_id: str
    # Original requirement text.
    text: str
    # Overall quality score, 0-100 (100 = no issues found).
    quality_score: float
    # All issues detected by the analyzers.
    issues: List[QualityIssue]
    # Requirement-type scores (functional, performance, ...), normalized to sum to 1.
    classification: Dict[str, float]
    # Assorted metrics (word count, sentence count, ...).
    # FIX: was Dict[str, any] — the builtin any() is not a type; use typing.Any.
    metrics: Dict[str, Any]
class RequirementsQualityAnalyzer:
    """AI-powered requirements quality analyzer.

    Combines spaCy rule-based matching with POS/dependency analysis to
    detect quality defects (ambiguity, incompleteness, non-testability,
    non-atomicity, poor clarity), compute a 0-100 quality score, and
    heuristically classify each requirement by type.
    """

    def __init__(self, model_name: str = "en_core_web_lg"):
        """Load the spaCy pipeline and register detection patterns.

        Args:
            model_name: Name of an installed spaCy model.
        """
        self.nlp = spacy.load(model_name)
        self.matcher = Matcher(self.nlp.vocab)
        self._setup_patterns()

    def _setup_patterns(self):
        """Register token-level patterns used for issue detection."""
        # Weak modal/hedging words that undermine mandatory language.
        weak_words = ["should", "could", "might", "may", "possibly",
                      "probably", "sometimes", "often", "usually",
                      "generally", "typically"]
        weak_pattern = [{"LOWER": {"IN": weak_words}}]
        self.matcher.add("WEAK_WORD", [weak_pattern])
        # Vague quantifiers that cannot be verified objectively.
        vague_quant = ["some", "several", "many", "few", "various",
                       "numerous", "adequate", "sufficient", "appropriate"]
        vague_pattern = [{"LOWER": {"IN": vague_quant}}]
        self.matcher.add("VAGUE_QUANTIFIER", [vague_pattern])
        # Passive voice (simplified): a form of "be" + past participle.
        passive_pattern = [
            {"POS": "AUX", "LEMMA": "be"},
            {"POS": "VERB", "TAG": "VBN"}
        ]
        self.matcher.add("PASSIVE_VOICE", [passive_pattern])
        # Conjunction followed by a verb hints at fused requirements.
        conjunction_pattern = [
            {"LOWER": {"IN": ["and", "or"]}},
            {"POS": {"IN": ["VERB", "AUX"]}}
        ]
        self.matcher.add("MULTIPLE_REQ", [conjunction_pattern])

    def analyze(self, requirement_id: str, text: str) -> AnalysisResult:
        """Analyze a single requirement.

        Args:
            requirement_id: Identifier carried into the result.
            text: Requirement text in natural language.

        Returns:
            AnalysisResult with issues, score, classification and metrics.
        """
        doc = self.nlp(text)
        issues = []
        # Run all analyzers.
        issues.extend(self._check_ambiguity(doc))
        issues.extend(self._check_completeness(doc, text))
        issues.extend(self._check_testability(doc, text))
        issues.extend(self._check_atomicity(doc, text))
        issues.extend(self._check_clarity(doc, text))
        # Calculate quality score.
        score = self._calculate_score(issues)
        # Classify requirement.
        classification = self._classify_requirement(doc, text)
        # Calculate metrics.
        metrics = self._calculate_metrics(doc, text)
        return AnalysisResult(
            requirement_id=requirement_id,
            text=text,
            quality_score=score,
            issues=issues,
            classification=classification,
            metrics=metrics
        )

    def _check_ambiguity(self, doc) -> List[QualityIssue]:
        """Check for ambiguous language via the registered matcher patterns."""
        issues = []
        matches = self.matcher(doc)
        for match_id, start, end in matches:
            match_name = self.nlp.vocab.strings[match_id]
            span = doc[start:end]
            if match_name == "WEAK_WORD":
                issues.append(QualityIssue(
                    dimension=QualityDimension.CLARITY,
                    severity="high",
                    message=f"Ambiguous word '{span.text}' found",
                    suggestion=f"Replace '{span.text}' with 'shall' for mandatory requirements",
                    position=(span.start_char, span.end_char)
                ))
            elif match_name == "VAGUE_QUANTIFIER":
                issues.append(QualityIssue(
                    dimension=QualityDimension.TESTABILITY,
                    severity="high",
                    message=f"Vague quantifier '{span.text}' found",
                    suggestion=f"Replace '{span.text}' with a specific measurable value",
                    position=(span.start_char, span.end_char)
                ))
            elif match_name == "PASSIVE_VOICE":
                issues.append(QualityIssue(
                    dimension=QualityDimension.CLARITY,
                    severity="medium",
                    message="Passive voice detected",
                    suggestion="Use active voice to clarify responsibility",
                    position=(span.start_char, span.end_char)
                ))
            # MULTIPLE_REQ matches are intentionally ignored here; atomicity
            # is assessed separately in _check_atomicity.
        return issues

    def _check_completeness(self, doc, text: str) -> List[QualityIssue]:
        """Check for completeness indicators (shall, subject, verb, no TBDs)."""
        issues = []
        # Check for 'shall' statement.
        if 'shall' not in text.lower():
            issues.append(QualityIssue(
                dimension=QualityDimension.COMPLETENESS,
                severity="high",
                message="Missing 'shall' statement",
                suggestion="Requirements should contain 'shall' to indicate obligation"
            ))
        # Check for subject (who/what).
        has_subject = any(token.dep_ == "nsubj" for token in doc)
        if not has_subject:
            issues.append(QualityIssue(
                dimension=QualityDimension.COMPLETENESS,
                severity="medium",
                message="Missing clear subject",
                suggestion="Specify what system or component this requirement applies to"
            ))
        # Check for action (verb).
        has_verb = any(token.pos_ == "VERB" for token in doc)
        if not has_verb:
            issues.append(QualityIssue(
                dimension=QualityDimension.COMPLETENESS,
                severity="medium",
                message="Missing action/verb",
                suggestion="Specify what action the system shall perform"
            ))
        # Check for TBD/TBC markers (one issue per marker found).
        tbd_patterns = ['tbd', 'tbc', 'to be determined', 'to be confirmed', 'xxx']
        for pattern in tbd_patterns:
            if pattern in text.lower():
                issues.append(QualityIssue(
                    dimension=QualityDimension.COMPLETENESS,
                    severity="high",
                    message=f"Incomplete marker '{pattern.upper()}' found",
                    suggestion="Complete the requirement before approval"
                ))
        return issues

    def _check_testability(self, doc, text: str) -> List[QualityIssue]:
        """Check if requirement is testable (measurable, non-subjective)."""
        issues = []
        # Check for measurable values.
        has_number = any(token.like_num for token in doc)
        has_unit = any(token.text.lower() in ['ms', 'seconds', 's', 'bytes',
                                              'kb', 'mb', '%', 'percent',
                                              'hz', 'khz', 'mhz']
                       for token in doc)
        # Performance/timing requirements should carry concrete values.
        performance_keywords = ['time', 'response', 'latency', 'throughput',
                                'speed', 'rate', 'frequency', 'interval']
        has_performance_keyword = any(kw in text.lower() for kw in performance_keywords)
        if has_performance_keyword and not (has_number or has_unit):
            issues.append(QualityIssue(
                dimension=QualityDimension.TESTABILITY,
                severity="high",
                message="Performance requirement without measurable value",
                suggestion="Add specific numeric values with units (e.g., '< 100ms')"
            ))
        # Check for subjective terms that cannot be objectively verified.
        subjective_terms = ['user-friendly', 'easy', 'simple', 'intuitive',
                            'fast', 'efficient', 'robust', 'flexible',
                            'scalable', 'reliable', 'secure']
        for term in subjective_terms:
            if term in text.lower():
                issues.append(QualityIssue(
                    dimension=QualityDimension.TESTABILITY,
                    severity="medium",
                    message=f"Subjective term '{term}' is not testable",
                    suggestion=f"Define specific, measurable criteria for '{term}'"
                ))
        return issues

    def _check_atomicity(self, doc, text: str) -> List[QualityIssue]:
        """Check if requirement is atomic (single requirement)."""
        issues = []
        # NOTE: a previous revision also counted MULTIPLE_REQ matcher hits
        # here but never used the count; that dead code has been removed.
        # Check sentence count.
        sentences = list(doc.sents)
        if len(sentences) > 2:
            issues.append(QualityIssue(
                dimension=QualityDimension.ATOMICITY,
                severity="medium",
                message=f"Requirement has {len(sentences)} sentences",
                suggestion="Consider splitting into multiple atomic requirements"
            ))
        # Check for multiple 'shall' statements.
        shall_count = text.lower().count('shall')
        if shall_count > 1:
            issues.append(QualityIssue(
                dimension=QualityDimension.ATOMICITY,
                severity="high",
                message=f"Multiple 'shall' statements ({shall_count}) detected",
                # BUG FIX: the suggestion string was missing its f-prefix, so
                # the literal text '{shall_count}' was shown to users.
                suggestion=f"Split into {shall_count} separate requirements"
            ))
        return issues

    def _check_clarity(self, doc, text: str) -> List[QualityIssue]:
        """Check requirement clarity (length, nesting, abbreviations)."""
        issues = []
        # Word count check (punctuation excluded).
        word_count = len([token for token in doc if not token.is_punct])
        if word_count < 5:
            issues.append(QualityIssue(
                dimension=QualityDimension.CLARITY,
                severity="medium",
                message=f"Requirement too short ({word_count} words)",
                suggestion="Add more context and specificity"
            ))
        if word_count > 60:
            issues.append(QualityIssue(
                dimension=QualityDimension.CLARITY,
                severity="medium",
                message=f"Requirement too long ({word_count} words)",
                suggestion="Consider splitting into multiple requirements"
            ))
        # Check for nested clauses (structural complexity).
        nested_clauses = sum(1 for token in doc
                             if token.dep_ in ['relcl', 'advcl', 'ccomp'])
        if nested_clauses > 2:
            issues.append(QualityIssue(
                dimension=QualityDimension.CLARITY,
                severity="medium",
                message=f"Complex sentence structure ({nested_clauses} nested clauses)",
                suggestion="Simplify sentence structure for clarity"
            ))
        # Check for abbreviations (2+ capitals) without inline definition.
        abbrev_pattern = r'\b[A-Z]{2,}\b'
        abbreviations = re.findall(abbrev_pattern, text)
        common_abbrevs = ['ID', 'API', 'UI', 'IO', 'USB', 'CAN', 'LIN', 'SPI']
        for abbrev in abbreviations:
            # A "(ABBREV)" occurrence is taken as an inline definition.
            if abbrev not in common_abbrevs and f"({abbrev})" not in text:
                issues.append(QualityIssue(
                    dimension=QualityDimension.CLARITY,
                    severity="low",
                    message=f"Abbreviation '{abbrev}' may need definition",
                    suggestion=f"Define '{abbrev}' or reference glossary"
                ))
        return issues

    def _calculate_score(self, issues: List[QualityIssue]) -> float:
        """Calculate quality score (0-100) from issue severities."""
        if not issues:
            return 100.0
        # Weight by severity; unknown severities cost a middling 5 points.
        severity_weights = {'high': 15, 'medium': 8, 'low': 3}
        total_penalty = sum(
            severity_weights.get(issue.severity, 5)
            for issue in issues
        )
        score = max(0, 100 - total_penalty)
        return round(score, 1)

    def _classify_requirement(self, doc, text: str) -> Dict[str, float]:
        """Classify requirement type via keyword heuristics.

        Returns scores per type; normalized to sum to 1 when any keyword hit.
        """
        scores = {
            'functional': 0.0,
            'performance': 0.0,
            'interface': 0.0,
            'safety': 0.0,
            'security': 0.0,
            'constraint': 0.0
        }
        text_lower = text.lower()
        # Keyword-based classification (substring matches, so e.g. 'can'
        # also matches inside longer words — a known limitation).
        if any(w in text_lower for w in ['shall', 'function', 'perform', 'process', 'calculate']):
            scores['functional'] += 0.5
        if any(w in text_lower for w in ['time', 'latency', 'throughput', 'response', 'performance', 'speed']):
            scores['performance'] += 0.6
        if any(w in text_lower for w in ['interface', 'protocol', 'communication', 'signal', 'bus', 'can', 'lin']):
            scores['interface'] += 0.6
        if any(w in text_lower for w in ['safe', 'hazard', 'asil', 'fault', 'failure', 'diagnostic']):
            scores['safety'] += 0.7
        if any(w in text_lower for w in ['secure', 'encrypt', 'authenticate', 'authorize', 'attack']):
            scores['security'] += 0.7
        if any(w in text_lower for w in ['constraint', 'limit', 'maximum', 'minimum', 'not exceed']):
            scores['constraint'] += 0.5
        # Normalize so scores sum to 1.
        total = sum(scores.values())
        if total > 0:
            scores = {k: round(v / total, 2) for k, v in scores.items()}
        return scores

    def _calculate_metrics(self, doc, text: str) -> Dict[str, Any]:
        """Calculate assorted metrics for reporting.

        FIX: annotation was Dict[str, any] (builtin any); now typing.Any.
        """
        return {
            'word_count': len([t for t in doc if not t.is_punct]),
            'sentence_count': len(list(doc.sents)),
            'entity_count': len(doc.ents),
            'has_shall': 'shall' in text.lower(),
            'has_numeric': any(t.like_num for t in doc),
            'complexity_score': sum(1 for t in doc if t.dep_ in ['relcl', 'advcl', 'ccomp'])
        }

    def analyze_batch(self, requirements: List[Dict]) -> Dict:
        """Analyze a batch of requirements.

        Args:
            requirements: Dicts with 'id' and 'text' keys.

        Returns:
            Aggregate report: counts, average score, pass rate (>= 80),
            top-10 issue frequencies, and per-requirement results.
        """
        results = []
        for req in requirements:
            result = self.analyze(req['id'], req['text'])
            results.append(result)
        # Aggregate statistics.
        total = len(results)
        passed = sum(1 for r in results if r.quality_score >= 80)
        avg_score = sum(r.quality_score for r in results) / total if total > 0 else 0
        # Issue frequency keyed by (dimension, message).
        issue_frequency = {}
        for result in results:
            for issue in result.issues:
                key = (issue.dimension.value, issue.message)
                issue_frequency[key] = issue_frequency.get(key, 0) + 1
        return {
            'total_requirements': total,
            'passed_requirements': passed,
            'failed_requirements': total - passed,
            'average_score': round(avg_score, 1),
            'pass_rate': round(passed / total * 100, 1) if total > 0 else 0,
            'top_issues': sorted(issue_frequency.items(), key=lambda x: x[1], reverse=True)[:10],
            'results': results
        }
# Usage example
if __name__ == '__main__':
    analyzer = RequirementsQualityAnalyzer()

    # Sample requirements: a good one, a vague one, and a non-atomic one.
    sample_ids = ['SWE-001', 'SWE-002', 'SWE-003']
    sample_texts = [
        'The door lock controller shall respond to lock/unlock commands within 100ms under normal operating conditions.',
        'The system should be fast and user-friendly.',
        'The controller shall lock the door and shall unlock the door and shall provide status indication.',
    ]
    batch = [{'id': rid, 'text': rtext}
             for rid, rtext in zip(sample_ids, sample_texts)]

    # Analyze the batch and print an aggregate summary.
    summary = analyzer.analyze_batch(batch)
    print(f"Total: {summary['total_requirements']}")
    print(f"Passed: {summary['passed_requirements']}")
    print(f"Average Score: {summary['average_score']}")
    print("\nTop Issues:")
    for (dim, msg), occurrences in summary['top_issues']:
        print(f" [{dim}] {msg}: {occurrences} occurrences")
Requirement Classification
ML-Based Classifier
Note: This classifier requires scikit-learn (tested with v1.3+). The training data shown is minimal for illustration; production use requires a substantially larger, domain-specific training dataset.
"""
ML-based Requirement Classifier
Classifies requirements by type, priority, and complexity
"""
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib
from typing import List, Dict, Tuple
import numpy as np
class RequirementClassifier:
    """ML-based requirement classifier.

    Trains independent TF-IDF + RandomForest pipelines to predict a
    requirement's type and its priority.
    """

    def __init__(self):
        self.type_classifier = None      # Pipeline; set by train_type_classifier/load_models
        self.priority_classifier = None  # Pipeline; set by train_priority_classifier/load_models
        # NOTE(review): this vectorizer is never used — each training pipeline
        # builds its own TfidfVectorizer. Kept so existing code that reads
        # `.vectorizer` keeps working; candidate for removal.
        self.vectorizer = TfidfVectorizer(
            max_features=5000,
            ngram_range=(1, 3),
            stop_words='english'
        )

    def _train(self, texts: List[str], labels: List[str],
               max_features: int, ngram_range: Tuple[int, int]) -> Tuple:
        """Shared train/evaluate routine for both classifiers.

        Returns:
            (fitted pipeline, {'accuracy': ..., 'report': ...}) evaluated on
            a 20% hold-out split.
        """
        X_train, X_test, y_train, y_test = train_test_split(
            texts, labels, test_size=0.2, random_state=42
        )
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)),
            ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
        ])
        pipeline.fit(X_train, y_train)
        predictions = pipeline.predict(X_test)
        report = classification_report(y_test, predictions, output_dict=True)
        return pipeline, {
            'accuracy': report['accuracy'],
            'report': report
        }

    def train_type_classifier(self, requirements: List[str],
                              types: List[str]) -> Dict:
        """Train requirement type classifier; returns evaluation metrics."""
        self.type_classifier, metrics = self._train(requirements, types, 5000, (1, 3))
        return metrics

    def train_priority_classifier(self, requirements: List[str],
                                  priorities: List[str]) -> Dict:
        """Train requirement priority classifier; returns evaluation metrics."""
        self.priority_classifier, metrics = self._train(requirements, priorities, 3000, (1, 2))
        return metrics

    def predict_type(self, requirement: str) -> Tuple[str, float]:
        """Predict requirement type with confidence (max class probability).

        Raises:
            ValueError: If the type classifier has not been trained/loaded.
        """
        # FIX: use `is None` — sklearn Pipeline defines __len__, so a plain
        # truthiness check could misbehave on an empty pipeline.
        if self.type_classifier is None:
            raise ValueError("Type classifier not trained")
        prediction = self.type_classifier.predict([requirement])[0]
        probabilities = self.type_classifier.predict_proba([requirement])[0]
        confidence = max(probabilities)
        return prediction, confidence

    def predict_priority(self, requirement: str) -> Tuple[str, float]:
        """Predict requirement priority with confidence.

        Raises:
            ValueError: If the priority classifier has not been trained/loaded.
        """
        if self.priority_classifier is None:
            raise ValueError("Priority classifier not trained")
        prediction = self.priority_classifier.predict([requirement])[0]
        probabilities = self.priority_classifier.predict_proba([requirement])[0]
        confidence = max(probabilities)
        return prediction, confidence

    def classify(self, requirement: str) -> Dict:
        """Classify requirement type and priority.

        Only dimensions whose classifier is available are included; with no
        trained classifiers an empty dict is returned.
        """
        result = {}
        if self.type_classifier is not None:
            req_type, type_conf = self.predict_type(requirement)
            result['type'] = {'value': req_type, 'confidence': round(type_conf, 3)}
        if self.priority_classifier is not None:
            priority, pri_conf = self.predict_priority(requirement)
            result['priority'] = {'value': priority, 'confidence': round(pri_conf, 3)}
        return result

    def save_models(self, path: str):
        """Persist both (possibly None) models to `path` via joblib."""
        models = {
            'type_classifier': self.type_classifier,
            'priority_classifier': self.priority_classifier
        }
        joblib.dump(models, path)

    def load_models(self, path: str):
        """Load models previously written by save_models()."""
        models = joblib.load(path)
        self.type_classifier = models.get('type_classifier')
        self.priority_classifier = models.get('priority_classifier')
class SimilarityDetector:
    """Detect similar, duplicate, and potentially conflicting requirements.

    Similarity is the dot product of TF-IDF vectors; scikit-learn's
    TfidfVectorizer L2-normalizes rows by default, so this equals cosine
    similarity.
    """

    def __init__(self):
        self.vectorizer = TfidfVectorizer(
            max_features=10000,
            ngram_range=(1, 3)
        )
        self.requirement_vectors = None  # sparse matrix; set by fit()
        self.requirement_ids = []        # ids aligned with the matrix rows

    def fit(self, requirements: List[Dict]):
        """Fit vectorizer on requirements corpus.

        Args:
            requirements: Dicts with 'id' and 'text' keys.
        """
        texts = [req['text'] for req in requirements]
        self.requirement_ids = [req['id'] for req in requirements]
        self.requirement_vectors = self.vectorizer.fit_transform(texts)

    def find_similar(self, requirement_id: str,
                     threshold: float = 0.7) -> List[Dict]:
        """Find requirements similar to the given one, most similar first.

        Unknown ids (including any id before fit()) yield an empty list.
        """
        if requirement_id not in self.requirement_ids:
            return []
        idx = self.requirement_ids.index(requirement_id)
        query_vector = self.requirement_vectors[idx]
        # Dot products of every row against the query vector.
        similarities = (self.requirement_vectors * query_vector.T).toarray().flatten()
        similar = [
            {'id': self.requirement_ids[i], 'similarity': round(float(sim), 3)}
            for i, sim in enumerate(similarities)
            if i != idx and sim >= threshold
        ]
        return sorted(similar, key=lambda x: x['similarity'], reverse=True)

    def find_duplicates(self, threshold: float = 0.95) -> List[Tuple]:
        """Find potential duplicate requirement pairs, most similar first.

        Raises:
            ValueError: If fit() has not been called yet (previously this
                crashed with an opaque TypeError on None).
        """
        if self.requirement_vectors is None:
            raise ValueError("SimilarityDetector is not fitted; call fit() first")
        duplicates = []
        # Pairwise similarity matrix (n x n).
        similarity_matrix = (self.requirement_vectors * self.requirement_vectors.T).toarray()
        n = len(self.requirement_ids)
        for i in range(n):
            for j in range(i + 1, n):
                if similarity_matrix[i, j] >= threshold:
                    duplicates.append((
                        self.requirement_ids[i],
                        self.requirement_ids[j],
                        round(float(similarity_matrix[i, j]), 3)
                    ))
        return sorted(duplicates, key=lambda x: x[2], reverse=True)

    def _mentions(self, text: str, term: str, counterpart: str) -> bool:
        """True if `text` contains `term` standalone, not only as part of
        `counterpart` (e.g. 'shall' appearing only inside 'shall not')."""
        if term not in text:
            return False
        if term in counterpart and counterpart in text:
            # Require at least one occurrence of `term` outside `counterpart`.
            return text.count(term) > text.count(counterpart)
        return True

    def find_conflicts(self, requirements: List[Dict]) -> List[Dict]:
        """Detect potentially conflicting requirements (heuristic).

        Flags pairs that use opposing keywords AND are similar enough
        (> 0.5) to likely concern the same subject. fit() must have been
        called with these requirements for similarity to be meaningful.
        """
        conflicts = []
        # Opposing keyword pairs (simple rule-based heuristic).
        conflict_patterns = [
            ('enable', 'disable'),
            ('shall', 'shall not'),
            ('allow', 'prevent'),
            ('maximum', 'minimum'),
            ('increase', 'decrease')
        ]
        for i, req1 in enumerate(requirements):
            for req2 in requirements[i + 1:]:
                text1 = req1['text'].lower()
                text2 = req2['text'].lower()
                for pattern1, pattern2 in conflict_patterns:
                    # BUG FIX: plain substring tests treated every 'shall not'
                    # as also containing a positive 'shall', producing false
                    # positives; _mentions() excludes that overlap.
                    forward = (self._mentions(text1, pattern1, pattern2) and
                               self._mentions(text2, pattern2, pattern1))
                    backward = (self._mentions(text2, pattern1, pattern2) and
                                self._mentions(text1, pattern2, pattern1))
                    if forward or backward:
                        # Only report pairs that appear to share a subject.
                        sim = self._calculate_similarity(req1['id'], req2['id'])
                        if sim > 0.5:
                            conflicts.append({
                                'req1': req1['id'],
                                'req2': req2['id'],
                                'pattern': f"{pattern1} vs {pattern2}",
                                'similarity': sim,
                                'confidence': 'medium'
                            })
        return conflicts

    def _calculate_similarity(self, id1: str, id2: str) -> float:
        """Dot-product similarity between two requirements (0.0 for unknown ids)."""
        if id1 not in self.requirement_ids or id2 not in self.requirement_ids:
            return 0.0
        idx1 = self.requirement_ids.index(id1)
        idx2 = self.requirement_ids.index(id2)
        vec1 = self.requirement_vectors[idx1]
        vec2 = self.requirement_vectors[idx2]
        similarity = (vec1 * vec2.T).toarray()[0, 0]
        return float(similarity)
# Usage
if __name__ == '__main__':
    # Sample training data: (requirement text, type label) pairs.
    labeled_examples = [
        ("The system shall respond within 100ms", "performance"),
        ("The controller shall calculate the checksum", "functional"),
        ("The interface shall use CAN 2.0B protocol", "interface"),
        ("The system shall detect single-bit faults", "safety"),
        ("The controller shall encrypt all communications", "security"),
    ]
    sample_texts = [text for text, _ in labeled_examples]
    sample_labels = [label for _, label in labeled_examples]

    classifier = RequirementClassifier()
    metrics = classifier.train_type_classifier(sample_texts, sample_labels)
    print(f"Type classifier accuracy: {metrics['accuracy']:.2%}")

    # Classify new requirement
    new_req = "The system shall transmit data every 50ms"
    prediction = classifier.classify(new_req)
    print(f"Predicted type: {prediction}")
Integration with the Requirements Management System (RMS)
CI/CD Pipeline
# GitHub Actions workflow for AI requirements analysis.
# Runs on changes under requirements/ and on a daily schedule; exports
# requirements from the RMS, runs the NLP analysis, enforces a quality
# gate, and files a GitHub issue when the gate fails.
name: Requirements Analysis
on:
  push:
    paths:
      - 'requirements/**'
  schedule:
    - cron: '0 6 * * *' # Daily at 6 AM (UTC)
jobs:
  analyze-requirements:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        # v4 -> v5: v4 runs on the deprecated node16 runtime.
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install dependencies
        run: |
          pip install spacy scikit-learn pandas
          python -m spacy download en_core_web_lg
      - name: Export requirements from RMS
        run: |
          python scripts/export_requirements.py \
            --server ${{ secrets.RMS_SERVER }} \
            --project "BCM_Door_Lock" \
            --output requirements.json
      - name: Run AI Analysis
        run: |
          python scripts/ai_analysis.py \
            --input requirements.json \
            --output analysis_report.json \
            --threshold 80
      - name: Check Quality Gate
        run: |
          python scripts/check_quality_gate.py \
            --report analysis_report.json \
            --min-score 80 \
            --max-high-issues 0
      - name: Generate Report
        run: |
          python scripts/generate_report.py \
            --input analysis_report.json \
            --output analysis_report.html
      - name: Upload Report
        # v3 -> v4: artifact actions v3 are deprecated and disabled by GitHub.
        uses: actions/upload-artifact@v4
        with:
          name: requirements-analysis
          path: analysis_report.html
      - name: Notify on Issues
        if: failure()
        # v6 -> v7: v6 runs on the deprecated node16 runtime; script API unchanged.
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const report = JSON.parse(fs.readFileSync('analysis_report.json'));
            let body = '## Requirements Quality Issues Found\n\n';
            body += `**Score:** ${report.average_score}%\n`;
            body += `**Failed:** ${report.failed_requirements} requirements\n\n`;
            if (report.top_issues.length > 0) {
              body += '### Top Issues\n';
              for (const [[dim, msg], count] of report.top_issues.slice(0, 5)) {
                body += `- [${dim}] ${msg}: ${count} occurrences\n`;
              }
            }
            github.rest.issues.create({
              owner: context.repo.owner,
              repo: context.repo.repo,
              title: 'Requirements Quality Issues',
              body: body,
              labels: ['requirements', 'quality']
            });
Summary
AI Requirements Analysis:
- NLP Analysis: Quality, ambiguity, completeness checking
- ML Classification: Type, priority, complexity prediction
- Similarity Detection: Duplicates and conflicts
- CI/CD Integration: Automated quality gates
- HITL (Human-in-the-Loop) Pattern: AI assists, human decides
NLP Quality Metrics and Analysis
Quality Metrics Framework
ASPICE Quality Characteristics
| Characteristic | Score | Status |
|---|---|---|
| Unambiguous | 85% | Good |
| Verifiable | 75% | Acceptable |
| Consistent | 92% | Excellent |
| Complete | 65% | Needs Improvement |
| Traceable | 95% | Excellent |
| Modifiable | 60% | Needs Improvement |
| Singular | 70% | Acceptable |
| Feasible | 85% | Good |
NLP-Measured Indicators
| Indicator | Value | Target | Status |
|---|---|---|---|
| Readability Score (Flesch-Kincaid) | 75/100 | >60 | Pass |
| Complexity Index (Word/Sentence) | 3.5 | <5 | Pass |
| Ambiguity Index (Weak Words %) | 12% | <5% | Fail |
| Testability Score (Measurable Terms) | 70% | >80% | Fail |
Comprehensive Quality Analyzer
Implementation
"""
Comprehensive NLP Requirements Quality Analyzer
Multi-dimensional quality assessment with detailed metrics
"""
import spacy
import numpy as np
from typing import List, Dict, Tuple, Optional
from dataclasses import dataclass, field
from collections import Counter
import re
import textstat
@dataclass
class QualityMetrics:
    """Complete quality metrics for a requirement.

    Aggregates per-dimension measurements (readability, complexity,
    ambiguity, testability, completeness, atomicity) plus the overall
    weighted score, letter grade, and generated feedback.
    """
    # Readability (textstat-based indices)
    flesch_reading_ease: float
    flesch_kincaid_grade: float
    automated_readability_index: float
    # Complexity
    word_count: int
    sentence_count: int
    avg_words_per_sentence: float
    avg_syllables_per_word: float
    nested_clause_count: int
    # Ambiguity
    ambiguous_word_count: int
    ambiguous_words: List[str]  # deduplicated weak/vague words found
    ambiguity_percentage: float  # ambiguous words as % of all words
    # Testability
    has_measurable_criteria: bool
    measurable_terms: List[str]
    testability_score: float
    # Completeness (sentence-structure components)
    has_subject: bool
    has_action: bool
    has_object: bool
    has_condition: bool
    completeness_score: float
    # Atomicity
    shall_count: int
    is_atomic: bool
    atomicity_score: float
    # Overall (weighted 0-100 score and letter grade)
    overall_score: float
    grade: str  # A, B, C, D, F
    issues: List[str] = field(default_factory=list)
    recommendations: List[str] = field(default_factory=list)
class NLPQualityAnalyzer:
"""Advanced NLP-based requirements quality analyzer."""
def __init__(self, model_name: str = 'en_core_web_lg'):
    """Load the spaCy pipeline and prepare word lists / regex patterns.

    Args:
        model_name: Name of an installed spaCy model.
    """
    self.nlp = spacy.load(model_name)
    # Ambiguous/weak words: hedges, modals and approximations.
    self.ambiguous_words = {
        'should', 'could', 'might', 'may', 'possibly', 'probably',
        'sometimes', 'often', 'usually', 'generally', 'typically',
        'mostly', 'perhaps', 'presumably', 'approximately', 'about',
        'around', 'nearly', 'almost', 'roughly', 'essentially'
    }
    # Vague quantifiers: amounts with no measurable definition.
    self.vague_quantifiers = {
        'some', 'several', 'many', 'few', 'various', 'numerous',
        'adequate', 'sufficient', 'appropriate', 'reasonable',
        'minimal', 'maximal', 'optimal', 'acceptable', 'significant'
    }
    # Measurable indicators: numbers with units, comparisons and ranges.
    # Matched case-insensitively (see _analyze_testability).
    self.measurable_patterns = [
        r'\d+\s*(ms|seconds?|minutes?|hours?|days?)',
        r'\d+\s*(bytes?|kb|mb|gb|tb)',
        r'\d+\s*(hz|khz|mhz|ghz)',
        r'\d+\s*(%|percent)',
        r'(less|more|greater|fewer)\s+than\s+\d+',
        r'(within|under|over|above|below)\s+\d+',
        r'(minimum|maximum|at least|at most|up to)\s+\d+',
        r'\d+\s*to\s*\d+',
    ]
def analyze(self, requirement_text: str) -> QualityMetrics:
    """Perform comprehensive quality analysis of one requirement.

    Runs each per-dimension analyzer, combines them into a weighted
    overall score/grade, and collects issues and recommendations.

    Args:
        requirement_text: Requirement text in natural language.

    Returns:
        Fully populated QualityMetrics.
    """
    doc = self.nlp(requirement_text)
    # Readability metrics (textstat-based indices).
    readability = self._analyze_readability(requirement_text)
    # Complexity metrics (length, sentence load, clause nesting).
    complexity = self._analyze_complexity(doc, requirement_text)
    # Ambiguity metrics (weak words / vague quantifiers).
    ambiguity = self._analyze_ambiguity(doc, requirement_text)
    # Testability metrics (measurable criteria, subjective terms).
    testability = self._analyze_testability(doc, requirement_text)
    # Completeness metrics (subject/action/object/condition).
    completeness = self._analyze_completeness(doc)
    # Atomicity metrics ('shall' count, sentence count).
    atomicity = self._analyze_atomicity(doc, requirement_text)
    # Weighted overall score and letter grade.
    overall = self._calculate_overall_score(
        readability, complexity, ambiguity,
        testability, completeness, atomicity
    )
    # Generate textual issues and recommendations.
    issues, recommendations = self._generate_feedback(
        readability, complexity, ambiguity,
        testability, completeness, atomicity
    )
    return QualityMetrics(
        # Readability
        flesch_reading_ease=readability['flesch_reading_ease'],
        flesch_kincaid_grade=readability['flesch_kincaid_grade'],
        automated_readability_index=readability['ari'],
        # Complexity
        word_count=complexity['word_count'],
        sentence_count=complexity['sentence_count'],
        avg_words_per_sentence=complexity['avg_words_per_sentence'],
        avg_syllables_per_word=complexity['avg_syllables_per_word'],
        nested_clause_count=complexity['nested_clauses'],
        # Ambiguity
        ambiguous_word_count=ambiguity['count'],
        ambiguous_words=ambiguity['words'],
        ambiguity_percentage=ambiguity['percentage'],
        # Testability
        has_measurable_criteria=testability['has_measurable'],
        measurable_terms=testability['terms'],
        testability_score=testability['score'],
        # Completeness
        has_subject=completeness['has_subject'],
        has_action=completeness['has_action'],
        has_object=completeness['has_object'],
        has_condition=completeness['has_condition'],
        completeness_score=completeness['score'],
        # Atomicity
        shall_count=atomicity['shall_count'],
        is_atomic=atomicity['is_atomic'],
        atomicity_score=atomicity['score'],
        # Overall
        overall_score=overall['score'],
        grade=overall['grade'],
        issues=issues,
        recommendations=recommendations
    )
def _analyze_readability(self, text: str) -> Dict:
    """Compute standard textstat readability indices for the text."""
    scorers = {
        'flesch_reading_ease': textstat.flesch_reading_ease,
        'flesch_kincaid_grade': textstat.flesch_kincaid_grade,
        'ari': textstat.automated_readability_index,
        'coleman_liau': textstat.coleman_liau_index,
        'reading_time': textstat.reading_time,
    }
    return {name: scorer(text) for name, scorer in scorers.items()}
def _analyze_complexity(self, doc, text: str) -> Dict:
    """Measure structural complexity: length, sentence load, syllables, nesting."""
    content_tokens = [tok for tok in doc if not (tok.is_punct or tok.is_space)]
    n_words = len(content_tokens)
    # Guard against zero sentences to keep averages well-defined.
    n_sentences = max(1, len(list(doc.sents)))

    # Syllable density from textstat.
    syllables = textstat.syllable_count(text)

    # Nested clauses: relative, adverbial, and (open) complement clauses.
    clause_deps = {'relcl', 'advcl', 'ccomp', 'xcomp'}
    n_nested = sum(tok.dep_ in clause_deps for tok in doc)

    return {
        'word_count': n_words,
        'sentence_count': n_sentences,
        'avg_words_per_sentence': round(n_words / n_sentences, 2),
        'avg_syllables_per_word': round(syllables / max(1, n_words), 2),
        'nested_clauses': n_nested
    }
def _analyze_ambiguity(self, doc, text: str) -> Dict:
"""Analyze text for ambiguity."""
text_lower = text.lower()
words = [t.text.lower() for t in doc if not t.is_punct]
# Find ambiguous words
found_ambiguous = []
for word in words:
if word in self.ambiguous_words or word in self.vague_quantifiers:
found_ambiguous.append(word)
count = len(found_ambiguous)
total_words = len(words)
percentage = (count / total_words * 100) if total_words > 0 else 0
return {
'count': count,
'words': list(set(found_ambiguous)),
'percentage': round(percentage, 2)
}
def _analyze_testability(self, doc, text: str) -> Dict:
"""Analyze testability of requirement."""
# Find measurable terms
measurable_terms = []
for pattern in self.measurable_patterns:
matches = re.findall(pattern, text, re.IGNORECASE)
measurable_terms.extend(matches)
has_measurable = len(measurable_terms) > 0
# Check for subjective terms
subjective_terms = {
'user-friendly', 'easy', 'simple', 'intuitive', 'fast',
'efficient', 'robust', 'flexible', 'scalable', 'reliable',
'secure', 'performant', 'responsive', 'stable', 'clean'
}
text_lower = text.lower()
has_subjective = any(term in text_lower for term in subjective_terms)
# Calculate score
base_score = 100
if not has_measurable:
base_score -= 30
if has_subjective:
base_score -= 20
# Check for specific verbs
testable_verbs = {'shall', 'must', 'will'}
has_testable_verb = any(t.text.lower() in testable_verbs for t in doc)
if not has_testable_verb:
base_score -= 15
return {
'has_measurable': has_measurable,
'terms': measurable_terms[:5],
'has_subjective': has_subjective,
'score': max(0, base_score)
}
def _analyze_completeness(self, doc) -> Dict:
"""Analyze completeness of requirement structure."""
has_subject = any(t.dep_ in ['nsubj', 'nsubjpass'] for t in doc)
has_action = any(t.pos_ == 'VERB' for t in doc)
has_object = any(t.dep_ in ['dobj', 'pobj', 'attr'] for t in doc)
has_condition = any(
t.dep_ == 'mark' and t.text.lower() in ['if', 'when', 'unless', 'while']
for t in doc
)
# Calculate score
components = [has_subject, has_action, has_object]
score = sum(1 for c in components if c) / len(components) * 100
return {
'has_subject': has_subject,
'has_action': has_action,
'has_object': has_object,
'has_condition': has_condition,
'score': round(score, 1)
}
def _analyze_atomicity(self, doc, text: str) -> Dict:
"""Analyze if requirement is atomic (single requirement)."""
text_lower = text.lower()
# Count 'shall' statements
shall_count = text_lower.count('shall')
# Check for multiple requirements indicators
sentences = list(doc.sents)
has_multiple_sentences = len(sentences) > 2
# Check for conjunctions splitting requirements
conjunction_splits = len(re.findall(
r'\bshall\b.*?\b(and|or)\b.*?\bshall\b',
text_lower
))
is_atomic = shall_count <= 1 and not has_multiple_sentences
# Calculate score
score = 100
if shall_count > 1:
score -= (shall_count - 1) * 25
if has_multiple_sentences:
score -= 15
if conjunction_splits > 0:
score -= conjunction_splits * 20
return {
'shall_count': shall_count,
'sentence_count': len(sentences),
'is_atomic': is_atomic,
'score': max(0, score)
}
def _calculate_overall_score(self, readability, complexity, ambiguity,
testability, completeness, atomicity) -> Dict:
"""Calculate overall quality score."""
# Weight factors
weights = {
'ambiguity': 0.25,
'testability': 0.25,
'completeness': 0.20,
'atomicity': 0.15,
'readability': 0.15
}
# Normalize readability (Flesch score 0-100)
readability_score = max(0, min(100, readability['flesch_reading_ease']))
# Ambiguity score (inverse - lower is better)
ambiguity_score = max(0, 100 - (ambiguity['percentage'] * 5))
# Calculate weighted score
score = (
ambiguity_score * weights['ambiguity'] +
testability['score'] * weights['testability'] +
completeness['score'] * weights['completeness'] +
atomicity['score'] * weights['atomicity'] +
readability_score * weights['readability']
)
# Determine grade
if score >= 90:
grade = 'A'
elif score >= 80:
grade = 'B'
elif score >= 70:
grade = 'C'
elif score >= 60:
grade = 'D'
else:
grade = 'F'
return {
'score': round(score, 1),
'grade': grade
}
def _generate_feedback(self, readability, complexity, ambiguity,
testability, completeness, atomicity) -> Tuple[List, List]:
"""Generate issues and recommendations."""
issues = []
recommendations = []
# Ambiguity issues
if ambiguity['count'] > 0:
issues.append(f"Contains {ambiguity['count']} ambiguous words: "
f"{', '.join(ambiguity['words'][:3])}")
recommendations.append("Replace ambiguous words with precise terms")
# Testability issues
if not testability['has_measurable']:
issues.append("No measurable criteria found")
recommendations.append("Add specific numeric values with units")
if testability.get('has_subjective', False):
issues.append("Contains subjective terms that are not testable")
recommendations.append("Define objective criteria for subjective terms")
# Completeness issues
if not completeness['has_subject']:
issues.append("Missing clear subject (what/who)")
recommendations.append("Specify the system or component name")
if not completeness['has_action']:
issues.append("Missing action (verb)")
recommendations.append("Add a clear action verb")
# Atomicity issues
if atomicity['shall_count'] > 1:
issues.append(f"Multiple requirements ({atomicity['shall_count']} 'shall' statements)")
recommendations.append("Split into separate atomic requirements")
# Readability issues
if readability['flesch_reading_ease'] < 30:
issues.append("Text is very difficult to read")
recommendations.append("Simplify sentence structure and word choice")
# Complexity issues
if complexity['avg_words_per_sentence'] > 25:
issues.append(f"Long sentences (avg {complexity['avg_words_per_sentence']} words)")
recommendations.append("Break long sentences into shorter ones")
return issues, recommendations
def analyze_batch(self, requirements: List[Dict]) -> Dict:
    """Analyze a batch of requirements and generate summary statistics.

    Args:
        requirements: List of dicts, each with at least 'id' and 'text' keys.

    Returns:
        Dict with totals, average score, grade distribution, pass rate
        (share of A/B grades), the five most frequent issues, and the
        per-requirement results.
    """
    # Imported locally: the module header does not import Counter, so the
    # original code raised NameError on first call.
    from collections import Counter

    results = []
    grade_counts = Counter()
    for req in requirements:
        metrics = self.analyze(req['text'])
        results.append({
            'id': req['id'],
            'text': req['text'][:100],  # truncate for compact reporting
            'metrics': metrics
        })
        grade_counts[metrics.grade] += 1

    # Summary statistics (guarded against an empty batch).
    total = len(results)
    avg_score = sum(r['metrics'].overall_score for r in results) / total if total > 0 else 0

    # Aggregate the five most common issues across all requirements.
    all_issues = []
    for r in results:
        all_issues.extend(r['metrics'].issues)
    issue_frequency = Counter(all_issues).most_common(5)

    return {
        'total_requirements': total,
        'average_score': round(avg_score, 1),
        'grade_distribution': dict(grade_counts),
        'pass_rate': round((grade_counts['A'] + grade_counts['B']) / total * 100, 1) if total > 0 else 0,
        'top_issues': issue_frequency,
        'results': results
    }
# Quality Dashboard Generator
class QualityDashboard:
    """Generate quality dashboards and reports from batch analysis results."""

    def __init__(self, analyzer: "RequirementsQualityAnalyzer"):
        """Store the analyzer whose batch results will be reported.

        Fix: the annotation previously referenced ``NLPQualityAnalyzer``,
        which is not defined anywhere in this module (the analyzer class
        defined above is ``RequirementsQualityAnalyzer``); evaluating that
        annotation raised NameError at class-creation time. A string
        forward reference keeps it lazy and correct.
        """
        self.analyzer = analyzer

    def generate_html_report(self, analysis_results: Dict) -> str:
        """Generate a self-contained HTML quality report.

        Args:
            analysis_results: Output of ``analyze_batch`` (summary stats
                plus per-requirement metrics objects).

        Returns:
            A complete HTML document as a string.

        NOTE(review): requirement IDs/issues are interpolated without HTML
        escaping; if requirement sources are untrusted, escape them first.
        """
        html = f"""
<!DOCTYPE html>
<html>
<head>
<title>Requirements Quality Report</title>
<style>
body {{ font-family: Arial, sans-serif; margin: 20px; }}
.summary {{ background: #f5f5f5; padding: 20px; border-radius: 5px; }}
.grade-A {{ color: #4CAF50; }}
.grade-B {{ color: #8BC34A; }}
.grade-C {{ color: #FFC107; }}
.grade-D {{ color: #FF9800; }}
.grade-F {{ color: #F44336; }}
table {{ border-collapse: collapse; width: 100%; margin-top: 20px; }}
th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
th {{ background: #4CAF50; color: white; }}
.issue {{ color: #F44336; }}
.recommendation {{ color: #2196F3; }}
</style>
</head>
<body>
<h1>Requirements Quality Report</h1>
<div class="summary">
<h2>Summary</h2>
<p><strong>Total Requirements:</strong> {analysis_results['total_requirements']}</p>
<p><strong>Average Score:</strong> {analysis_results['average_score']}%</p>
<p><strong>Pass Rate (A/B):</strong> {analysis_results['pass_rate']}%</p>
<h3>Grade Distribution</h3>
<ul>
"""
        # One colored list item per grade, in alphabetical order.
        for grade, count in sorted(analysis_results['grade_distribution'].items()):
            html += f'<li class="grade-{grade}">Grade {grade}: {count}</li>\n'
        html += """
</ul>
<h3>Top Issues</h3>
<ol>
"""
        for issue, count in analysis_results['top_issues']:
            html += f'<li class="issue">{issue} ({count} occurrences)</li>\n'
        html += """
</ol>
</div>
<h2>Detailed Results</h2>
<table>
<tr>
<th>ID</th>
<th>Score</th>
<th>Grade</th>
<th>Issues</th>
<th>Recommendations</th>
</tr>
"""
        # One table row per requirement; show at most two issues/recommendations.
        for result in analysis_results['results']:
            metrics = result['metrics']
            issues_html = '<br>'.join(metrics.issues[:2]) if metrics.issues else 'None'
            recs_html = '<br>'.join(metrics.recommendations[:2]) if metrics.recommendations else 'None'
            html += f"""
<tr>
<td>{result['id']}</td>
<td>{metrics.overall_score}%</td>
<td class="grade-{metrics.grade}">{metrics.grade}</td>
<td class="issue">{issues_html}</td>
<td class="recommendation">{recs_html}</td>
</tr>
"""
        html += """
</table>
</body>
</html>
"""
        return html

    def generate_json_report(self, analysis_results: Dict) -> Dict:
        """Generate a JSON-serializable quality report for API consumption.

        Args:
            analysis_results: Output of ``analyze_batch``.

        Returns:
            Dict with 'summary', 'top_issues', and per-requirement entries.
        """
        return {
            'summary': {
                'total': analysis_results['total_requirements'],
                'average_score': analysis_results['average_score'],
                'pass_rate': analysis_results['pass_rate'],
                'grades': analysis_results['grade_distribution']
            },
            'top_issues': [
                {'issue': issue, 'count': count}
                for issue, count in analysis_results['top_issues']
            ],
            'requirements': [
                {
                    'id': r['id'],
                    'score': r['metrics'].overall_score,
                    'grade': r['metrics'].grade,
                    'testability': r['metrics'].testability_score,
                    'completeness': r['metrics'].completeness_score,
                    'atomicity': r['metrics'].atomicity_score,
                    'issues': r['metrics'].issues,
                    'recommendations': r['metrics'].recommendations
                }
                for r in analysis_results['results']
            ]
        }
# Usage
if __name__ == '__main__':
    # Fix: the analyzer class defined in this module is
    # RequirementsQualityAnalyzer; the previous name (NLPQualityAnalyzer)
    # does not exist and raised NameError on launch.
    analyzer = RequirementsQualityAnalyzer()

    # Sample requirements spanning good, vague, compound, and ambiguous cases.
    requirements = [
        {'id': 'SWE-001', 'text': 'The door lock controller shall respond to lock/unlock commands within 100ms under normal operating conditions.'},
        {'id': 'SWE-002', 'text': 'The system should be fast and user-friendly.'},
        {'id': 'SWE-003', 'text': 'The controller shall lock and shall unlock and shall provide status.'},
        {'id': 'SWE-004', 'text': 'Response time approximately 50ms maybe.'},
    ]

    # Analyze the batch and print the headline metrics.
    results = analyzer.analyze_batch(requirements)
    print(f"Average Score: {results['average_score']}%")
    print(f"Pass Rate: {results['pass_rate']}%")
    print(f"Grade Distribution: {results['grade_distribution']}")

    # Render the HTML report to disk with an explicit encoding.
    dashboard = QualityDashboard(analyzer)
    html_report = dashboard.generate_html_report(results)
    with open('quality_report.html', 'w', encoding='utf-8') as f:
        f.write(html_report)
    print("Report generated: quality_report.html")
Quality Trends Dashboard
"""
Requirements Quality Trends Dashboard
Track quality metrics over time
"""
from datetime import datetime, timedelta
from typing import List, Dict
import json
class QualityTrendTracker:
    """Track requirements-quality trends over time, persisted as JSON."""

    def __init__(self, storage_path: str = 'quality_history.json'):
        """Load any existing history from ``storage_path``."""
        self.storage_path = storage_path
        self.history = self._load_history()

    def _load_history(self) -> List[Dict]:
        """Load historical data; an absent file means an empty history."""
        try:
            with open(self.storage_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except FileNotFoundError:
            return []

    def _save_history(self):
        """Persist the full history to disk."""
        with open(self.storage_path, 'w', encoding='utf-8') as f:
            # default=str covers non-JSON-native values such as datetimes.
            json.dump(self.history, f, default=str)

    def record_analysis(self, project: str, analysis_results: Dict):
        """Append one analysis snapshot for *project* and save immediately.

        Args:
            project: Project identifier used later for trend filtering.
            analysis_results: Output of ``analyze_batch``.
        """
        record = {
            'timestamp': datetime.now().isoformat(),
            'project': project,
            'total': analysis_results['total_requirements'],
            'average_score': analysis_results['average_score'],
            'pass_rate': analysis_results['pass_rate'],
            'grades': analysis_results['grade_distribution'],
            'top_issues': [
                {'issue': i, 'count': c}
                for i, c in analysis_results['top_issues']
            ]
        }
        self.history.append(record)
        self._save_history()

    @staticmethod
    def _direction(change: float) -> str:
        """Classify a metric delta.

        Fix: an unchanged metric is now reported as 'stable'; the original
        labeled a zero change as 'declining'.
        """
        if change > 0:
            return 'improving'
        if change < 0:
            return 'declining'
        return 'stable'

    def get_trends(self, project: str, days: int = 30) -> Dict:
        """Get quality trends for a project over the last *days* days.

        Returns an error dict when no snapshots fall inside the window.
        """
        cutoff = datetime.now() - timedelta(days=days)
        relevant = [
            h for h in self.history
            if h['project'] == project and
            datetime.fromisoformat(h['timestamp']) > cutoff
        ]
        if not relevant:
            return {'error': 'No data available'}

        # Trend direction compares only the first and last snapshots.
        scores = [h['average_score'] for h in relevant]
        pass_rates = [h['pass_rate'] for h in relevant]

        return {
            'period': f'{days} days',
            'data_points': len(relevant),
            'score_trend': {
                'direction': self._direction(scores[-1] - scores[0]),
                'start': scores[0],
                'end': scores[-1],
                'change': round(scores[-1] - scores[0], 2)
            },
            'pass_rate_trend': {
                'direction': self._direction(pass_rates[-1] - pass_rates[0]),
                'start': pass_rates[0],
                'end': pass_rates[-1],
                'change': round(pass_rates[-1] - pass_rates[0], 2)
            },
            'history': relevant
        }

    def get_issue_trends(self, project: str, days: int = 30) -> Dict:
        """Track which recorded issues are improving or getting worse.

        Compares only the first and last snapshots within the window;
        at least two snapshots are required.
        """
        cutoff = datetime.now() - timedelta(days=days)
        relevant = [
            h for h in self.history
            if h['project'] == project and
            datetime.fromisoformat(h['timestamp']) > cutoff
        ]
        if len(relevant) < 2:
            return {'error': 'Insufficient data'}

        # Compare issue counts between the first and last snapshots.
        first_issues = {i['issue']: i['count'] for i in relevant[0]['top_issues']}
        last_issues = {i['issue']: i['count'] for i in relevant[-1]['top_issues']}
        all_issues = set(first_issues.keys()) | set(last_issues.keys())

        trends = []
        for issue in all_issues:
            first_count = first_issues.get(issue, 0)
            last_count = last_issues.get(issue, 0)
            change = last_count - first_count
            trends.append({
                'issue': issue,
                'first_count': first_count,
                'last_count': last_count,
                'change': change,
                # Fewer occurrences of an issue over time is an improvement.
                'direction': 'improving' if change < 0 else 'worsening' if change > 0 else 'stable'
            })
        trends.sort(key=lambda x: x['change'])

        return {
            'improving': [t for t in trends if t['direction'] == 'improving'],
            'worsening': [t for t in trends if t['direction'] == 'worsening'],
            'stable': [t for t in trends if t['direction'] == 'stable']
        }
CI/CD Integration
# Quality gate configuration
# Thresholds consumed by scripts/check_quality_gate.py in the CI workflow
# below. Values are illustrative defaults — calibrate per organization.
quality_gates:
  requirements:
    minimum_score: 75          # minimum acceptable average quality score (%)
    minimum_pass_rate: 85      # minimum share of A/B-graded requirements (%)
    maximum_high_issues: 0     # no high-severity issues allowed
    required_grades: ["A", "B", "C"]   # grades considered acceptable
    blocked_grades: ["F"]              # any F-graded requirement fails the gate
# GitHub Actions workflow
# Runs NLP quality analysis on requirements changes and enforces the gate.
name: Requirements Quality Gate
on:
  push:
    paths:
      - 'requirements/**'
  # Fix: the "Comment on PR" step below is guarded by
  # `github.event_name == 'pull_request'`, so a push-only trigger meant
  # it could never run. Trigger on pull requests as well.
  pull_request:
    paths:
      - 'requirements/**'
jobs:
  quality-analysis:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Setup Python
        uses: actions/setup-python@v5  # v5 is the current major release
        with:
          python-version: '3.11'
      - name: Install dependencies
        run: |
          pip install spacy textstat
          python -m spacy download en_core_web_lg
      - name: Run Quality Analysis
        run: |
          python scripts/nlp_quality_analysis.py \
            --input requirements/ \
            --output quality_report.json \
            --html quality_report.html
      - name: Check Quality Gate
        run: |
          python scripts/check_quality_gate.py \
            --report quality_report.json \
            --config quality_gates.yaml
      - name: Upload Reports
        # Fix: upload-artifact@v3 is deprecated and disabled by GitHub; v4 required.
        uses: actions/upload-artifact@v4
        with:
          name: quality-reports
          path: |
            quality_report.json
            quality_report.html
      - name: Comment on PR
        if: github.event_name == 'pull_request'
        uses: actions/github-script@v7  # v7 runs on Node 20 (v6/Node 16 is EOL)
        with:
          script: |
            const fs = require('fs');
            const report = JSON.parse(fs.readFileSync('quality_report.json'));
            const body = `## Requirements Quality Analysis
            | Metric | Value | Status |
            |--------|-------|--------|
            | Average Score | ${report.summary.average_score}% | ${report.summary.average_score >= 75 ? 'PASS' : 'FAIL'} |
            | Pass Rate | ${report.summary.pass_rate}% | ${report.summary.pass_rate >= 85 ? 'PASS' : 'FAIL'} |
            | Grade A | ${report.summary.grades.A || 0} | |
            | Grade B | ${report.summary.grades.B || 0} | |
            | Grade F | ${report.summary.grades.F || 0} | ${(report.summary.grades.F || 0) === 0 ? 'PASS' : 'FAIL'} |
            `;
            github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: body
            });
Summary
NLP Quality Analysis:
- Comprehensive Metrics: Readability, complexity, ambiguity, testability
- Multi-Dimensional Scoring: Weighted overall score with grades
- Automated Feedback: Issues and recommendations
- Trend Tracking: Historical quality monitoring
- CI/CD Integration: Automated quality gates
Chapter Summary
AI Requirements Analysis:
- NLP Analysis: Ambiguity detection, quality scoring, readability metrics
- Quality Metrics: Multi-dimensional assessment (completeness, testability, atomicity)
- Automated Feedback: Issues identification and improvement recommendations
- Trend Tracking: Historical quality monitoring and improvement trends
- CI/CD Integration: Automated quality gates and continuous validation