GLiNER
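A minimal PyTorch sketch of address parsing: a BERT token-classification model is fine-tuned with BIO labels for street, house, and suite spans, then used to pull those components out of a free-form address string.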
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification


class AddressGLiNER:
    def __init__(self, model_name="bert-base-uncased"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForTokenClassification.from_pretrained(
            model_name,
            num_labels=7  # O, B-STREET, I-STREET, B-HOUSE, I-HOUSE, B-SUITE, I-SUITE
        )
        # Define label mappings
        self.id2label = {
            0: "O",
            1: "B-STREET",
            2: "I-STREET",
            3: "B-HOUSE",
            4: "I-HOUSE",
            5: "B-SUITE",
            6: "I-SUITE"
        }
        self.label2id = {v: k for k, v in self.id2label.items()}
    def train(self, train_data, epochs=5):
        """
        Train the GLiNER model on address data

        Args:
            train_data: List of tuples (text, labels), with one BIO label
                per whitespace-separated word in text
            epochs: Number of training epochs
        """
        self.model.train()
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=2e-5)
        for epoch in range(epochs):
            total_loss = 0
            for text, labels in train_data:
                # Tokenize word by word so the word-level labels can be
                # aligned with the subword tokens BERT produces
                inputs = self.tokenizer(
                    text.split(),
                    is_split_into_words=True,
                    return_tensors="pt",
                    truncation=True
                )
                # Align labels with subwords: special tokens ([CLS], [SEP])
                # get -100 so the loss ignores them; the first subword of a
                # word keeps its label, continuation subwords get the I- variant
                label_ids, previous_word = [], None
                for word_id in inputs.word_ids():
                    if word_id is None:
                        label_ids.append(-100)
                    elif word_id == previous_word:
                        label_ids.append(self.label2id[labels[word_id].replace("B-", "I-")])
                    else:
                        label_ids.append(self.label2id[labels[word_id]])
                    previous_word = word_id
                label_ids = torch.tensor([label_ids])
                # Forward pass
                outputs = self.model(**inputs, labels=label_ids)
                loss = outputs.loss
                # Backward pass
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                total_loss += loss.item()
            print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_data):.4f}")
    def predict(self, text):
        """
        Predict address components from input text

        Args:
            text: Input address string
        Returns:
            Dictionary containing extracted components
        """
        self.model.eval()
        # Tokenize input
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True
        )
        # Get predictions
        with torch.no_grad():
            outputs = self.model(**inputs)
        predictions = outputs.logits.argmax(-1)[0]
        # Convert predictions to labels
        predicted_labels = [self.id2label[p.item()] for p in predictions]
        # Extract components
        tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        components = {
            "street_name": "",
            "house_number": "",
            "suite_number": ""
        }
        current_label = None
        current_tokens = []
        for token, label in zip(tokens, predicted_labels):
            # Skip special tokens such as [CLS] and [SEP]
            if token in self.tokenizer.all_special_tokens:
                continue
            if label.startswith("B-"):
                if current_label:
                    self._add_component(components, current_label, current_tokens)
                current_label = label[2:]
                current_tokens = [token]
            elif label.startswith("I-") and current_label:
                current_tokens.append(token)
            elif label == "O" and current_label:
                self._add_component(components, current_label, current_tokens)
                current_label = None
                current_tokens = []
        if current_label:
            self._add_component(components, current_label, current_tokens)
        return components
    def _add_component(self, components, label, tokens):
        """Helper method to add an extracted span to the result dictionary"""
        # convert_tokens_to_string merges '##' subword pieces back into words
        text = self.tokenizer.convert_tokens_to_string(tokens).strip()
        if label == "STREET":
            components["street_name"] = text
        elif label == "HOUSE":
            components["house_number"] = text
        elif label == "SUITE":
            components["suite_number"] = text


# Example usage
def main():
    # Initialize model
    gliner = AddressGLiNER()
    # Example training data: one BIO label per whitespace-separated word
    train_data = [
        (
            "123 Main Street Suite 45",
            ["B-HOUSE", "B-STREET", "I-STREET", "B-SUITE", "I-SUITE"]
        ),
        # Add more training examples here
    ]
    # Train model
    gliner.train(train_data)
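    # Optional step, not in the original listing: persist the fine-tuned
    # weights with the standard Hugging Face save_pretrained API; the
    # directory name "address-gliner" is just an illustration
    gliner.model.save_pretrained("address-gliner")
    gliner.tokenizer.save_pretrained("address-gliner")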
    # Example prediction
    address = "456 Oak Avenue Suite 789"
    result = gliner.predict(address)
    print("\nExtracted components:")
    print(f"Street Name: {result['street_name']}")
    print(f"House Number: {result['house_number']}")
    print(f"Suite Number: {result['suite_number']}")

if __name__ == "__main__":
    main()
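
Once training has run, the same predict call applies to any number of addresses. A minimal usage sketch (the addresses below are made-up examples, and a trained gliner instance is assumed):

# assumes gliner is a trained AddressGLiNER instance
addresses = ["10 Elm Street", "77 Birch Boulevard Suite 3"]
for addr in addresses:
    print(addr, "->", gliner.predict(addr))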