Source code for dk.identifiers.navn

# -*- coding: utf-8 -*-

"""Operations on names: shortening, normalizing, ...

   TODO: This module should be connected to the names database.

"""
import sys

import re


class Name2Long(ValueError):
    """Signal that a name is too long and cannot be shortened any further.

    Derives from ``ValueError`` so callers that catch ``ValueError`` also
    catch this exception.
    """
# Matches a surname whose beginning already mixes cases (e.g. "McDonald",
# "DeVries"): one upper-case letter, one or more lower-case letters, then
# another upper-case letter.  The pattern contains no backslashes, so a plain
# ``u''`` literal is equivalent to the former ``ur''`` literal while also being
# valid syntax on Python 3 (where the ``ur`` prefix is a SyntaxError).
_mixedcase = re.compile(u'[A-ZÆØÅ][a-zæøå]+[A-ZÆØÅ]')

# Splits a surname, case-insensitively, into a leading Mc/Mac/Van/Von prefix
# and whatever follows it.
# NOTE(review): the prefix is not anchored to a word boundary, so a name such
# as "vance" is split into "van" + "ce" and normalized to "VanCe".
_mcmac = re.compile(
    u'(?P<prefix>(Mc)|(Mac)|(Van)|(Von))(?P<sfx>.*)',
    re.IGNORECASE)


def normalize(fornavn, etternavn):
    """Normalize double spaces and title case.

    Runs of whitespace in both names are collapsed to single spaces (leading
    and trailing whitespace is dropped).  The first name is always
    title-cased.  The last name is:

    * left untouched if it already starts with a mixed-case word (upper,
      lower..., upper), on the assumption that the caller knows what they
      are doing;
    * otherwise, if it starts with Mc/Mac/Van/Von (any case), title-cased in
      two parts so that the letter following the prefix is capitalized
      ("mcdonald" -> "McDonald");
    * otherwise simply title-cased.

    Returns:
        A ``(fornavn, etternavn)`` tuple of normalized unicode strings.
    """
    fname = u' '.join(fornavn.split()).title()
    lname = u' '.join(etternavn.split())

    # If already mixed case, assume they know what they're doing.
    if _mixedcase.match(lname):
        return fname, lname

    m = _mcmac.match(lname)
    if m is None:
        return fname, lname.title()

    # Title-case prefix and suffix separately so both get a capital letter.
    return fname, m.group('prefix').title() + m.group('sfx').title()
def forkort_navn(lengde, fornavn, etternavn):
    """Shorten a name and return it as a single "first last" string.

    Delegates to :func:`shorten` and glues the two resulting parts together
    with one space.  Any exception raised by :func:`shorten` propagates.
    """
    parts = shorten(lengde, fornavn, etternavn)
    return parts[0] + u' ' + parts[1]
def shorten_fname(length, fname):
    """Try to shorten the first-name `fname` to at most `length` characters.

    Algorithm: while the name is too long, take the right-most of the
    person's first names that is still longer than one character (the very
    first name is never touched) and replace it with its initial.

    A name that already fits is returned unchanged, whitespace included.

    Raises:
        Name2Long: (a ``ValueError``) if the name is still too long once
            every name but the first has been reduced to an initial.
    """
    current = fname
    while len(current) > length:
        words = current.split()
        # Index 0 is deliberately excluded: the first given name is kept.
        long_words = [pos for pos in range(1, len(words))
                      if len(words[pos]) > 1]
        if not long_words:
            raise Name2Long(
                '%r kan ikke forkortes til %d bokstaver' % (current, length))
        pos = long_words[-1]
        words[pos] = words[pos][0]
        current = ' '.join(words)
    return current
def shorten_lname(length, lname):
    """Try to shorten the last-name `lname` to at most `length` characters.

    Algorithm: while the name is too long, take the left-most of the
    person's last names that is still longer than one character (the final
    last name is never touched) and replace it with its initial.

    A name that already fits is returned unchanged, whitespace included.

    Raises:
        Name2Long: (a ``ValueError``) if the name is still too long once
            every name but the last has been reduced to an initial.
    """
    shortened = lname
    while len(shortened) > length:
        words = shortened.split()
        # ``words[:-1]`` leaves out the final surname, which is always kept.
        candidates = [pos for pos, word in enumerate(words[:-1])
                      if len(word) > 1]
        if not candidates:
            raise Name2Long(
                '%r kan ikke forkortes til %d bokstaver' % (shortened, length))
        pos = candidates[0]
        words[pos] = words[pos][0]
        shortened = ' '.join(words)
    return shortened
def shorten(lengde, fornavn, etternavn):
    """Shorten a full name so that "first last" fits in `lengde` characters.

    First names are abbreviated first: right-most first, never the very
    first one.  Once no first name can be abbreviated any further, last
    names are abbreviated: left-most first, never the final one.  One name
    is reduced to its initial per step, and the combined length
    (first + one space + last) is re-checked after every step.

    Returns:
        Tuple ``(fornavn, etternavn)``; both parts are lower-cased and then
        title-cased before being returned.

    Raises:
        Name2Long: (a ``ValueError``) if nothing more can be abbreviated
            and the name is still too long.
    """
    first, last = fornavn, etternavn

    while len(first) + len(last) + 1 > lengde:
        given = first.split()
        long_given = [k for k in range(1, len(given)) if len(given[k]) > 1]
        if long_given:
            k = long_given[-1]
            given[k] = given[k][0]
            first = ' '.join(given)
            continue

        family = last.split()
        long_family = [k for k, word in enumerate(family[:-1])
                       if len(word) > 1]
        if long_family:
            k = long_family[0]
            family[k] = family[k][0]
            last = ' '.join(family)
            continue

        raise Name2Long('%r %r kan ikke forkortes til %d bokstaver' % (
            first, last, lengde))

    return (first.lower().title(), last.lower().title())
def combine(fornavn, etternavn):
    """Combine `fornavn` and `etternavn` with a space in-between.

    Removes any double spaces and fixes capitalization by delegating to
    :func:`normalize`.

    The normalized parts are joined directly.  Routing them through
    :func:`shorten` (with an effectively unlimited length) would apply
    ``.lower().title()`` once more and thereby undo the mixed-case handling
    done by :func:`normalize` (e.g. turn "McDonald" back into "Mcdonald");
    it would also require ``sys.maxint``, which does not exist on Python 3.

    Returns:
        The full name as a single unicode string.
    """
    return u' '.join(normalize(fornavn, etternavn))
# Old code that works on UTF-8 encoded byte strings (should be removed):
def normalize2uni(navn_u8):
    """Normalized := Unicode And Title Case.

    Args:
        navn_u8: the name, either as a UTF-8 encoded byte string or as an
            already decoded unicode string.

    Returns:
        The name as unicode text, lower-cased and then title-cased.
    """
    # ``type(u'')`` is ``unicode`` on Python 2 and ``str`` on Python 3, so
    # this works on both.  ``isinstance`` (rather than an exact type
    # comparison) also keeps subclasses of the text type from being sent
    # through ``.decode``.
    if not isinstance(navn_u8, type(u'')):
        navn_u8 = navn_u8.decode('u8')
    return navn_u8.lower().title()
def normalize2u8(navn):
    """Normalized := Utf-8 And Title Case.

    Args:
        navn: the name, either as a UTF-8 encoded byte string or as an
            already decoded unicode string.

    Returns:
        The name lower-cased and then title-cased, encoded as UTF-8 bytes.
    """
    # ``type(u'')`` is ``unicode`` on Python 2 and ``str`` on Python 3, so
    # this works on both.  ``isinstance`` (rather than an exact type
    # comparison) also keeps subclasses of the text type from being sent
    # through ``.decode``.
    if not isinstance(navn, type(u'')):
        navn = navn.decode('u8')
    normcase = navn.lower().title()
    return normcase.encode('u8')
def shorten_fname_u8(length, fname):
    """UTF-8 front end for :func:`shorten_fname`.

    `fname` is normalized with :func:`normalize2uni`, shortened with
    :func:`shorten_fname`, and the result is passed through
    :func:`normalize2u8` before being returned.
    """
    as_unicode = normalize2uni(fname)
    shortened = shorten_fname(length, as_unicode)
    return normalize2u8(shortened)
def shorten_lname_u8(length, lname):
    """UTF-8 front end for :func:`shorten_lname`.

    `lname` is normalized with :func:`normalize2uni`, shortened with
    :func:`shorten_lname`, and the result is passed through
    :func:`normalize2u8` before being returned.
    """
    as_unicode = normalize2uni(lname)
    shortened = shorten_lname(length, as_unicode)
    return normalize2u8(shortened)
def forkort_navn_u8(lengde, fornavn_u8, etternavn_u8):
    """UTF-8 front end for :func:`forkort_navn`.

    Both names are normalized with :func:`normalize2uni`, shortened and
    joined by :func:`forkort_navn`, and the resulting full name is passed
    through :func:`normalize2u8` before being returned.
    """
    return normalize2u8(
        forkort_navn(
            lengde,
            normalize2uni(fornavn_u8),
            normalize2uni(etternavn_u8),
        )
    )
def shorten_u8(lengde, fornavn, etternavn):
    """UTF-8 front end for :func:`shorten`.

    Both names are normalized with :func:`normalize2uni` and shortened with
    :func:`shorten`; each part of the result is then passed through
    :func:`normalize2u8`.

    Returns:
        Tuple ``(fornavn, etternavn)``.
    """
    short_first, short_last = shorten(
        lengde, normalize2uni(fornavn), normalize2uni(etternavn))
    return normalize2u8(short_first), normalize2u8(short_last)
# -- end old code ---.
def test_module():
    """Run the doctests below via ``doctest.testmod()``.

    The expected outputs are written for Python 2 (``u''`` reprs).

    >>> forkort_navn(25, 'Bjorn Steinar', 'Fjeld Pettersen')
    u'Bjorn S Fjeld Pettersen'
    >>> forkort_navn(22, 'Bjorn Steinar', 'Fjeld Pettersen')
    u'Bjorn S F Pettersen'

    A name that cannot be shortened enough raises ``Name2Long`` (a
    ``ValueError`` subclass); the message shows both name parts as reprs.

    >>> forkort_navn(18, 'Bjorn Steinar', 'Fjeld Pettersen')
    Traceback (most recent call last):
        ...
    Name2Long: 'Bjorn S' 'F Pettersen' kan ikke forkortes til 18 bokstaver

    >>> v = forkort_navn(25, u'Bjørn Øystein', u'Fjeld Pettersen')
    >>> v == u'Bjørn Ø Fjeld Pettersen'
    True

    >>> v = forkort_navn(21, u'Bjørn Øystein', u'Ødal Pettersen')
    >>> v == u'Bjørn Ø Ø Pettersen'
    True

    >>> v = forkort_navn_u8(21, u'Bjørn Øystein'.encode('u8'), u'Ødal Pettersen'.encode('u8'))
    >>> v == u'Bjørn Ø Ø Pettersen'.encode('u8')
    True

    Normalization of names to UTF-8 with the initial capital as the only
    capital letter.

    >>> normalize2u8(u'Bjørn') == u'Bjørn'.encode('u8')
    True
    >>> normalize2u8(u'BJørn') == u'Bjørn'.encode('u8')
    True
    >>> normalize2u8(u'BJØrn'.encode('u8')) == u'Bjørn'.encode('u8')
    True
    >>> normalize2u8(u'geir-arne') == 'Geir-Arne'
    True

    """
    # Imported locally so that merely importing this module does not pull in
    # doctest.
    import doctest
    doctest.testmod()
# Run the doctests in test_module() when this file is executed as a script.
if __name__ == "__main__": test_module()