vcf_cleaner/vcf_cleaner.py

83 lines
3.2 KiB
Python
Raw Normal View History

2025-05-07 14:43:55 +02:00
import os
import sys
2025-05-07 22:15:09 +02:00
import quopri
2025-05-07 14:43:55 +02:00
import base64
2025-05-07 09:01:40 +02:00
import pickle
import phonenumbers
2025-05-07 09:01:40 +02:00
from pythonvCard4.vcard import Contact
2025-05-08 09:12:14 +02:00
# input should be a valid .vcf file, output in current dir

# When True, every picture of a contact holding several PHOTO entries is also
# dumped under multiple_pictures/<full name>/ for a manual check.
# The exported card uses the first picture either way.
check_for_multiple_pictures = False


def decode_quoted_printable(text):
    """Return *text* with its quoted-printable escapes decoded.

    The input file is read as latin-1, so encoding a ``str`` back to latin-1
    restores the bytes that were on disk. Working on bytes also lets values
    containing non-ASCII characters through; ``quopri.decodestring`` raises
    ``ValueError`` when handed such a ``str`` directly.

    The decoded bytes are interpreted as UTF-8, falling back to latin-1 when
    they are not valid UTF-8, so one oddly encoded card does not abort the
    whole run.
    """
    if isinstance(text, str):
        text = text.encode("latin-1")
    raw = quopri.decodestring(text)
    try:
        return raw.decode("utf-8")
    except UnicodeDecodeError:
        return raw.decode("latin-1")


def write_picture(encoded, path):
    """Decode the base64 string *encoded* and write the bytes to *path*."""
    with open(path, "wb") as f:
        f.write(base64.decodebytes(encoded.encode()))


def export_pictures(contact):
    """Write the contact's embedded PHOTO entries to disk.

    The first picture becomes pictures/<full name>/profile.jpg and is
    referenced through ``contact.photo_path``. Contacts without a PHOTO entry
    are left untouched.
    """
    if "PHOTO" not in contact.custom:
        return
    photos = contact.custom["PHOTO"]

    # NOTE(review): contact.fn comes straight from the input file and is used
    # as a directory name; a name containing ".." or "/" changes where the
    # pictures land. Sanitise it if the input is not trusted.

    # TODO : isolate contacts with multiple pictures
    if check_for_multiple_pictures and len(photos) > 1:
        review_dir = "multiple_pictures/" + contact.fn
        os.makedirs(review_dir, exist_ok=True)
        for index, encoded in enumerate(photos):
            print(encoded)
            # write into the directory created just above
            write_picture(encoded, f"{review_dir}/{index}.jpg")

    # choose the first picture and ignore multiples
    profile_dir = "pictures/" + contact.fn
    os.makedirs(profile_dir, exist_ok=True)
    profile_path = profile_dir + "/profile.jpg"
    write_picture(photos[0], profile_path)
    contact.photo_path = profile_path

    # The picture now lives on disk, so the inline copy is dropped. This
    # clears every custom property, not only PHOTO.
    # NOTE(review): confirm whether custom properties should also be cleared
    # for contacts that have no picture.
    contact.custom = {}


def normalise_numbers(entries, region="FR"):
    """Return the phone entries reformatted to international notation.

    Each entry is a dict whose "value" holds the number. Dashes are removed
    before parsing, numbers that ``phonenumbers`` cannot parse are dropped,
    and duplicates are removed while keeping the first occurrence. Every
    returned entry carries an empty "type" list.

    *region* is the default region handed to ``phonenumbers.parse`` for
    numbers written without a country code.
    """
    cleaned = []
    for entry in entries:
        try:
            parsed = phonenumbers.parse(entry["value"].replace("-", ""), region=region)
        except phonenumbers.phonenumberutil.NumberParseException:
            continue
        formatted = phonenumbers.format_number(
            parsed, phonenumbers.PhoneNumberFormat.INTERNATIONAL
        )
        if formatted not in cleaned:
            cleaned.append(formatted)
    return [{"value": number, "type": []} for number in cleaned]


def clean_contact(contact):
    """Clean one parsed contact in place: pictures, phone numbers, names."""
    export_pictures(contact)

    # reformat phone numbers to international
    # TODO : distinguish phone numbers in final vcard (hint : "type" ?!)
    contact.tel = normalise_numbers(contact.tel)

    # handle the horrible quoted-printable string format
    # TODO : doesn't work for list of strings in vobject
    contact.fn = decode_quoted_printable(contact.fn)
    # The name parts are read from contact.n, so the decoded parts are stored
    # back on that same attribute.
    contact.n = [decode_quoted_printable(part) for part in contact.n]
    contact.nickname = [decode_quoted_printable(nick) for nick in contact.nickname]


def main():
    """Clean the vCard file named on the command line into output.vcf."""
    if len(sys.argv) < 2:
        sys.exit("usage: vcf_cleaner.py <input.vcf>")

    # Read everything before output.vcf is opened (and truncated), so the
    # script can be pointed at a previous output.vcf.
    with open(sys.argv[1], "r", encoding="latin-1") as source:
        lines = source.readlines()

    # Decoded names can hold any Unicode character, so the output encoding is
    # fixed to UTF-8 instead of depending on the platform default.
    with open("output.vcf", "w", encoding="utf-8") as output:
        # a contact is defined by its END:VCARD markup
        current_card = []
        for line in lines:
            current_card.append(line)
            if "END:VCARD" not in line:
                continue

            # the pythonvCard4 parsing assumes a LOT of things
            contact = Contact.from_vcard("".join(current_card))
            clean_contact(contact)

            # rewrite contact as vcard
            # TODO : override the function that encode lists of strings
            output.write(contact.to_vcard())
            current_card = []


if __name__ == "__main__":
    main()