aboutsummaryrefslogtreecommitdiffstats
path: root/scripts/guess-charset
blob: 04aa09bc54f13a0603e1c050be036fc036507c76 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#! /usr/bin/env python2
# SPDX-License-Identifier: GPL-2.0
# vim: set fileencoding=utf-8
# (c) Uwe Kleine-König <uwe@kleine-koenig.org>
# GPLv2

import locale
import sys

# Read the whole input as raw bytes. Binary mode keeps the byte values
# untouched (no newline translation), and the with-block closes the handle
# as soon as the content is in memory.
with open(sys.argv[1], 'rb') as f:
    data = f.read()

def len_utf8_char(data, i):
    """Return the length in bytes of the UTF-8 sequence starting at data[i].

    data is a byte string and i an index into it.  The lead byte at data[i]
    determines the expected sequence length (1 to 6 bytes; the historical
    5- and 6-byte forms are still accepted, as before).

    Returns -1 if data[i] is not a valid lead byte, if one of the following
    bytes is not a continuation byte (0x80..0xbf), or if data ends before
    the sequence is complete.
    """
    def byte_at(pos):
        # Indexing yields a 1-char string for str and an int for bytes.
        c = data[pos]
        return c if isinstance(c, int) else ord(c)

    lead = byte_at(i)
    if lead < 0x80:
        # ASCII char
        return 1

    # (mask, expected bits, total sequence length) for each lead byte form
    lead_forms = (
        (0xe0, 0xc0, 2),
        (0xf0, 0xe0, 3),
        (0xf8, 0xf0, 4),
        (0xfc, 0xf8, 5),
        (0xfe, 0xfc, 6),
    )
    for mask, bits, num in lead_forms:
        if lead & mask == bits:
            break
    else:
        # Stray continuation byte (0x80..0xbf) or 0xfe/0xff: not a lead byte.
        return -1

    if i + num > len(data):
        # The sequence is cut off by the end of the data.
        return -1

    if all(0x80 <= byte_at(k) <= 0xbf for k in range(i + 1, i + num)):
        return num
    return -1

# Walk the data one UTF-8 sequence at a time and remember the longest
# sequence seen.  The first invalid sequence ends the scan: the data is not
# UTF-8, so fall back to the locale's preferred encoding, or to ISO-8859-1
# when the locale itself is UTF-8 or plain ASCII (ANSI_X3.4-1968).
i = 0
maxl = 0
while i < len(data):
    char_len = len_utf8_char(data, i)
    # Treat a missing result like the -1 error value explicitly instead of
    # relying on how None happens to compare against integers.
    if char_len is None or char_len < 0:
        prefenc = locale.getpreferredencoding()
        if prefenc not in ('UTF-8', 'ANSI_X3.4-1968'):
            print(prefenc)
        else:
            print('ISO-8859-1')
        sys.exit(0)

    if maxl < char_len:
        maxl = char_len
    i += char_len

# Everything decoded cleanly: at least one multi-byte sequence means UTF-8,
# otherwise the data is pure ASCII.
if maxl > 1:
    print('UTF-8')
else:
    print('ANSI_X3.4-1968')