Perl で utf8 文字列を byte サイズで split する

utf8 な文字列を特定のバイトサイズで切り分ける処理って Perl でどう書けば一番良いのかを長いこと考えてた(あまり困ってはなかった)んだけど、「UTF-8文字列をバイト数でカットした時の末尾の処理」を見たら簡単に書けた。

#!/usr/bin/perl

use strict;
use warnings;
use Encode qw( is_utf8 decode _utf8_on );
require bytes;

# Demo driver.  The literal below is decoded from EUC-JP into a Perl
# character string, cut into pieces of at most 12 UTF-8 bytes, and each
# piece is printed together with its length in characters and in bytes.
my $text = decode(
    'euc-jp',
    "この日本語テキストは euc-jp で書かれていますが utf8 に変換されます",
);
my @chunks = byte_split( $text, 12 );

# Character strings are written to STDOUT as UTF-8.
binmode STDOUT, ':utf8';
printf "utf8 string = %s\n", $text;

foreach my $chunk (@chunks) {
    print $chunk, "\n";
    printf "\tutf8 length = %s\n", length $chunk;             # characters
    printf "\tbyte length = %s\n", bytes::length($chunk);     # internal bytes
}

# byte_split( $str, $byte_length )
#
# Splits the character string $str into a list of pieces whose UTF-8
# encoding is at most $byte_length bytes each, without ever cutting through
# the middle of a multi-byte character.
#
# Arguments:
#   $str         - character string; must carry Perl's internal UTF8 flag
#   $byte_length - maximum size of one piece in UTF-8 bytes; must be > 2
#
# Returns the pieces in order; every piece has the UTF8 flag switched on.
# Returns nothing (bare return) when $str is not flagged as UTF-8 or when
# $byte_length is false or not greater than 2.
#
# Dies when a single character is wider than $byte_length bytes (a 4-byte
# character with a limit of 3), or when a piece cannot be decoded as UTF-8.
sub byte_split {
    my $str         = shift;
    my $byte_length = shift;

    return unless is_utf8($str);
    return unless $byte_length;
    return unless $byte_length > 2;

    # Obtain the UTF-8 octets explicitly.  Since Perl 5.10, unpack "C*"
    # walks a flagged string character by character (wrapping code points
    # above 255) instead of exposing its internal bytes, so it cannot be
    # used to look at the encoding.  utf8::encode() works in place, hence
    # the copy.
    my $octets = $str;
    utf8::encode($octets);

    my @result;
    my $total  = length $octets;
    my $offset = 0;
    while ( $offset < $total ) {
        my $chunk = substr $octets, $offset, $byte_length;

        # round_utf8() trims a trailing, incomplete character off the chunk.
        # The trimmed bytes need no special handling: advancing $offset only
        # by what was kept makes the next chunk start with them.
        my ($round) = round_utf8($chunk);

        # A character wider than the limit leaves nothing behind; without
        # this check the loop would never advance.
        die "byte_split: a character does not fit into $byte_length bytes"
            unless length $round;
        $offset += length $round;

        utf8::decode($round)
            or die 'byte_split: chunk is not well-formed UTF-8';
        utf8::upgrade($round);    # keep the UTF8 flag on, even for ASCII
        push @result, $round;
    }
    return @result;
}

# round_utf8( $bytes )
#
# Takes a string of UTF-8 octets that may have been cut in the middle of a
# multi-byte character and removes the incomplete character from its end.
#
# Returns just the string, unchanged, when it ends in an ASCII octet.
# Otherwise returns the two-element list ( $kept, $cut_off ), where
# $cut_off holds the octets removed from the end, or undef when nothing
# had to be removed.
sub round_utf8 {
    my $octets = shift;

    # An ASCII octet at the end can never be part of a longer sequence.
    return $octets if $octets =~ /[\x00-\x7F]$/;

    # Incomplete tails, shortest first: a lone lead octet, the lead of a
    # 3-byte-or-longer sequence plus one continuation octet, and the lead of
    # a 4-byte-or-longer sequence plus two continuation octets.
    my $leftover;
    for my $tail (
        qr/([\xC0-\xFD])$/,
        qr/([\xE0-\xFD][\x80-\xBF])$/,
        qr/([\xF0-\xFD][\x80-\xBF]{2})$/,
      )
    {
        $leftover = $1 if $octets =~ s/$tail//;
    }

    # Longer incomplete tails (5- and 6-byte forms) are not handled:
    # $octets =~ s/[\xF8-\xFD][\x80-\xBF]{3}$//;  # 4 bytes left over
    # $octets =~ s/[\xFC-\xFD][\x80-\xBF]{4}$//;  # 5 bytes left over
    return ( $octets, $leftover );
}

実行するとこんな感じ

% perl test.pl
utf8 string = この日本語テキストは euc-jp で書かれていますが utf8 に変換されます
この日本
        utf8 length = 4
        byte length = 12
語テキス
        utf8 length = 4
        byte length = 12
トは euc-j
        utf8 length = 8
        byte length = 12
p で書か
        utf8 length = 5
        byte length = 11
れていま
        utf8 length = 4
        byte length = 12
すが utf8
        utf8 length = 8
        byte length = 12
に変換さ
        utf8 length = 4
        byte length = 12
れます
        utf8 length = 3
        byte length = 9

スクリプトを EUC-JP で書いたので中で utf8 に変換しています。unpack, pack を使っているので処理も速い(と思う)。Lingua::JA::Jtruncate は EUC, sjis, jis しか対象にしてないんだよなあ。他に CPAN module で似た処理をするやつってあるんですかね。