Search the web
Sign In
New User? Sign Up
perl-python
? Already a member? Sign in to Yahoo!

Yahoo! Groups Tips

Did you know...
Want to share photos of your group with the world? Add a group photo to Flickr.

Best of Y! Groups

   Check them out and nominate your group.
Having problems with message search? Fill out this form to ensure your group is one of the first to be migrated to the new message search system.

Messages

  Messages Help
Advanced
a script to check html size   Message List  
Reply | Forward Message #111 of 127 |
The following script takes a directory and prints the size of every HTML
file in it, counting the sizes of the file's inline images.

This script is useful for making sure that HTML files stay under a certain
size. That matters because web visitors with slow connections may take
a long time to load HTML files that contain lots of inline images.

# -*- coding: utf-8 -*-
# Python


# Wed Oct 5 15:50:31 PDT 2005
# given a dir, report all html file's size. (counting inline images)
# XahLee.org

import re, os.path, sys

# Root directory whose html files are reported.
inpath = '/Users/t/web/'

# Get rid of trailing slashes, but always keep at least one character so
# that a bare '/' is not reduced to an empty string (indexing an empty
# string with [-1] would raise IndexError).
while len(inpath) > 1 and inpath.endswith('/'):
    inpath = inpath[:-1]

if not os.path.exists(inpath):
    # A single parenthesised argument prints the same text on Python 2 and 3.
    print("dir " + inpath + " doesn't exist!")
    sys.exit(1)

##################################################
# subroutines


def getInlineImg(file_full_path):
    '''Return the list of src="..." values found in an HTML file.

    file_full_path is the path of the file to scan; its content is decoded
    as UTF-8. The result is a list of the attribute values in document
    order, for example ['xx.jpg', '../image.png'].

    Only double-quoted values that directly follow "src" (optionally
    separated by whitespace around the "=") are reported. Every src
    attribute is included, whatever tag it belongs to.

    Raises IOError/OSError if the file cannot be opened and
    UnicodeDecodeError if it is not valid UTF-8.
    '''
    # The context manager closes the file even when reading or decoding fails.
    with open(file_full_path, 'rb') as html_file:
        # bytes.decode() exists on Python 2 and 3; the unicode() builtin
        # only exists on Python 2.
        text = html_file.read().decode('utf-8')

    # The quoted value must belong to the "src" it follows. Searching the
    # text after "src" for any ="..." would pick up unrelated attributes,
    # e.g. the href in '<script src></script><a href="page.html">'.
    return re.findall(r'src\s*=\s*"([^"]+)"', text)


def linkFullPath(dir, locallink):
    '''Return the normalised path of a local link found in a directory.

    dir is the directory containing the html file (the name shadows the
    builtin but is kept so keyword callers keep working) and locallink is
    the link as written in the html. For example,
    linkFullPath('/Users/t/public_html/a/b', '../image/t.png') returns
    '/Users/t/public_html/a/image/t.png'. The returned result does not
    contain double slashes, './' or resolvable '../' components.

    locallink is always appended to dir, even when it starts with '/'.
    An empty dir is treated as the current directory.
    '''
    # Function-scope import so the block does not depend on a new
    # file-level import. posixpath is used because paths here are built
    # with '/' regardless of platform.
    import posixpath

    joined = (dir or '.') + '/' + locallink
    # normpath() keeps a leading '//' intact, so collapse slash runs first.
    joined = re.sub(r'//+', '/', joined)
    # normpath() resolves '.' and '..' per path component. A plain regex on
    # '/name/..' would also swallow names that merely start with '..'.
    return posixpath.normpath(joined)

def listInlineImg(htmlfile):
    '''Return the full paths of the inline images linked from an html file.

    htmlfile is the full path of the html file. Each link reported by
    getInlineImg() is combined with the file's directory by linkFullPath(),
    and the resulting paths are returned as a list in document order.
    '''
    base_dir = os.path.dirname(htmlfile)
    return [linkFullPath(base_dir, link) for link in getInlineImg(htmlfile)]


##################################################
# main

# Collected [totalSize, htmlPath] pairs, one per html file visited.
fileSizeList = []


def checkLink(dummy, dirPath, fileList):
    '''Record the total size of every html file of one directory.

    The signature is that of an os.path.walk() visitor: dummy is the
    (unused) user argument, dirPath the directory being visited and
    fileList the names it contains. For each regular file whose extension
    is '.html', the file's own size plus the sizes of its inline images is
    appended to the module-level fileSizeList as [totalSize, path].
    '''
    for fileName in fileList:
        htmlPath = dirPath + '/' + fileName
        if os.path.splitext(fileName)[1] != '.html':
            continue
        if not os.path.isfile(htmlPath):
            continue

        totalSize = os.path.getsize(htmlPath)
        for imgPath in listInlineImg(htmlPath):
            try:
                totalSize += os.path.getsize(imgPath)
            except OSError:
                # The link points at something that is not a local file
                # (missing image, remote URL, ...). Count it as 0 bytes,
                # as the Perl original does, instead of aborting the
                # whole report.
                continue
        fileSizeList.append([totalSize, htmlPath])


# os.path.walk() was removed in Python 3; os.walk() is available on both
# Python 2 and 3. checkLink() ignores anything that is not a regular file,
# so handing it only the file names of each directory is sufficient.
for dirPath, dirNames, fileNames in os.walk(inpath):
    checkLink('dummy', dirPath, fileNames)

# Largest pages first.
fileSizeList.sort(key=lambda x: x[0], reverse=True)

for it in fileSizeList:
    print(it)
print("done reporting.")


The following is a Perl version. The Python version above is a direct
translation of this Perl version.

# perl


# Tue Oct 4 14:36:48 PDT 2005
# given a dir, report all html file's size. (counting inline images)
# XahLee.org

use Data::Dumper;
use File::Find;
use File::Basename;

# Root directory whose html files are reported.
$inpath = '/Users/t/web/';

# get rid of trailing slash (at least one character is always kept)
while ($inpath =~ m{^(.+)/$}) { $inpath = $1; }

die "dir $inpath doesn't exist! $!" unless -e $inpath;

##################################################
# subroutines


# getInlineImg($file_full_path) returns a list of the src="..." values
# found in the file. For example, it may return ('xx.jpg','../image.png').
# The file is scanned line by line, so an attribute split across lines is
# not reported. Dies if the file cannot be opened.
sub getInlineImg ($) {
    my ($full_file_path) = @_;
    my @linx;

    # Three-argument open with a lexical handle: the file name is never
    # interpreted as an open mode and the handle does not leak globally.
    open(my $fh, '<', $full_file_path)
        or die "error: can not open $full_file_path $!";
    while (my $line = <$fh>) {
        # The quoted value must directly follow its "src". Matching any
        # ="..." later on the line would pick up unrelated attributes
        # such as an href.
        push @linx, $line =~ m{src\s*=\s*"([^"]+)"}g;
    }
    close $fh;

    return @linx;
}


# linkFullPath($dir,$locallink) returns a string that is the full path
# to the local link. For example,
# linkFullPath('/Users/t/public_html/a/b', '../image/t.png') returns
# '/Users/t/public_html/a/image/t.png'. $dir may or may not end with a
# slash. The returned result will not contain double slashes, '/./' or
# resolvable '../' components.
sub linkFullPath ($$) {
    my ($dir, $locallink) = @_;

    # Always separate the two parts; a doubled slash is collapsed below.
    my $result = length($dir) ? "$dir/$locallink" : $locallink;

    $result =~ s{/+}{/}g;             # collapse runs of slashes
    $result =~ s{/\.(?=/|\z)}{}g;     # drop '.' components

    # Remove one 'name/..' pair at a time. The lookaheads make sure that
    # '..' is a whole path component (not the start of a name such as
    # '..c.png') and that the removed name is not itself '..'.
    1 while $result =~ s{/(?!\.\.(?:/|\z))[^/]+/\.\.(?=/|\z)}{};

    return $result;
}


# listInlineImg($html_file_full_path) returns an array where each
# element is a full path to an inline image in the html.
sub listInlineImg ($) {
    my ($htmlfile) = @_;

    # Only the directory part of fileparse()'s (name, dir, suffix) result
    # is needed here.
    my $dir = (fileparse($htmlfile, '\.html'))[1];

    return map { linkFullPath($dir, $_) } getInlineImg($htmlfile);
}

##################################################
# main
# checkLink is the callback handed to find(). For every text file whose
# name ends in '.html' it pushes [total size, path] onto @fileSizeList,
# where the total is the file's own size plus the sizes of its inline
# images.
sub checkLink {
    my $path = $File::Find::name;
    return unless $path =~ m{\.html$} && -T $path;

    # Lexical variables, so nothing leaks from one call to the next.
    my $totalSize = (-s $path) || 0;
    for my $imgPath (listInlineImg($path)) {
        # -s gives no size for a missing or empty file; count it as 0.
        $totalSize += (-s $imgPath) || 0;
    }
    push @fileSizeList, [$totalSize, $path];
    return;
}

# Visit everything below $inpath, letting checkLink fill @fileSizeList.
find({ wanted => \&checkLink }, $inpath);

# Order the [size, path] pairs by size, largest first.
@fileSizeList = sort { $b->[0] <=> $a->[0] } @fileSizeList;

# Dump the report, followed by the closing message.
print Dumper(\@fileSizeList), "done reporting.";

-------------
this post is archived at
http://xahlee.org/perl-python/check_html_size.html

☄



Fri Oct 7, 2005 12:06 pm

p0lyglut
Offline Offline
Send Email Send Email

Forward
Message #111 of 127 |
Expand Messages Author Sort by Date

The following script takes a directory and print all the sizes of html files, counting the sizes of inline images. This script is useful in making sure that...
xah lee
p0lyglut
Offline Send Email
Oct 7, 2005
12:13 pm
Advanced

Copyright © 2009 Yahoo! Inc. All rights reserved.
Privacy Policy - Terms of Service - Guidelines - Help