Outils pour utilisateurs

Outils du site


bash:extracteur_statusnet

Extracteur StatusNet

Usage : téléchargez le script, rendez-le exécutable (chmod +x), puis exécutez-le dans un terminal.

  1. demande de l'instance et du compte
  2. récupération de la page, affichage du nombre de statuts et du nombre de pages
  3. demande de confirmation
  4. création dossier “temppages”, wget des pages dedans
  5. “one-line” des pages, création dossier “tempstatus”
  6. extraction des statuts (1 fichier par statut)
  7. traitement des fichiers de statut
  8. sortie des statuts dans “instance_compte.csv”, les points étant remplacés par des tirets (exemple : identi-ca_gatitac.csv) ; colonnes : id,date,texte,client,contexte
  9. suppression des dossiers temppages et tempstatus

Testé opérationnel avec:

  1. identi.ca/gatitac
  2. status.jbfavre.org
  3. status.ndn.cx/gordontesos
statusextract
#!/bin/bash
#
# StatusNet extractor, step 1: ask for an account, read its status count from
# the profile page and download every timeline page into temppages/.
# On exit from this section the working directory is temppages/.

# Account is entered as "instance/user" (or just "instance").
read -r -p "Instance et nom d'utilisateur (exemple: identi.ca/gatitac): " account

# Fetch the profile page, join it into a single line, then keep the text of
# the first <dd> that follows <dl class="entity_notices">.
nombre=$(wget -nv "http://$account" -O - \
  | sed ':a;N;$!ba;s/\n/ /g' \
  | awk -F '<dl class="entity_notices">' '{print $2}' \
  | awk -F '<dd>' '{print $2}' \
  | awk -F '<' '{print $1}')

# Refuse to go on when the count could not be scraped (download failed or the
# markup did not match): the arithmetic below would otherwise fail and the
# later steps would run on an empty directory.
nombre=${nombre//[[:space:]]/}
if ! [[ "$nombre" =~ ^[0-9]+$ ]]; then
  echo "Impossible de lire le nombre de statuts pour \"$account\"." >&2
  exit 1
fi

echo "****************************"
echo "Nombre de statuts: $nombre"
# The division assumes 20 statuses per page; the +1 covers a partial last page.
# 10# forces base 10 so a count with leading zeros is not read as octal.
pages=$((10#$nombre / 20 + 1))
echo "Ça fait $pages pages à récupérer."
# Any input followed by ENTER continues; the answer itself is not inspected.
read -r -p "On continue chef ? ENTRÉE pour oui."

mkdir -p "temppages/"
# Stop if the directory cannot be entered: the following steps delete and
# rename every file of the current directory.
cd "temppages/" || exit 1

# Download pages 1..$pages as page<N>.html.
for ((e = 1; e <= pages; e++)); do
  wget -nv "http://$account?page=$e" -O "page$e.html"
done
 
# Strip what is not needed: each page is joined into a single line and reduced
# to the content found between <ol class="notices xoxo"> and the first </ol>.
# The result is built in "_<name>" and then takes the place of the original.
for page in *
do
  flat="_$page"
  sed ':a;N;$!ba;s/\n/ /g' < "$page" \
    | awk -F '<ol class="notices xoxo">' '{print $2}' \
    | awk -F '</ol>' '{print $1}' >> "$flat"
  rm "$page"
  mv "$flat" "$page"
done
cd ..
# Step 2: split every downloaded page into one file per status, stored in
# tempstatus/ under the status id zero-padded to 10 digits (so that a plain
# glob lists them in id order). Runs from the parent directory and leaves the
# working directory set to temppages/.
mkdir -p "tempstatus/"
cd "temppages/" || exit 1
echo "********************"
echo "Extraction des statuts"
echo "********************"
echo ""
for file in *; do
  # An empty directory leaves the literal "*" here: nothing to extract.
  [[ -f "$file" ]] || continue
  # One awk pass per page. The page is cut on "<li"; field 1 is whatever
  # precedes the first item, every later field is a candidate status.
  awk -F '<li' -v outdir="../tempstatus" '
    BEGIN { marker = "id=\"notice-" }
    {
      for (i = 2; i <= NF; i++) {
        dent = $i
        # Keep only what precedes the first closing tag of the item.
        sub(/<\/li>.*/, "", dent)
        # Collapse runs of blanks and trim both ends.
        gsub(/[ \t]+/, " ", dent)
        sub(/^ /, "", dent)
        sub(/ $/, "", dent)
        # The id is the text between the marker and the next ">.
        start = index(dent, marker)
        if (start == 0) continue
        id = substr(dent, start + length(marker))
        stop = index(id, "\">")
        if (stop > 0) id = substr(id, 1, stop - 1)
        # Fragments without a numeric id are not statuses: skip them
        # instead of piling them up in a file named 0000000000.
        if (id !~ /^[0-9]+$/) continue
        # Pad as a string: no octal reading, no integer size limit.
        while (length(id) < 10) id = "0" id
        out = outdir "/" id
        print dent >> out
        close(out)
      }
    }
  ' < "$file"
done
 
# Step 3: turn every status file of tempstatus/ into one CSV row
# (id, date, text, client, context) appended to "<account>.csv" in the parent
# directory, then delete both temporary directories.
# Starts in temppages/ and ends in the parent directory.
cd .. || exit 1
cd "tempstatus/" || exit 1
echo "********************"
echo "Traitement des statuts"
echo "********************"
echo ""
# CSV base name: the account with "." turned into "-" and "/" into "_".
output=${account//./-}
output=${output//\//_}
# Instance: the part of the account that precedes the first "/".
instance=${account%%/*}
# A double quote inside a field is written doubled so the row stays valid CSV.
q='"'
for file in *; do
  # An empty directory leaves the literal "*" here: write no row for it.
  [[ -f "$file" ]] || continue
  # Text: content of <p class="entry-content">, tags removed.
  denttexte=$(awk -F '<p class="entry-content">' '{print $2}' < "$file" \
    | awk -F '</p>' '{print $1}' \
    | sed -e 's/<[^>]*>//g')
  # Date: title attribute of <abbr class="published">.
  dentdate=$(awk -F '<abbr class="published" title="' '{print $2}' < "$file" \
    | awk -F '">' '{print $1}')
  # Client: content of <span class="device">, tags and spaces removed.
  dentdevice=$(awk -F '<span class="device">' '{print $2}' < "$file" \
    | awk -F '</span>' '{print $1}' \
    | sed -e 's/<[^>]*>//g' \
    | sed 's/ //g')
  # Context: what follows "<instance>/conversation" up to the response link,
  # with the "<instance>/conversation/" prefix put back in front.
  # NOTE(review): $instance is used as an awk regex and in a sed replacement;
  # this assumes a plain host name (no "&" or "/") - confirm.
  dentcontexte=$(awk -F "$instance/conversation" '{print $2}' < "$file" \
    | awk -F '" class="response">' '{print $1}' \
    | sed "s/\//$instance\/conversation\//")
  printf '"%s","%s","%s","%s","%s"\n' \
    "${file//$q/$q$q}" \
    "${dentdate//$q/$q$q}" \
    "${denttexte//$q/$q$q}" \
    "${dentdevice//$q/$q$q}" \
    "${dentcontexte//$q/$q$q}"
done >> "../$output.csv"
cd .. || exit 1
echo "Nettoyage.."
rm -rf "tempstatus/" "temppages/"
bash/extracteur_statusnet.txt · Dernière modification : 2013-02-19 20:28 de 127.0.0.1